diff --git a/CMakeLists.txt b/CMakeLists.txt index ac4bde264..a93e6463e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,8 +29,8 @@ include(${PROJECT_SOURCE_DIR}/cmake/helpers.cmake) check_compiler_version() -#set default build types. -#Available build types are: Debug, Release, RelWithDebInfo and MinSizeRel +#set default build type +#available build types are: Debug, Release, RelWithDebInfo and MinSizeRel if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release") endif() @@ -46,7 +46,8 @@ endif() option(BUILD_EXAMPLES "Build examples" TRUE) option(BUILD_FT "Build functional tests" TRUE) -option(BUILD_UT "Build unit tests" TRUE) +option(BUILD_UT "Build unit tests" FALSE) +option(BUILD_CONFIG "Build cmake configs" TRUE) option(USE_CODECOV_FLAGS "Calculate code coverage" FALSE) option(WITH_ASAN "Use address sanitizer, can only be used in Debug build" FALSE) @@ -66,6 +67,7 @@ message(STATUS "CXX compiler : ${CMAKE_CXX_COMPILER}") message(STATUS "Build examples: ${BUILD_EXAMPLES}") message(STATUS "Build functional tests: ${BUILD_FT}") message(STATUS "Build unit tests: ${BUILD_UT}") +message(STATUS "Build cmake configs: ${BUILD_CONFIG}") add_definitions(-DCCL_C_COMPILER="${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") add_definitions(-DCCL_CXX_COMPILER="${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") @@ -89,17 +91,33 @@ set(CCL_INSTALL_KERNELS "${CMAKE_INSTALL_PREFIX}/lib/kernels") set(CCL_UNIT_TESTS_BUILD "${CMAKE_BINARY_DIR}/tests/unit") -set(MPI_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/mpi/include/") -set(MPI_LIB_DIR "${PROJECT_SOURCE_DIR}/mpi/lib/") + +# setup dependency directories + +set(DEPS_DIR "${PROJECT_SOURCE_DIR}/deps") + +set(MPI_INCLUDE_DIR "${DEPS_DIR}/mpi/include/") +set(MPI_LIB_DIR "${DEPS_DIR}/mpi/lib/") if ( "${LIBFABRIC_DIR}" STREQUAL "") - set(LIBFABRIC_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/ofi/include") - set(LIBFABRIC_LIB_DIR "${PROJECT_SOURCE_DIR}/ofi/lib/") + set(LIBFABRIC_INCLUDE_DIR "${DEPS_DIR}/ofi/include") + set(LIBFABRIC_LIB_DIR "${DEPS_DIR}/ofi/lib/") else() set(LIBFABRIC_INCLUDE_DIR "${LIBFABRIC_DIR}/include/") set(LIBFABRIC_LIB_DIR "${LIBFABRIC_DIR}/lib") endif() +set(HWLOC_INCLUDE_DIR "${DEPS_DIR}/hwloc/include/") +set(HWLOC_LIB_DIR "${DEPS_DIR}/hwloc/lib/") + +message(STATUS "MPI_INCLUDE_DIR: ${MPI_INCLUDE_DIR}") +message(STATUS "MPI_LIB_DIR: ${MPI_LIB_DIR}") +message(STATUS "LIBFABRIC_LIB_DIR: ${LIBFABRIC_LIB_DIR}") +message(STATUS "LIBFABRIC_INCLUDE_DIR: ${LIBFABRIC_INCLUDE_DIR}") +message(STATUS "HWLOC_INCLUDE_DIR: ${HWLOC_INCLUDE_DIR}") +message(STATUS "HWLOC_LIB_DIR: ${HWLOC_LIB_DIR}") + include_directories(${MPI_INCLUDE_DIR}) include_directories(${LIBFABRIC_INCLUDE_DIR}) + link_directories(${MPI_LIB_DIR}) link_directories(${LIBFABRIC_LIB_DIR}) @@ -114,26 +132,31 @@ if (${CMAKE_VERSION} VERSION_LESS 3.1) set(C_COMPILER_FLAGS "-std=gnu99") endif() +# special flags for CCL library only +set(SRC_C_FLAGS "") +set(SRC_CXX_FLAGS "") +set(SRC_SHARED_LINKER_FLAGS "") + #common settings of security options if(USE_SECURITY_FLAGS) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wformat -Wformat-security -D_FORTIFY_SOURCE=2 -fstack-protector") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wformat -Wformat-security -D_FORTIFY_SOURCE=2 -fstack-protector") - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fPIE -fPIC -z noexecstack -z relro -z now") + set(SRC_C_FLAGS "${SRC_C_FLAGS} -Wformat -Wformat-security -D_FORTIFY_SOURCE=2 -fstack-protector") + set(SRC_CXX_FLAGS "${SRC_CXX_FLAGS} -Wformat -Wformat-security -D_FORTIFY_SOURCE=2 -fstack-protector") + 
set(SRC_SHARED_LINKER_FLAGS "${SRC_SHARED_LINKER_FLAGS} -fPIE -fPIC -z noexecstack -z relro -z now") if(${CMAKE_C_COMPILER_ID} STREQUAL "GNU" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fstack-protector-strong") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector-strong") + set(SRC_C_FLAGS "${SRC_C_FLAGS} -fstack-protector-strong") + set(SRC_CXX_FLAGS "${SRC_CXX_FLAGS} -fstack-protector-strong") endif() endif() endif() -set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--version-script=${PROJECT_SOURCE_DIR}/ccl.map") +set(SRC_SHARED_LINKER_FLAGS "${SRC_SHARED_LINKER_FLAGS} -Wl,--version-script=${PROJECT_SOURCE_DIR}/ccl.map") if(${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR ${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel") - if (USE_CODECOV_FLAGS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -prof-gen=srcpos -prof-src-root-cwd") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -prof-gen=srcpos -prof-src-root-cwd") - endif() + if (USE_CODECOV_FLAGS) + set(SRC_C_FLAGS "${SRC_C_FLAGS} -prof-gen=srcpos -prof-src-root-cwd") + set(SRC_CXX_FLAGS "${SRC_CXX_FLAGS} -prof-gen=srcpos -prof-src-root-cwd") + endif() endif() #TODO: add -Wextra to c/cxx flags @@ -155,31 +178,10 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(TRY_ENABLE_SYCL_L0 ON) +set(COMMON_CMAKE_DIR ${PROJECT_SOURCE_DIR}/cmake) if (COMPUTE_BACKEND) - activate_compute_backend("${CMAKE_CURRENT_LIST_DIR}/cmake" ${COMPUTE_BACKEND}) - if (NOT COMPUTE_BACKEND_TARGET_NAME) - message(FATAL_ERROR "Failed to find requested compute runtime: ${COMPUTE_BACKEND}") - endif() - message(STATUS "COMPUTE_BACKEND_TARGET_NAME: ${COMPUTE_BACKEND_TARGET_NAME}") - - if (${COMPUTE_BACKEND_TARGET_NAME} STREQUAL "Intel::SYCL" OR ${COMPUTE_BACKEND_TARGET_NAME} STREQUAL "Intel::SYCL_level_zero") - option (CCL_ENABLE_SYCL "Enable CCL SYCL runtime" ON) - message(STATUS "Enable CCL SYCL runtime") - execute_process(COMMAND dpcpp -v - OUTPUT_VARIABLE DPCPP_VERSION - ERROR_VARIABLE DPCPP_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE - ERROR_STRIP_TRAILING_WHITESPACE - ) - message(STATUS "DPC++ compiler version:\n" "${DPCPP_VERSION}") - endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMPUTE_BACKEND_FLAGS}") - if (${COMPUTE_BACKEND_TARGET_NAME} STREQUAL "Intel::SYCL_level_zero" OR ${COMPUTE_BACKEND_TARGET_NAME} STREQUAL "ze_loader") - set(MULTI_GPU_SUPPORT ON) - endif() - if (MULTI_GPU_SUPPORT) - message(STATUS "Enable multi GPU support using L0") - endif() + message(STATUS "COMPUTE_BACKEND: ${COMPUTE_BACKEND}") + set_compute_backend(${COMMON_CMAKE_DIR}) endif() if(${CMAKE_C_COMPILER_ID} STREQUAL "GNU" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") @@ -189,6 +191,15 @@ if(${CMAKE_C_COMPILER_ID} STREQUAL "GNU" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL " endif() endif() +# Clang doesn't automatically detect Ninja processes as supporting colored output +# due to the way they are spawned, so we have to force colored +# diagnostics explicitly +if(${CMAKE_GENERATOR} STREQUAL "Ninja") + if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang") + add_compile_options(-fcolor-diagnostics) + endif() +endif() + if(WITH_ASAN AND ${CMAKE_BUILD_TYPE_CASE_INSENSITIVE} STREQUAL "debug") message(STATUS "Compiling with address sanitizer") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address -fno-omit-frame-pointer") @@ -203,7 +214,7 @@ set(CCL_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/src) enable_testing() set(EXTERNAL_LIBS "") -set(EXAMPLES_INC_DIRS ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/examples/include ${PROJECT_SOURCE_DIR}/mpi/include) +set(EXAMPLES_INC_DIRS ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/examples/include ${MPI_INCLUDE_DIR}) # allow `deprecated` set(CMAKE_CLANG_FLAGS "${CMAKE_CLANG_FLAGS}") @@ -223,12 +234,14 @@ install(PROGRAMS ${PROJECT_SOURCE_DIR}/LICENSE DESTINATION ${CCL_INSTALL_LICENSE # copy kernels if(COMPUTE_BACKEND AND EXISTS "${PROJECT_SOURCE_DIR}/src/kernels") - file(GLOB spv_kernels "${PROJECT_SOURCE_DIR}/src/kernels/ring_*.spv") - install(PROGRAMS ${spv_kernels} DESTINATION ${CCL_INSTALL_KERNELS}) + file(GLOB spv_kernels "${PROJECT_SOURCE_DIR}/src/kernels/ring_*.spv") + install(PROGRAMS ${spv_kernels} + DESTINATION ${CCL_INSTALL_KERNELS} + PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ) endif() set(CCL_MAJOR_VERSION "2021") -set(CCL_MINOR_VERSION "2") +set(CCL_MINOR_VERSION "3") set(CCL_UPDATE_VERSION "0") set(CCL_PRODUCT_STATUS "Gold") string(TIMESTAMP CCL_PRODUCT_BUILD_DATE "%Y-%m-%dT %H:%M:%SZ") @@ -238,16 +251,16 @@ configure_file(${PROJECT_SOURCE_DIR}/include/oneapi/ccl/config.h.in "${CMAKE_CUR file(COPY "${CMAKE_CURRENT_BINARY_DIR}/include/oneapi/ccl/config.h" DESTINATION ${PROJECT_SOURCE_DIR}/include/oneapi/ccl) include_directories(${CMAKE_CURRENT_BINARY_DIR}/include) -#generate oneCCLConfig*.cmake -configure_file("cmake/templates/oneCCLConfig.cmake.in" - "${CCL_INSTALL_LIB}/cmake/oneCCL/oneCCLConfig.cmake" - COPYONLY) - set(PROJECT_VERSION "${CCL_MAJOR_VERSION}.${CCL_MINOR_VERSION}.${CCL_UPDATE_VERSION}") -configure_file("cmake/templates/oneCCLConfigVersion.cmake.in" - "${CCL_INSTALL_LIB}/cmake/oneCCL/oneCCLConfigVersion.cmake" - @ONLY) +if (BUILD_CONFIG) + configure_file("cmake/templates/oneCCLConfig.cmake.in" + "${CCL_INSTALL_LIB}/cmake/oneCCL/oneCCLConfig.cmake" + COPYONLY) + configure_file("cmake/templates/oneCCLConfigVersion.cmake.in" + "${CCL_INSTALL_LIB}/cmake/oneCCL/oneCCLConfigVersion.cmake" + @ONLY) +endif() #include other CMakeLists @@ -267,6 +280,6 @@ if (BUILD_FT) add_subdirectory(tests/functional) endif() -if (BUILD_UT) - #add_subdirectory(tests/unit) +if (BUILD_UT AND EXISTS "${PROJECT_SOURCE_DIR}/tests/unit") + add_subdirectory(tests/unit) endif()
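Note on BUILD_CONFIG: when it is ON, the installed oneCCLConfig.cmake defines an imported oneCCL target (see cmake/templates/oneCCLConfig.cmake.in below), so a downstream project can consume the install tree roughly as in this sketch; the my_app target and the <install-prefix> path are placeholders:

    list(APPEND CMAKE_PREFIX_PATH "<install-prefix>/lib/cmake/oneCCL")
    find_package(oneCCL REQUIRED)
    add_executable(my_app main.cpp)
    target_link_libraries(my_app PRIVATE oneCCL)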
diff --git a/cmake/FindNUMA.cmake b/cmake/FindNUMA.cmake new file mode 100644 index 000000000..9c860354e --- /dev/null +++ b/cmake/FindNUMA.cmake @@ -0,0 +1,22 @@ +# Find the NUMA library and includes +# +# NUMA_INCLUDE_DIR - where to find numa.h +# NUMA_LIBRARIES - list of libraries when using NUMA +# NUMA_FOUND - true if NUMA found + +find_path(NUMA_INCLUDE_DIR + NAMES numa.h numaif.h + HINTS ${NUMA_ROOT_DIR}/include) + +find_library(NUMA_LIBRARIES + NAMES numa + HINTS ${NUMA_ROOT_DIR}/lib) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(NUMA DEFAULT_MSG NUMA_LIBRARIES NUMA_INCLUDE_DIR) + +if (NUMA_FOUND) + message(STATUS "NUMA was found, include_dir: ${NUMA_INCLUDE_DIR}, libraries: ${NUMA_LIBRARIES}") +else() + message(STATUS "NUMA was not found") +endif()
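A usage sketch for the new find module, based on the variables it documents above; the my_target name is a placeholder, and NUMA_ROOT_DIR may optionally be set to hint at a non-default installation:

    list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
    find_package(NUMA)
    if (NUMA_FOUND)
        target_include_directories(my_target PRIVATE ${NUMA_INCLUDE_DIR})
        target_link_libraries(my_target PRIVATE ${NUMA_LIBRARIES})
    endif()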
diff --git a/cmake/ccl b/cmake/ccl index c96106735..474562428 100644 --- a/cmake/ccl +++ b/cmake/ccl @@ -34,14 +34,27 @@ set componentname "[file tail "$componentroot"]" # get oneAPI top-level root folder set oneapiroot "[file dirname "$componentroot"]" +# disallow loading multiple versions or architectures of this modulefile +# if only the 64-bit architecture exists the test still works +set mname32 $modulefilename +set mname64 [string trimright $mname32 "32"] +if { [string equal "$mname32" "$mname64"] } { + append mname32 "32" +} +conflict $mname32 +conflict $mname64 + + # On load print component name and version being loaded if { [ module-info mode load ] } { puts stderr "Loading $modulefilename" } -# On remove print component name and version being removed +# On `module unload` print component module name and version being removed +# Include `module list` message only if this modulefile loads dependent modules if { [ module-info mode ] == "unload" || [ module-info mode ] == "remove" } { puts stderr "Removing $modulefilename" + puts stderr "Use `module list` to view any remaining dependent modules." } diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake index aa85955e5..b24acc18b 100644 --- a/cmake/helpers.cmake +++ b/cmake/helpers.cmake @@ -6,7 +6,7 @@ function(set_lp_env) set(GCC_BF16_AVX512BF_MIN_SUPPORTED "10.0.0") set(ICC_BF16_AVX512BF_MIN_SUPPORTED "19.1.0") set(CLANG_BF16_MIN_SUPPORTED "9.0.0") - set(CLANG_BF16_AVX512BF_MIN_SUPPORTED "10.0.0") + set(CLANG_BF16_AVX512BF_MIN_SUPPORTED "9.3.0") if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" @@ -16,11 +16,10 @@ function(set_lp_env) ) add_definitions(-DCCL_BF16_COMPILER) set(CCL_BF16_COMPILER ON) - message(STATUS "BF16 compiler: yes") else() set(CCL_BF16_COMPILER OFF) - message(STATUS "BF16 compiler: no") endif() + message(STATUS "BF16 compiler: ${CCL_BF16_COMPILER}") if ((${CMAKE_C_COMPILER_ID} STREQUAL "Intel" AND NOT ${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${ICC_BF16_AVX512BF_MIN_SUPPORTED}) @@ -30,21 +29,27 @@ function(set_lp_env) AND NOT ${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${GCC_BF16_AVX512BF_MIN_SUPPORTED}) ) add_definitions(-DCCL_BF16_AVX512BF_COMPILER) - message(STATUS "BF16 AVX512BF compiler: yes") + set(CCL_BF16_AVX512BF_COMPILER ON) else() - message(STATUS "BF16 AVX512BF compiler: no") + set(CCL_BF16_AVX512BF_COMPILER OFF) endif() + message(STATUS "BF16 AVX512BF compiler: ${CCL_BF16_AVX512BF_COMPILER}") if (CCL_BF16_COMPILER) if ((${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "GNU")) add_definitions(-DCCL_BF16_TARGET_ATTRIBUTES) - message(STATUS "BF16 target attributes: yes") + set(CCL_BF16_TARGET_ATTRIBUTES ON) else() - message(STATUS "BF16 target attributes: no") + set(CCL_BF16_TARGET_ATTRIBUTES OFF) endif() + message(STATUS "BF16 target attributes: ${CCL_BF16_TARGET_ATTRIBUTES}") endif() - set(CCL_GPU_BF16_TRUNCATE ON PARENT_SCOPE) + option(CCL_BF16_GPU_TRUNCATE "Truncate BF16 in GPU operations" ON) + if (CCL_BF16_GPU_TRUNCATE) + add_definitions(-DCCL_BF16_GPU_TRUNCATE) + endif() + message(STATUS "BF16 GPU truncate: ${CCL_BF16_GPU_TRUNCATE}") set(GCC_FP16_MIN_SUPPORTED "4.9.0") @@ -58,30 +63,36 @@ function(set_lp_env) ) add_definitions(-DCCL_FP16_COMPILER) set(CCL_FP16_COMPILER ON) - message(STATUS "FP16 compiler: yes") else() set(CCL_FP16_COMPILER OFF) - message(STATUS "FP16 compiler: no") endif()
+ message(STATUS "FP16 compiler: ${CCL_FP16_COMPILER}") if (CCL_FP16_COMPILER) if ((${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "GNU")) add_definitions(-DCCL_FP16_TARGET_ATTRIBUTES) - message(STATUS "FP16 target attributes: yes") + set(CCL_FP16_TARGET_ATTRIBUTES ON) else() - message(STATUS "FP16 target attributes: no") + set(CCL_FP16_TARGET_ATTRIBUTES OFF) endif() + message(STATUS "FP16 target attributes: ${CCL_FP16_TARGET_ATTRIBUTES}") + endif() + + option(CCL_FP16_GPU_TRUNCATE "Truncate FP16 in GPU operations" OFF) + if (CCL_FP16_GPU_TRUNCATE) + add_definitions(-DCCL_FP16_GPU_TRUNCATE) endif() + message(STATUS "FP16 GPU truncate: ${CCL_FP16_GPU_TRUNCATE}") set(LP_ENV_DEFINED 1 PARENT_SCOPE) endfunction(set_lp_env) - function(check_compiler_version) set(GCC_MIN_SUPPORTED "4.8") set(ICC_MIN_SUPPORTED "15.0") + set(CLANG_MIN_SUPPORTED "9.0") if(${CMAKE_C_COMPILER_ID} STREQUAL "GNU") if(${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${GCC_MIN_SUPPORTED}) @@ -91,6 +102,10 @@ function(check_compiler_version) if(${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${ICC_MIN_SUPPORTED}) message(FATAL_ERROR "icc min supported version is ${ICC_MIN_SUPPORTED}, current version ${CMAKE_C_COMPILER_VERSION}") endif() + elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + if(${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${CLANG_MIN_SUPPORTED}) + message(FATAL_ERROR "clang min supported version is ${CLANG_MIN_SUPPORTED}, current version ${CMAKE_C_COMPILER_VERSION}") + endif() else() message(WARNING "Compilation with ${CMAKE_C_COMPILER_ID} was not tested, no warranty") endif() @@ -121,7 +136,6 @@ endfunction(get_vcs_properties) function(activate_compute_backend MODULES_PATH COMPUTE_BACKEND) - string( TOLOWER "${COMPUTE_BACKEND}" COMPUTE_BACKEND) set(CCL_ENABLE_SYCL_V 0 PARENT_SCOPE) @@ -176,7 +190,7 @@ function(activate_compute_backend MODULES_PATH COMPUTE_BACKEND) # remember current target for `target_link_libraries` in ccl set (COMPUTE_BACKEND_TARGET_NAME Intel::SYCL_level_zero) set (COMPUTE_BACKEND_TARGET_NAME Intel::SYCL_level_zero PARENT_SCOPE) - message ("___COMPUTE_BACKEND_TARGET_NAME=${COMPUTE_BACKEND_TARGET_NAME} requested. Using DPC++ provider") + message ("COMPUTE_BACKEND_TARGET_NAME=${COMPUTE_BACKEND_TARGET_NAME} requested. Using DPC++ provider") elseif(COMPUTE_BACKEND STREQUAL "level_zero") SET (COMPUTE_BACKEND_LOAD_MODULE "level_zero" @@ -219,7 +233,7 @@ function(activate_compute_backend MODULES_PATH COMPUTE_BACKEND) set (COMPUTE_BACKEND_TARGET_NAME Intel::SYCL) set (COMPUTE_BACKEND_TARGET_NAME Intel::SYCL PARENT_SCOPE) # elseif(COMPUTE_BACKEND STREQUAL "host") - # message ("COMPUTE_BACKEND=${COMPUTE_BACKEND} requested.") + # message ("COMPUTE_BACKEND=${COMPUTE_BACKEND} requested.") # else() # message(FATAL_ERROR "Please provide one of the following compute runtime: dpcpp, level_zero, dpcpp_level_zero, host") endif() @@ -230,6 +244,14 @@ function(activate_compute_backend MODULES_PATH COMPUTE_BACKEND) get_target_property(COMPUTE_BACKEND_LIBRARIES_LOCAL ${COMPUTE_BACKEND_TARGET_NAME} INTERFACE_LINK_LIBRARIES) + # When we use dpcpp compiler(dpcpp/dpcpp_level_zero backends), use c++17 to be aligned with compiler + if (${COMPUTE_BACKEND_TARGET_NAME} MATCHES "^Intel::SYCL.*") + set(CMAKE_CXX_STANDARD 17 PARENT_SCOPE) + # And use c++11 for all other cases + else() + set(CMAKE_CXX_STANDARD 11 PARENT_SCOPE) + endif() + # set output variables in the parent scope: # Only `COMPUTE_BACKEND_FLAGS` is actually required, because the other flags are derived from # 'target_link_libraries'. 
@@ -239,3 +261,47 @@ set(COMPUTE_BACKEND_INCLUDE_DIRS ${COMPUTE_BACKEND_INCLUDE_DIRS_LOCAL} PARENT_SCOPE) endfunction(activate_compute_backend) + +function(set_compute_backend COMMON_CMAKE_DIR) + activate_compute_backend("${COMMON_CMAKE_DIR}" ${COMPUTE_BACKEND}) + + if (NOT COMPUTE_BACKEND_TARGET_NAME) + message(FATAL_ERROR "Failed to find requested compute runtime: ${COMPUTE_BACKEND} in ${COMMON_CMAKE_DIR}") + endif() + message(STATUS "COMPUTE_BACKEND_TARGET_NAME: ${COMPUTE_BACKEND_TARGET_NAME}") + + # When we use the dpcpp compiler (dpcpp/dpcpp_level_zero backends), use C++17 to be aligned with the compiler. + # Although the same thing is done in activate_compute_backend, we need to set the variable here as + # well because both set_compute_backend and activate_compute_backend can be called directly + if (${COMPUTE_BACKEND_TARGET_NAME} MATCHES "^Intel::SYCL.*") + set(CMAKE_CXX_STANDARD 17 PARENT_SCOPE) + # and use C++11 for all other cases + else() + set(CMAKE_CXX_STANDARD 11 PARENT_SCOPE) + endif() + + if (${COMPUTE_BACKEND_TARGET_NAME} STREQUAL "Intel::SYCL" OR ${COMPUTE_BACKEND_TARGET_NAME} STREQUAL "Intel::SYCL_level_zero") + option (CCL_ENABLE_SYCL "Enable CCL SYCL runtime" ON) + message(STATUS "Enable CCL SYCL runtime") + execute_process(COMMAND dpcpp -v + OUTPUT_VARIABLE DPCPP_VERSION + ERROR_VARIABLE DPCPP_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_STRIP_TRAILING_WHITESPACE + ) + message(STATUS "DPC++ compiler version:\n" "${DPCPP_VERSION}") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMPUTE_BACKEND_FLAGS}") + if (${COMPUTE_BACKEND_TARGET_NAME} STREQUAL "Intel::SYCL_level_zero" OR ${COMPUTE_BACKEND_TARGET_NAME} STREQUAL "ze_loader") + set(MULTI_GPU_SUPPORT ON PARENT_SCOPE) + set(MULTI_GPU_SUPPORT ON) + endif() + if (MULTI_GPU_SUPPORT) + message(STATUS "Enable multi GPU support using L0") + endif() + + # pass these variables up to the calling function + set (COMPUTE_BACKEND_TARGET_NAME ${COMPUTE_BACKEND_TARGET_NAME} PARENT_SCOPE) + set (COMPUTE_BACKEND_FLAGS ${COMPUTE_BACKEND_FLAGS} PARENT_SCOPE) + set (COMPUTE_BACKEND_LIBRARIES ${COMPUTE_BACKEND_LIBRARIES} PARENT_SCOPE) +endfunction(set_compute_backend) diff --git a/cmake/setvars.sh.in b/cmake/setvars.sh.in index 7a610a2f6..f061e4f9d 100644 --- a/cmake/setvars.sh.in +++ b/cmake/setvars.sh.in @@ -16,11 +16,8 @@ # WORK_DIR="$(cd "$( dirname "${BASH_SOURCE[0]}" )" > /dev/null && pwd)" -export CCL_ROOT="$(cd ${WORK_DIR}/../; pwd -P)" -if [ -z "${I_MPI_ROOT}" ] -then - export I_MPI_ROOT="${CCL_ROOT}" -fi +CCL_ROOT="$(cd ${WORK_DIR}/../; pwd -P)" +export I_MPI_ROOT="${CCL_ROOT}" source ${CCL_ROOT}/env/vars.sh $1 @@ -31,4 +28,4 @@ else PATH="${CCL_ROOT}/bin:${PATH}"; export PATH fi -FI_PROVIDER_PATH="${CCL_ROOT}/@CMAKE_INSTALL_LIBDIR@/prov"; export FI_PROVIDER_PATH +FI_PROVIDER_PATH="${CCL_ROOT}/@CMAKE_INSTALL_LIBDIR@/prov:/usr/lib64/libfabric"; export FI_PROVIDER_PATH diff --git a/cmake/templates/oneCCLConfig.cmake.in b/cmake/templates/oneCCLConfig.cmake.in index dd2b988ee..86b7de9f8 100644 --- a/cmake/templates/oneCCLConfig.cmake.in +++ b/cmake/templates/oneCCLConfig.cmake.in @@ -1,4 +1,3 @@ -# Default installation path: <oneccl_root>/lib/cmake/oneCCL/ # # Copyright 2016-2020 Intel Corporation # @@ -14,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License.
# - +# Default installation path: <oneccl_root>/lib/cmake/oneCCL/ get_filename_component(_oneccl_root "${CMAKE_CURRENT_LIST_DIR}" REALPATH) get_filename_component(_oneccl_root "${_oneccl_root}/../../../" ABSOLUTE) @@ -33,7 +32,7 @@ if (_oneccl_subdir EQUAL "cpu_icc") endif() get_filename_component(_oneccl_headers "${_oneccl_root}/include/${_oneccl_subdir}" ABSOLUTE) -get_filename_component(_oneccl_lib "${_oneccl_root}/lib/${_oneccl_subdir}" ABSOLUTE) +get_filename_component(_oneccl_lib "${_oneccl_root}/lib/${_oneccl_subdir}/libccl.so" ABSOLUTE) if (EXISTS "${_oneccl_headers}" AND EXISTS "${_oneccl_lib}") if (NOT TARGET oneCCL) diff --git a/cmake/templates/oneCCLConfigVersion.cmake.in b/cmake/templates/oneCCLConfigVersion.cmake.in index 89d23cefd..d571a725c 100644 --- a/cmake/templates/oneCCLConfigVersion.cmake.in +++ b/cmake/templates/oneCCLConfigVersion.cmake.in @@ -1,4 +1,3 @@ -set(PACKAGE_VERSION @PROJECT_VERSION@) # # Copyright 2016-2020 Intel Corporation # @@ -14,13 +13,14 @@ set(PACKAGE_VERSION @PROJECT_VERSION@) # See the License for the specific language governing permissions and # limitations under the License. # - -if ("${PACKAGE_VERSION}" VERSION_LESS "${PACKAGE_FIND_VERSION}") - set(PACKAGE_VERSION_COMPATIBLE FALSE) -else() - set(PACKAGE_VERSION_COMPATIBLE TRUE) - if ("${PACKAGE_VERSION}" VERSION_EQUAL "${PACKAGE_FIND_VERSION}") - set(PACKAGE_VERSION_EXACT TRUE) - endif() -endif() - +set(PACKAGE_VERSION @PROJECT_VERSION@) + +if ("${PACKAGE_VERSION}" VERSION_LESS "${PACKAGE_FIND_VERSION}") + set(PACKAGE_VERSION_COMPATIBLE FALSE) +else() + set(PACKAGE_VERSION_COMPATIBLE TRUE) + if ("${PACKAGE_VERSION}" VERSION_EQUAL "${PACKAGE_FIND_VERSION}") + set(PACKAGE_VERSION_EXACT TRUE) + endif() +endif() + diff --git a/cmake/vars.sh.in b/cmake/vars.sh.in index e48ec98ec..5e47752ed 100644 --- a/cmake/vars.sh.in +++ b/cmake/vars.sh.in @@ -52,6 +52,17 @@ if [ "$_vars_this_script_name" = "$(_vars_get_proc_name "$0")" ] ; then return 255 2>/dev/null || exit 255 fi +prepend_path() ( + path_to_add="$1" + path_is_now="$2" + + if [ "" = "${path_is_now}" ] ; then # avoid dangling ":" + printf "%s" "${path_to_add}" + else + printf "%s" "${path_to_add}:${path_is_now}" + fi +) + vars_script_name="" vars_script_shell="$(ps -p "$$" -o comm=)" @@ -87,6 +98,7 @@ if [ "" = "$vars_script_name" ] ; then >&2 echo " ERROR: Unable to proceed: possible causes listed below." >&2 echo " This script must be sourced. Did you execute or source this script?" ; >&2 echo " Unrecognized/unsupported shell (supported: bash, zsh, ksh, m/lksh, dash)." ; + >&2 echo " May fail in dash if you rename this script (assumes \"vars.sh\")." ; >&2 echo " Can be caused by sourcing from ZSH version 4.x or older." 
; return 255 2>/dev/null || exit 255 fi @@ -95,6 +107,6 @@ WORK_DIR=$(get_script_path "${vars_script_name:-}") CCL_ROOT="$(cd "${WORK_DIR}"/../; pwd -P)"; export CCL_ROOT -CPATH="${CCL_ROOT}/include/${CPATH+:${CPATH}}"; export CPATH -LIBRARY_PATH="${CCL_ROOT}/lib/${LIBRARY_PATH+:${LIBRARY_PATH}}"; export LIBRARY_PATH -LD_LIBRARY_PATH="${CCL_ROOT}/lib/${LD_LIBRARY_PATH+:${LD_LIBRARY_PATH}}"; export LD_LIBRARY_PATH +CPATH=$(prepend_path "${CCL_ROOT}/include" "${CPATH:-}"); export CPATH +LIBRARY_PATH=$(prepend_path "${CCL_ROOT}/lib" "${LIBRARY_PATH:-}"); export LIBRARY_PATH +LD_LIBRARY_PATH=$(prepend_path "${CCL_ROOT}/lib" "${LD_LIBRARY_PATH:-}"); export LD_LIBRARY_PATH diff --git a/deps/hwloc/include/hwloc.h b/deps/hwloc/include/hwloc.h new file mode 100644 index 000000000..6dd1159f7 --- /dev/null +++ b/deps/hwloc/include/hwloc.h @@ -0,0 +1,2479 @@ +/* + * Copyright © 2009 CNRS + * Copyright © 2009-2021 Inria. All rights reserved. + * Copyright © 2009-2012 Université Bordeaux + * Copyright © 2009-2020 Cisco Systems, Inc. All rights reserved. + * See COPYING in top-level directory. + */ + +/*===================================================================== + * PLEASE GO READ THE DOCUMENTATION! + * ------------------------------------------------ + * $tarball_directory/doc/doxygen-doc/ + * or + * https://www.open-mpi.org/projects/hwloc/doc/ + *===================================================================== + * + * FAIR WARNING: Do NOT expect to be able to figure out all the + * subtleties of hwloc by simply reading function prototypes and + * constant descriptions here in this file. + * + * Hwloc has wonderful documentation in both PDF and HTML formats for + * your reading pleasure. The formal documentation explains a LOT of + * hwloc-specific concepts, provides definitions, and discusses the + * "big picture" for many of the things that you'll find here in this + * header file. + * + * The PDF/HTML documentation was generated via Doxygen; much of what + * you'll see in there is also here in this file. BUT THERE IS A LOT + * THAT IS IN THE PDF/HTML THAT IS ***NOT*** IN hwloc.h! + * + * There are entire paragraph-length descriptions, discussions, and + * pretty pictures to explain subtle corner cases, provide concrete + * examples, etc. + * + * Please, go read the documentation. :-) + * + * Moreover there are several examples of hwloc use under doc/examples + * in the source tree. + * + *=====================================================================*/ + +/** \file + * \brief The hwloc API. + * + * See hwloc/bitmap.h for bitmap specific macros. + * See hwloc/helper.h for high-level topology traversal helpers. + * See hwloc/inlines.h for the actual inline code of some functions below. + * See hwloc/export.h for exporting topologies to XML or to synthetic descriptions. + * See hwloc/distances.h for querying and modifying distances between objects. + * See hwloc/diff.h for manipulating differences between similar topologies. + */ + +#ifndef HWLOC_H +#define HWLOC_H + +#include "hwloc/autogen/config.h" + +#include <sys/types.h> +#include <stdio.h> +#include <string.h> +#include <limits.h> + +/* + * Symbol transforms + */ +#include "hwloc/rename.h" + +/* + * Bitmap definitions + */ + +#include "hwloc/bitmap.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +/** \defgroup hwlocality_api_version API version + * @{ + */ + +/** \brief Indicate at build time which hwloc API version is being used.
+ * + * This number is updated to (X<<16)+(Y<<8)+Z when a new release X.Y.Z + * actually modifies the API. + * + * Users may check for available features at build time using this number + * (see \ref faq_version_api). + * + * \note This should not be confused with HWLOC_VERSION, the library version. + * Two stable releases of the same series usually have the same ::HWLOC_API_VERSION + * even if their HWLOC_VERSION are different. + */ +#define HWLOC_API_VERSION 0x00020500 + +/** \brief Indicate at runtime which hwloc API version was used at build time. + * + * Should be ::HWLOC_API_VERSION if running on the same version. + */ +HWLOC_DECLSPEC unsigned hwloc_get_api_version(void); + +/** \brief Current component and plugin ABI version (see hwloc/plugins.h) */ +#define HWLOC_COMPONENT_ABI 7 + +/** @} */ + + + +/** \defgroup hwlocality_object_sets Object Sets (hwloc_cpuset_t and hwloc_nodeset_t) + * + * Hwloc uses bitmaps to represent two distinct kinds of object sets: + * CPU sets (::hwloc_cpuset_t) and NUMA node sets (::hwloc_nodeset_t). + * These types are both typedefs to a common back end type + * (::hwloc_bitmap_t), and therefore all the hwloc bitmap functions + * are applicable to both ::hwloc_cpuset_t and ::hwloc_nodeset_t (see + * \ref hwlocality_bitmap). + * + * The rationale for having two different types is that even though + * the actions one wants to perform on these types are the same (e.g., + * enable and disable individual items in the set/mask), they're used + * in very different contexts: one for specifying which processors to + * use and one for specifying which NUMA nodes to use. Hence, the + * name difference is really just to reflect the intent of where the + * type is used. + * + * @{ + */ + +/** \brief A CPU set is a bitmap whose bits are set according to CPU + * physical OS indexes. + * + * It may be consulted and modified with the bitmap API as any + * ::hwloc_bitmap_t (see hwloc/bitmap.h). + * + * Each bit may be converted into a PU object using + * hwloc_get_pu_obj_by_os_index(). + */ +typedef hwloc_bitmap_t hwloc_cpuset_t; +/** \brief A non-modifiable ::hwloc_cpuset_t. */ +typedef hwloc_const_bitmap_t hwloc_const_cpuset_t; + +/** \brief A node set is a bitmap whose bits are set according to NUMA + * memory node physical OS indexes. + * + * It may be consulted and modified with the bitmap API as any + * ::hwloc_bitmap_t (see hwloc/bitmap.h). + * Each bit may be converted into a NUMA node object using + * hwloc_get_numanode_obj_by_os_index(). + * + * When binding memory on a system without any NUMA node, + * the single main memory bank is considered as NUMA node #0. + * + * See also \ref hwlocality_helper_nodeset_convert. + */ +typedef hwloc_bitmap_t hwloc_nodeset_t; +/** \brief A non-modifiable ::hwloc_nodeset_t. + */ +typedef hwloc_const_bitmap_t hwloc_const_nodeset_t; + +/** @} */ + + + +/** \defgroup hwlocality_object_types Object Types + * @{ + */ + +/** \brief Type of topology object. + * + * \note Do not rely on the ordering or completeness of the values as new ones + * may be defined in the future! If you need to compare types, use + * hwloc_compare_types() instead. + */ +typedef enum { + +/** \cond */ +#define HWLOC_OBJ_TYPE_MIN HWLOC_OBJ_MACHINE /* Sentinel value */ +/** \endcond */ + + HWLOC_OBJ_MACHINE, /**< \brief Machine. + * A set of processors and memory with cache + * coherency. + * + * This type is always used for the root object of a topology, + * and never used anywhere else. + * Hence its parent is always \c NULL. 
+ */ + + HWLOC_OBJ_PACKAGE, /**< \brief Physical package. + * The physical package that usually gets inserted + * into a socket on the motherboard. + * A processor package usually contains multiple cores, + * and possibly some dies. + */ + HWLOC_OBJ_CORE, /**< \brief Core. + * A computation unit (may be shared by several + * PUs, aka logical processors). + */ + HWLOC_OBJ_PU, /**< \brief Processing Unit, or (Logical) Processor. + * An execution unit (may share a core with some + * other logical processors, e.g. in the case of + * an SMT core). + * + * This is the smallest object representing CPU resources, + * it cannot have any child except Misc objects. + * + * Objects of this kind are always reported and can + * thus be used as fallback when others are not. + */ + + HWLOC_OBJ_L1CACHE, /**< \brief Level 1 Data (or Unified) Cache. */ + HWLOC_OBJ_L2CACHE, /**< \brief Level 2 Data (or Unified) Cache. */ + HWLOC_OBJ_L3CACHE, /**< \brief Level 3 Data (or Unified) Cache. */ + HWLOC_OBJ_L4CACHE, /**< \brief Level 4 Data (or Unified) Cache. */ + HWLOC_OBJ_L5CACHE, /**< \brief Level 5 Data (or Unified) Cache. */ + + HWLOC_OBJ_L1ICACHE, /**< \brief Level 1 instruction Cache (filtered out by default). */ + HWLOC_OBJ_L2ICACHE, /**< \brief Level 2 instruction Cache (filtered out by default). */ + HWLOC_OBJ_L3ICACHE, /**< \brief Level 3 instruction Cache (filtered out by default). */ + + HWLOC_OBJ_GROUP, /**< \brief Group objects. + * Objects which do not fit in the above but are + * detected by hwloc and are useful to take into + * account for affinity. For instance, some operating systems + * expose their arbitrary processors aggregation this + * way. And hwloc may insert such objects to group + * NUMA nodes according to their distances. + * See also \ref faq_groups. + * + * These objects are removed when they do not bring + * any structure (see ::HWLOC_TYPE_FILTER_KEEP_STRUCTURE). + */ + + HWLOC_OBJ_NUMANODE, /**< \brief NUMA node. + * An object that contains memory that is directly + * and byte-accessible to the host processors. + * It is usually close to some cores (the corresponding objects + * are descendants of the NUMA node object in the hwloc tree). + * + * This is the smallest object representing Memory resources, + * it cannot have any child except Misc objects. + * However it may have Memory-side cache parents. + * + * There is always at least one such object in the topology + * even if the machine is not NUMA. + * + * Memory objects are not listed in the main children list, + * but rather in the dedicated Memory children list. + * + * NUMA nodes have a special depth ::HWLOC_TYPE_DEPTH_NUMANODE + * instead of a normal depth just like other objects in the + * main tree. + */ + + HWLOC_OBJ_BRIDGE, /**< \brief Bridge (filtered out by default). + * Any bridge (or PCI switch) that connects the host or an I/O bus, + * to another I/O bus. + * + * Bridges are not added to the topology unless their + * filtering is changed (see hwloc_topology_set_type_filter() + * and hwloc_topology_set_io_types_filter()). + * + * I/O objects are not listed in the main children list, + * but rather in the dedicated io children list. + * I/O objects have NULL CPU and node sets. + */ + HWLOC_OBJ_PCI_DEVICE, /**< \brief PCI device (filtered out by default). + * + * PCI devices are not added to the topology unless their + * filtering is changed (see hwloc_topology_set_type_filter() + * and hwloc_topology_set_io_types_filter()). 
+ * + * I/O objects are not listed in the main children list, + * but rather in the dedicated io children list. + * I/O objects have NULL CPU and node sets. + */ + HWLOC_OBJ_OS_DEVICE, /**< \brief Operating system device (filtered out by default). + * + * OS devices are not added to the topology unless their + * filtering is changed (see hwloc_topology_set_type_filter() + * and hwloc_topology_set_io_types_filter()). + * + * I/O objects are not listed in the main children list, + * but rather in the dedicated io children list. + * I/O objects have NULL CPU and node sets. + */ + + HWLOC_OBJ_MISC, /**< \brief Miscellaneous objects (filtered out by default). + * Objects without particular meaning, that can e.g. be + * added by the application for its own use, or by hwloc + * for miscellaneous objects such as MemoryModule (DIMMs). + * + * They are not added to the topology unless their filtering + * is changed (see hwloc_topology_set_type_filter()). + * + * These objects are not listed in the main children list, + * but rather in the dedicated misc children list. + * Misc objects may only have Misc objects as children, + * and those are in the dedicated misc children list as well. + * Misc objects have NULL CPU and node sets. + */ + + HWLOC_OBJ_MEMCACHE, /**< \brief Memory-side cache (filtered out by default). + * A cache in front of a specific NUMA node. + * + * This object always has at least one NUMA node as a memory child. + * + * Memory objects are not listed in the main children list, + * but rather in the dedicated Memory children list. + * + * Memory-side cache have a special depth ::HWLOC_TYPE_DEPTH_MEMCACHE + * instead of a normal depth just like other objects in the + * main tree. + */ + + HWLOC_OBJ_DIE, /**< \brief Die within a physical package. + * A subpart of the physical package, that contains multiple cores. + */ + + HWLOC_OBJ_TYPE_MAX /**< \private Sentinel value */ +} hwloc_obj_type_t; + +/** \brief Cache type. */ +typedef enum hwloc_obj_cache_type_e { + HWLOC_OBJ_CACHE_UNIFIED, /**< \brief Unified cache. */ + HWLOC_OBJ_CACHE_DATA, /**< \brief Data cache. */ + HWLOC_OBJ_CACHE_INSTRUCTION /**< \brief Instruction cache (filtered out by default). */ +} hwloc_obj_cache_type_t; + +/** \brief Type of one side (upstream or downstream) of an I/O bridge. */ +typedef enum hwloc_obj_bridge_type_e { + HWLOC_OBJ_BRIDGE_HOST, /**< \brief Host-side of a bridge, only possible upstream. */ + HWLOC_OBJ_BRIDGE_PCI /**< \brief PCI-side of a bridge. */ +} hwloc_obj_bridge_type_t; + +/** \brief Type of a OS device. */ +typedef enum hwloc_obj_osdev_type_e { + HWLOC_OBJ_OSDEV_BLOCK, /**< \brief Operating system block device, or non-volatile memory device. + * For instance "sda" or "dax2.0" on Linux. */ + HWLOC_OBJ_OSDEV_GPU, /**< \brief Operating system GPU device. + * For instance ":0.0" for a GL display, + * "card0" for a Linux DRM device. */ + HWLOC_OBJ_OSDEV_NETWORK, /**< \brief Operating system network device. + * For instance the "eth0" interface on Linux. */ + HWLOC_OBJ_OSDEV_OPENFABRICS, /**< \brief Operating system openfabrics device. + * For instance the "mlx4_0" InfiniBand HCA, + * or "hfi1_0" Omni-Path interface on Linux. */ + HWLOC_OBJ_OSDEV_DMA, /**< \brief Operating system dma engine device. + * For instance the "dma0chan0" DMA channel on Linux. */ + HWLOC_OBJ_OSDEV_COPROC /**< \brief Operating system co-processor device. + * For instance "opencl0d0" for a OpenCL device, + * "cuda0" for a CUDA device. 
*/ +} hwloc_obj_osdev_type_t; + +/** \brief Compare the depth of two object types + * + * Types shouldn't be compared as they are, since newer ones may be added in + * the future. This function returns less than, equal to, or greater than zero + * respectively if \p type1 objects usually include \p type2 objects, are the + * same as \p type2 objects, or are included in \p type2 objects. If the types + * can not be compared (because neither is usually contained in the other), + * ::HWLOC_TYPE_UNORDERED is returned. Object types containing CPUs can always + * be compared (usually, a system contains machines which contain nodes which + * contain packages which contain caches, which contain cores, which contain + * processors). + * + * \note ::HWLOC_OBJ_PU will always be the deepest, + * while ::HWLOC_OBJ_MACHINE is always the highest. + * + * \note This does not mean that the actual topology will respect that order: + * e.g. as of today cores may also contain caches, and packages may also contain + * nodes. This is thus just to be seen as a fallback comparison method. + */ +HWLOC_DECLSPEC int hwloc_compare_types (hwloc_obj_type_t type1, hwloc_obj_type_t type2) __hwloc_attribute_const; + +/** \brief Value returned by hwloc_compare_types() when types can not be compared. \hideinitializer */ +#define HWLOC_TYPE_UNORDERED INT_MAX + +/** @} */ + + + +/** \defgroup hwlocality_objects Object Structure and Attributes + * @{ + */ + +union hwloc_obj_attr_u; + +/** \brief Structure of a topology object + * + * Applications must not modify any field except \p hwloc_obj.userdata. + */ +struct hwloc_obj { + /* physical information */ + hwloc_obj_type_t type; /**< \brief Type of object */ + char *subtype; /**< \brief Subtype string to better describe the type field. */ + + unsigned os_index; /**< \brief OS-provided physical index number. + * It is not guaranteed unique across the entire machine, + * except for PUs and NUMA nodes. + * Set to HWLOC_UNKNOWN_INDEX if unknown or irrelevant for this object. + */ +#define HWLOC_UNKNOWN_INDEX (unsigned)-1 + + char *name; /**< \brief Object-specific name if any. + * Mostly used for identifying OS devices and Misc objects where + * a name string is more useful than numerical indexes. + */ + + hwloc_uint64_t total_memory; /**< \brief Total memory (in bytes) in NUMA nodes below this object. */ + + union hwloc_obj_attr_u *attr; /**< \brief Object type-specific Attributes, + * may be \c NULL if no attribute value was found */ + + /* global position */ + int depth; /**< \brief Vertical index in the hierarchy. + * + * For normal objects, this is the depth of the horizontal level + * that contains this object and its cousins of the same type. + * If the topology is symmetric, this is equal to the parent depth + * plus one, and also equal to the number of parent/child links + * from the root object to here. + * + * For special objects (NUMA nodes, I/O and Misc) that are not + * in the main tree, this is a special negative value that + * corresponds to their dedicated level, + * see hwloc_get_type_depth() and ::hwloc_get_type_depth_e. + * Those special values can be passed to hwloc functions such + * hwloc_get_nbobjs_by_depth() as usual. + */ + unsigned logical_index; /**< \brief Horizontal index in the whole list of similar objects, + * hence guaranteed unique across the entire machine. + * Could be a "cousin_rank" since it's the rank within the "cousin" list below + * Note that this index may change when restricting the topology + * or when inserting a group. 
+ */ + + /* cousins are all objects of the same type (and depth) across the entire topology */ + struct hwloc_obj *next_cousin; /**< \brief Next object of same type and depth */ + struct hwloc_obj *prev_cousin; /**< \brief Previous object of same type and depth */ + + /* children of the same parent are siblings, even if they may have different type and depth */ + struct hwloc_obj *parent; /**< \brief Parent, \c NULL if root (Machine object) */ + unsigned sibling_rank; /**< \brief Index in parent's \c children[] array. Or the index in parent's Memory, I/O or Misc children list. */ + struct hwloc_obj *next_sibling; /**< \brief Next object below the same parent (inside the same list of children). */ + struct hwloc_obj *prev_sibling; /**< \brief Previous object below the same parent (inside the same list of children). */ + /** @name List and array of normal children below this object (except Memory, I/O and Misc children). */ + /**@{*/ + unsigned arity; /**< \brief Number of normal children. + * Memory, Misc and I/O children are not listed here + * but rather in their dedicated children list. + */ + struct hwloc_obj **children; /**< \brief Normal children, \c children[0 .. arity -1] */ + struct hwloc_obj *first_child; /**< \brief First normal child */ + struct hwloc_obj *last_child; /**< \brief Last normal child */ + /**@}*/ + + int symmetric_subtree; /**< \brief Set if the subtree of normal objects below this object is symmetric, + * which means all normal children and their children have identical subtrees. + * + * Memory, I/O and Misc children are ignored. + * + * If set in the topology root object, lstopo may export the topology + * as a synthetic string. + */ + + /** @name List of Memory children below this object. */ + /**@{*/ + unsigned memory_arity; /**< \brief Number of Memory children. + * These children are listed in \p memory_first_child. + */ + struct hwloc_obj *memory_first_child; /**< \brief First Memory child. + * NUMA nodes and Memory-side caches are listed here + * (\p memory_arity and \p memory_first_child) + * instead of in the normal children list. + * See also hwloc_obj_type_is_memory(). + * + * A memory hierarchy starts from a normal CPU-side object + * (e.g. Package) and ends with NUMA nodes as leaves. + * There might exist some memory-side caches between them + * in the middle of the memory subtree. + */ + /**@}*/ + + /** @name List of I/O children below this object. */ + /**@{*/ + unsigned io_arity; /**< \brief Number of I/O children. + * These children are listed in \p io_first_child. + */ + struct hwloc_obj *io_first_child; /**< \brief First I/O child. + * Bridges, PCI and OS devices are listed here (\p io_arity and \p io_first_child) + * instead of in the normal children list. + * See also hwloc_obj_type_is_io(). + */ + /**@}*/ + + /** @name List of Misc children below this object. */ + /**@{*/ + unsigned misc_arity; /**< \brief Number of Misc children. + * These children are listed in \p misc_first_child. + */ + struct hwloc_obj *misc_first_child; /**< \brief First Misc child. + * Misc objects are listed here (\p misc_arity and \p misc_first_child) + * instead of in the normal children list. + */ + /**@}*/ + + /* cpusets and nodesets */ + hwloc_cpuset_t cpuset; /**< \brief CPUs covered by this object + * + * This is the set of CPUs for which there are PU objects in the topology + * under this object, i.e. which are known to be physically contained in this + * object and known how (the children path between this object and the PU + * objects). 
+ * + * If the ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED configuration flag is set, + * some of these CPUs may not be allowed for binding, + * see hwloc_topology_get_allowed_cpuset(). + * + * \note All objects have non-NULL CPU and node sets except Misc and I/O objects. + * + * \note Its value must not be changed, hwloc_bitmap_dup() must be used instead. + */ + hwloc_cpuset_t complete_cpuset; /**< \brief The complete CPU set of processors of this object, + * + * This may include not only the same as the cpuset field, but also some CPUs for + * which topology information is unknown or incomplete, some offlines CPUs, and + * the CPUs that are ignored when the ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED flag + * is not set. + * Thus no corresponding PU object may be found in the topology, because the + * precise position is undefined. It is however known that it would be somewhere + * under this object. + * + * \note Its value must not be changed, hwloc_bitmap_dup() must be used instead. + */ + + hwloc_nodeset_t nodeset; /**< \brief NUMA nodes covered by this object or containing this object + * + * This is the set of NUMA nodes for which there are NUMA node objects in the + * topology under or above this object, i.e. which are known to be physically + * contained in this object or containing it and known how (the children path + * between this object and the NUMA node objects). + * + * In the end, these nodes are those that are close to the current object. + * Function hwloc_get_local_numanode_objs() may be used to list those NUMA + * nodes more precisely. + * + * If the ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED configuration flag is set, + * some of these nodes may not be allowed for allocation, + * see hwloc_topology_get_allowed_nodeset(). + * + * If there are no NUMA nodes in the machine, all the memory is close to this + * object, so only the first bit may be set in \p nodeset. + * + * \note All objects have non-NULL CPU and node sets except Misc and I/O objects. + * + * \note Its value must not be changed, hwloc_bitmap_dup() must be used instead. + */ + hwloc_nodeset_t complete_nodeset; /**< \brief The complete NUMA node set of this object, + * + * This may include not only the same as the nodeset field, but also some NUMA + * nodes for which topology information is unknown or incomplete, some offlines + * nodes, and the nodes that are ignored when the ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED + * flag is not set. + * Thus no corresponding NUMA node object may be found in the topology, because the + * precise position is undefined. It is however known that it would be + * somewhere under this object. + * + * If there are no NUMA nodes in the machine, all the memory is close to this + * object, so only the first bit is set in \p complete_nodeset. + * + * \note Its value must not be changed, hwloc_bitmap_dup() must be used instead. + */ + + struct hwloc_info_s *infos; /**< \brief Array of stringified info type=name. */ + unsigned infos_count; /**< \brief Size of infos array. */ + + /* misc */ + void *userdata; /**< \brief Application-given private data pointer, + * initialized to \c NULL, use it as you wish. + * See hwloc_topology_set_userdata_export_callback() in hwloc/export.h + * if you wish to export this field to XML. */ + + hwloc_uint64_t gp_index; /**< \brief Global persistent index. + * Generated by hwloc, unique across the topology (contrary to os_index) + * and persistent across topology changes (contrary to logical_index). 
+ * Mostly used internally, but could also be used by application to identify objects. + */ +}; +/** + * \brief Convenience typedef; a pointer to a struct hwloc_obj. + */ +typedef struct hwloc_obj * hwloc_obj_t; + +/** \brief Object type-specific Attributes */ +union hwloc_obj_attr_u { + /** \brief NUMA node-specific Object Attributes */ + struct hwloc_numanode_attr_s { + hwloc_uint64_t local_memory; /**< \brief Local memory (in bytes) */ + unsigned page_types_len; /**< \brief Size of array \p page_types */ + /** \brief Array of local memory page types, \c NULL if no local memory and \p page_types is 0. + * + * The array is sorted by increasing \p size fields. + * It contains \p page_types_len slots. + */ + struct hwloc_memory_page_type_s { + hwloc_uint64_t size; /**< \brief Size of pages */ + hwloc_uint64_t count; /**< \brief Number of pages of this size */ + } * page_types; + } numanode; + + /** \brief Cache-specific Object Attributes */ + struct hwloc_cache_attr_s { + hwloc_uint64_t size; /**< \brief Size of cache in bytes */ + unsigned depth; /**< \brief Depth of cache (e.g., L1, L2, ...etc.) */ + unsigned linesize; /**< \brief Cache-line size in bytes. 0 if unknown */ + int associativity; /**< \brief Ways of associativity, + * -1 if fully associative, 0 if unknown */ + hwloc_obj_cache_type_t type; /**< \brief Cache type */ + } cache; + /** \brief Group-specific Object Attributes */ + struct hwloc_group_attr_s { + unsigned depth; /**< \brief Depth of group object. + * It may change if intermediate Group objects are added. */ + unsigned kind; /**< \brief Internally-used kind of group. */ + unsigned subkind; /**< \brief Internally-used subkind to distinguish different levels of groups with same kind */ + unsigned char dont_merge; /**< \brief Flag preventing groups from being automatically merged with identical parent or children. */ + } group; + /** \brief PCI Device specific Object Attributes */ + struct hwloc_pcidev_attr_s { +#ifndef HWLOC_HAVE_32BITS_PCI_DOMAIN + unsigned short domain; /* Only 16bits PCI domains are supported by default */ +#else + unsigned int domain; /* 32bits PCI domain support break the library ABI, hence it's disabled by default */ +#endif + unsigned char bus, dev, func; + unsigned short class_id; + unsigned short vendor_id, device_id, subvendor_id, subdevice_id; + unsigned char revision; + float linkspeed; /* in GB/s */ + } pcidev; + /** \brief Bridge specific Object Attribues */ + struct hwloc_bridge_attr_s { + union { + struct hwloc_pcidev_attr_s pci; + } upstream; + hwloc_obj_bridge_type_t upstream_type; + union { + struct { +#ifndef HWLOC_HAVE_32BITS_PCI_DOMAIN + unsigned short domain; /* Only 16bits PCI domains are supported by default */ +#else + unsigned int domain; /* 32bits PCI domain support break the library ABI, hence it's disabled by default */ +#endif + unsigned char secondary_bus, subordinate_bus; + } pci; + } downstream; + hwloc_obj_bridge_type_t downstream_type; + unsigned depth; + } bridge; + /** \brief OS Device specific Object Attributes */ + struct hwloc_osdev_attr_s { + hwloc_obj_osdev_type_t type; + } osdev; +}; + +/** \brief Object info + * + * \sa hwlocality_info_attr + */ +struct hwloc_info_s { + char *name; /**< \brief Info name */ + char *value; /**< \brief Info value */ +}; + +/** @} */ + + + +/** \defgroup hwlocality_creation Topology Creation and Destruction + * @{ + */ + +struct hwloc_topology; +/** \brief Topology context + * + * To be initialized with hwloc_topology_init() and built with hwloc_topology_load(). 
+ */ +typedef struct hwloc_topology * hwloc_topology_t; + +/** \brief Allocate a topology context. + * + * \param[out] topologyp is assigned a pointer to the new allocated context. + * + * \return 0 on success, -1 on error. + */ +HWLOC_DECLSPEC int hwloc_topology_init (hwloc_topology_t *topologyp); + +/** \brief Build the actual topology + * + * Build the actual topology once initialized with hwloc_topology_init() and + * tuned with \ref hwlocality_configuration and \ref hwlocality_setsource routines. + * No other routine may be called earlier using this topology context. + * + * \param topology is the topology to be loaded with objects. + * + * \return 0 on success, -1 on error. + * + * \note On failure, the topology is reinitialized. It should be either + * destroyed with hwloc_topology_destroy() or configured and loaded again. + * + * \note This function may be called only once per topology. + * + * \note The binding of the current thread or process may temporarily change + * during this call but it will be restored before it returns. + * + * \sa hwlocality_configuration and hwlocality_setsource + */ +HWLOC_DECLSPEC int hwloc_topology_load(hwloc_topology_t topology); + +/** \brief Terminate and free a topology context + * + * \param topology is the topology to be freed + */ +HWLOC_DECLSPEC void hwloc_topology_destroy (hwloc_topology_t topology); + +/** \brief Duplicate a topology. + * + * The entire topology structure as well as its objects + * are duplicated into a new one. + * + * This is useful for keeping a backup while modifying a topology. + * + * \note Object userdata is not duplicated since hwloc does not know what it point to. + * The objects of both old and new topologies will point to the same userdata. + */ +HWLOC_DECLSPEC int hwloc_topology_dup(hwloc_topology_t *newtopology, hwloc_topology_t oldtopology); + +/** \brief Verify that the topology is compatible with the current hwloc library. + * + * This is useful when using the same topology structure (in memory) + * in different libraries that may use different hwloc installations + * (for instance if one library embeds a specific version of hwloc, + * while another library uses a default system-wide hwloc installation). + * + * If all libraries/programs use the same hwloc installation, this function + * always returns success. + * + * \return \c 0 on success. + * + * \return \c -1 with \p errno set to \c EINVAL if incompatible. + * + * \note If sharing between processes with hwloc_shmem_topology_write(), + * the relevant check is already performed inside hwloc_shmem_topology_adopt(). + */ +HWLOC_DECLSPEC int hwloc_topology_abi_check(hwloc_topology_t topology); + +/** \brief Run internal checks on a topology structure + * + * The program aborts if an inconsistency is detected in the given topology. + * + * \param topology is the topology to be checked + * + * \note This routine is only useful to developers. + * + * \note The input topology should have been previously loaded with + * hwloc_topology_load(). + */ +HWLOC_DECLSPEC void hwloc_topology_check(hwloc_topology_t topology); + +/** @} */ + + + +/** \defgroup hwlocality_levels Object levels, depths and types + * @{ + * + * Be sure to see the figure in \ref termsanddefs that shows a + * complete topology tree, including depths, child/sibling/cousin + * relationships, and an example of an asymmetric topology where one + * package has fewer caches than its peers. + */ + +/** \brief Get the depth of the hierarchical tree of objects. 
+ * + * This is the depth of ::HWLOC_OBJ_PU objects plus one. + * + * \note NUMA nodes, I/O and Misc objects are ignored when computing + * the depth of the tree (they are placed on special levels). + */ +HWLOC_DECLSPEC int hwloc_topology_get_depth(hwloc_topology_t __hwloc_restrict topology) __hwloc_attribute_pure; + +/** \brief Returns the depth of objects of type \p type. + * + * If no object of this type is present on the underlying architecture, or if + * the OS doesn't provide this kind of information, the function returns + * ::HWLOC_TYPE_DEPTH_UNKNOWN. + * + * If type is absent but a similar type is acceptable, see also + * hwloc_get_type_or_below_depth() and hwloc_get_type_or_above_depth(). + * + * If ::HWLOC_OBJ_GROUP is given, the function may return ::HWLOC_TYPE_DEPTH_MULTIPLE + * if multiple levels of Groups exist. + * + * If a NUMA node, I/O or Misc object type is given, the function returns a virtual + * value because these objects are stored in special levels that are not CPU-related. + * This virtual depth may be passed to other hwloc functions such as + * hwloc_get_obj_by_depth() but it should not be considered as an actual + * depth by the application. In particular, it should not be compared with + * any other object depth or with the entire topology depth. + * \sa hwloc_get_memory_parents_depth(). + * + * \sa hwloc_type_sscanf_as_depth() for returning the depth of objects + * whose type is given as a string. + */ +HWLOC_DECLSPEC int hwloc_get_type_depth (hwloc_topology_t topology, hwloc_obj_type_t type); + +enum hwloc_get_type_depth_e { + HWLOC_TYPE_DEPTH_UNKNOWN = -1, /**< \brief No object of given type exists in the topology. \hideinitializer */ + HWLOC_TYPE_DEPTH_MULTIPLE = -2, /**< \brief Objects of given type exist at different depth in the topology (only for Groups). \hideinitializer */ + HWLOC_TYPE_DEPTH_NUMANODE = -3, /**< \brief Virtual depth for NUMA nodes. \hideinitializer */ + HWLOC_TYPE_DEPTH_BRIDGE = -4, /**< \brief Virtual depth for bridge object level. \hideinitializer */ + HWLOC_TYPE_DEPTH_PCI_DEVICE = -5, /**< \brief Virtual depth for PCI device object level. \hideinitializer */ + HWLOC_TYPE_DEPTH_OS_DEVICE = -6, /**< \brief Virtual depth for software device object level. \hideinitializer */ + HWLOC_TYPE_DEPTH_MISC = -7, /**< \brief Virtual depth for Misc object. \hideinitializer */ + HWLOC_TYPE_DEPTH_MEMCACHE = -8 /**< \brief Virtual depth for MemCache object. \hideinitializer */ +}; + +/** \brief Return the depth of parents where memory objects are attached. + * + * Memory objects have virtual negative depths because they are not part of + * the main CPU-side hierarchy of objects. This depth should not be compared + * with other level depths. + * + * If all Memory objects are attached to Normal parents at the same depth, + * this parent depth may be compared to other as usual, for instance + * for knowing whether NUMA nodes is attached above or below Packages. + * + * \return The depth of Normal parents of all memory children + * if all these parents have the same depth. For instance the depth of + * the Package level if all NUMA nodes are attached to Package objects. + * + * \return ::HWLOC_TYPE_DEPTH_MULTIPLE if Normal parents of all + * memory children do not have the same depth. For instance if some + * NUMA nodes are attached to Packages while others are attached to + * Groups. 
+ */
+HWLOC_DECLSPEC int hwloc_get_memory_parents_depth (hwloc_topology_t topology);
+
+/** \brief Returns the depth of objects of type \p type or below
+ *
+ * If no object of this type is present on the underlying architecture, the
+ * function returns the depth of the first "present" object typically found
+ * inside \p type.
+ *
+ * This function is only meaningful for normal object types.
+ * If a memory, I/O or Misc object type is given, the corresponding virtual
+ * depth is always returned (see hwloc_get_type_depth()).
+ *
+ * May return ::HWLOC_TYPE_DEPTH_MULTIPLE for ::HWLOC_OBJ_GROUP just like
+ * hwloc_get_type_depth().
+ */
+static __hwloc_inline int
+hwloc_get_type_or_below_depth (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
+
+/** \brief Returns the depth of objects of type \p type or above
+ *
+ * If no object of this type is present on the underlying architecture, the
+ * function returns the depth of the first "present" object typically
+ * containing \p type.
+ *
+ * This function is only meaningful for normal object types.
+ * If a memory, I/O or Misc object type is given, the corresponding virtual
+ * depth is always returned (see hwloc_get_type_depth()).
+ *
+ * May return ::HWLOC_TYPE_DEPTH_MULTIPLE for ::HWLOC_OBJ_GROUP just like
+ * hwloc_get_type_depth().
+ */
+static __hwloc_inline int
+hwloc_get_type_or_above_depth (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
+
+/** \brief Returns the type of objects at depth \p depth.
+ *
+ * \p depth should be between 0 and hwloc_topology_get_depth()-1,
+ * or a virtual depth such as ::HWLOC_TYPE_DEPTH_NUMANODE.
+ *
+ * \return (hwloc_obj_type_t)-1 if depth \p depth does not exist.
+ */
+HWLOC_DECLSPEC hwloc_obj_type_t hwloc_get_depth_type (hwloc_topology_t topology, int depth) __hwloc_attribute_pure;
+
+/** \brief Returns the width of level at depth \p depth.
+ */
+HWLOC_DECLSPEC unsigned hwloc_get_nbobjs_by_depth (hwloc_topology_t topology, int depth) __hwloc_attribute_pure;
+
+/** \brief Returns the width of level type \p type
+ *
+ * If no object for that type exists, 0 is returned.
+ * If there are several levels with objects of that type, -1 is returned.
+ */
+static __hwloc_inline int
+hwloc_get_nbobjs_by_type (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
+
+/** \brief Returns the top object of the topology tree.
+ *
+ * Its type is ::HWLOC_OBJ_MACHINE.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_root_obj (hwloc_topology_t topology) __hwloc_attribute_pure;
+
+/** \brief Returns the topology object at logical index \p idx from depth \p depth */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_get_obj_by_depth (hwloc_topology_t topology, int depth, unsigned idx) __hwloc_attribute_pure;
+
+/** \brief Returns the topology object at logical index \p idx with type \p type
+ *
+ * If no object for that type exists, \c NULL is returned.
+ * If there are several levels with objects of that type (::HWLOC_OBJ_GROUP),
+ * \c NULL is returned and the caller may fallback to hwloc_get_obj_by_depth().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type, unsigned idx) __hwloc_attribute_pure;
+
+/** \brief Returns the next object at depth \p depth.
+ *
+ * If \p prev is \c NULL, return the first object at depth \p depth.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_depth (hwloc_topology_t topology, int depth, hwloc_obj_t prev);
+
+/** \brief Returns the next object of type \p type.
+ *
+ * If \p prev is \c NULL, return the first object of type \p type. If
+ * there are multiple levels or no level with objects of the given type,
+ * return \c NULL and let the caller fallback to hwloc_get_next_obj_by_depth().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type,
+			    hwloc_obj_t prev);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_object_strings Converting between Object Types and Attributes, and Strings
+ * @{
+ */
+
+/** \brief Return a constant stringified object type.
+ *
+ * This function is the basic way to convert a generic type into a string.
+ * The output string may be parsed back by hwloc_type_sscanf().
+ *
+ * hwloc_obj_type_snprintf() may return a more precise output for a specific
+ * object, but it requires the caller to provide the output buffer.
+ */
+HWLOC_DECLSPEC const char * hwloc_obj_type_string (hwloc_obj_type_t type) __hwloc_attribute_const;
+
+/** \brief Stringify the type of a given topology object into a human-readable form.
+ *
+ * Contrary to hwloc_obj_type_string(), this function includes object-specific
+ * attributes (such as the Group depth, the Bridge type, or OS device type)
+ * in the output, and it requires the caller to provide the output buffer.
+ *
+ * The output is guaranteed to be the same for all objects of a same topology level.
+ *
+ * If \p verbose is 1, longer type names are used, e.g. L1Cache instead of L1.
+ *
+ * The output string may be parsed back by hwloc_type_sscanf().
+ *
+ * If \p size is 0, \p string may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_obj_type_snprintf(char * __hwloc_restrict string, size_t size,
+					   hwloc_obj_t obj,
+					   int verbose);
+
+/** \brief Stringify the attributes of a given topology object into a human-readable form.
+ *
+ * Attribute values are separated by \p separator.
+ *
+ * Only the major attributes are printed in non-verbose mode.
+ *
+ * If \p size is 0, \p string may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_obj_attr_snprintf(char * __hwloc_restrict string, size_t size,
+					   hwloc_obj_t obj, const char * __hwloc_restrict separator,
+					   int verbose);
+
+/** \brief Return an object type and attributes from a type string.
+ *
+ * Convert strings such as "Package" or "L1iCache" into the corresponding types.
+ * Matching is case-insensitive, and only the first letters are actually
+ * required to match.
+ *
+ * The matched object type is set in \p typep (which cannot be \c NULL).
+ *
+ * Type-specific attributes, for instance Cache type, Cache depth, Group depth,
+ * Bridge type or OS Device type may be returned in \p attrp.
+ * Attributes that are not specified in the string (for instance "Group"
+ * without a depth, or "L2Cache" without a cache type) are set to -1.
+ *
+ * \p attrp is only filled if not \c NULL and if its size specified in \p attrsize
+ * is large enough. It should be at least as large as union hwloc_obj_attr_u.
+ *
+ * \return 0 if a type was correctly identified, otherwise -1.
+ *
+ * \note This function is guaranteed to match any string returned by
+ * hwloc_obj_type_string() or hwloc_obj_type_snprintf().
+ *
+ * \note This is an extended version of the now deprecated hwloc_obj_type_sscanf().
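+ *
+ * For instance (an illustrative sketch):
+ * \code
+ * hwloc_obj_type_t type;
+ * union hwloc_obj_attr_u attr;
+ * if (hwloc_type_sscanf("L2Cache", &type, &attr, sizeof(attr)) == 0)
+ *   printf("parsed type %s\n", hwloc_obj_type_string(type));
+ * \endcode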
+ */
+HWLOC_DECLSPEC int hwloc_type_sscanf(const char *string,
+				     hwloc_obj_type_t *typep,
+				     union hwloc_obj_attr_u *attrp, size_t attrsize);
+
+/** \brief Return an object type and its level depth from a type string.
+ *
+ * Convert strings such as "Package" or "L1iCache" into the corresponding types
+ * and return in \p depthp the depth of the corresponding level in the
+ * topology \p topology.
+ *
+ * If no object of this type is present on the underlying architecture,
+ * ::HWLOC_TYPE_DEPTH_UNKNOWN is returned.
+ *
+ * If multiple such levels exist (for instance if giving Group without any depth),
+ * the function may return ::HWLOC_TYPE_DEPTH_MULTIPLE instead.
+ *
+ * The matched object type is set in \p typep if \p typep is not \c NULL.
+ *
+ * \note This function is similar to hwloc_type_sscanf() followed
+ * by hwloc_get_type_depth() but it also automatically disambiguates
+ * multiple group levels etc.
+ *
+ * \note This function is guaranteed to match any string returned by
+ * hwloc_obj_type_string() or hwloc_obj_type_snprintf().
+ */
+HWLOC_DECLSPEC int hwloc_type_sscanf_as_depth(const char *string,
+					      hwloc_obj_type_t *typep,
+					      hwloc_topology_t topology, int *depthp);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_info_attr Consulting and Adding Key-Value Info Attributes
+ *
+ * @{
+ */
+
+/** \brief Search the given key name in object infos and return the corresponding value.
+ *
+ * If multiple keys match the given name, only the first one is returned.
+ *
+ * \return \c NULL if no such key exists.
+ */
+static __hwloc_inline const char *
+hwloc_obj_get_info_by_name(hwloc_obj_t obj, const char *name) __hwloc_attribute_pure;
+
+/** \brief Add the given info name and value pair to the given object.
+ *
+ * The info is appended to the existing info array even if another key
+ * with the same name already exists.
+ *
+ * The input strings are copied before being added to the object infos.
+ *
+ * \return \c 0 on success, \c -1 on error.
+ *
+ * \note This function may be used to enforce object colors in the lstopo
+ * graphical output by using "lstopoStyle" as a name and "Background=#rrggbb"
+ * as a value. See CUSTOM COLORS in the lstopo(1) manpage for details.
+ *
+ * \note If \p value contains some non-printable characters, they will
+ * be dropped when exporting to XML, see hwloc_topology_export_xml() in hwloc/export.h.
+ */
+HWLOC_DECLSPEC int hwloc_obj_add_info(hwloc_obj_t obj, const char *name, const char *value);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_cpubinding CPU binding
+ *
+ * Some operating systems only support binding threads or processes to a single PU.
+ * Others allow binding to larger sets such as entire Cores or Packages or
+ * even random sets of individual PUs. In such operating systems, the scheduler
+ * is free to run the task on one of these PUs, then migrate it to another PU, etc.
+ * It is often useful to call hwloc_bitmap_singlify() on the target CPU set before
+ * passing it to the binding function to avoid these expensive migrations.
+ * See the documentation of hwloc_bitmap_singlify() for details.
+ *
+ * Some operating systems do not provide all hwloc-supported
+ * mechanisms to bind processes, threads, etc.
+ * hwloc_topology_get_support() may be used to query the actual CPU
+ * binding support in the currently used operating system.
+ *
+ * When the requested binding operation is not available and the
+ * ::HWLOC_CPUBIND_STRICT flag was passed, the function returns -1.
+ * \p errno is set to \c ENOSYS when it is not possible to bind the requested kind
+ * of object (process or thread). \p errno is set to \c EXDEV when the requested cpuset
+ * cannot be enforced (e.g. some systems only allow one CPU, and some
+ * other systems only allow one NUMA node).
+ *
+ * If ::HWLOC_CPUBIND_STRICT was not passed, the function may fail as well,
+ * or the operating system may use a slightly different operation
+ * (with side-effects, smaller binding set, etc.)
+ * when the requested operation is not exactly supported.
+ *
+ * The most portable version that should be preferred over the others,
+ * whenever possible, is the following one which just binds the current program,
+ * assuming it is single-threaded:
+ *
+ * \code
+ * hwloc_set_cpubind(topology, set, 0);
+ * \endcode
+ *
+ * If the program may be multithreaded, the following one should be preferred
+ * to only bind the current thread:
+ *
+ * \code
+ * hwloc_set_cpubind(topology, set, HWLOC_CPUBIND_THREAD);
+ * \endcode
+ *
+ * \sa Some example codes are available under doc/examples/ in the source tree.
+ *
+ * \note To unbind, just call the binding function with either a full cpuset or
+ * a cpuset equal to the system cpuset.
+ *
+ * \note On some operating systems, CPU binding may have effects on memory binding,
+ * see ::HWLOC_CPUBIND_NOMEMBIND.
+ *
+ * \note Running lstopo \--top or hwloc-ps can be a very convenient way to check
+ * how binding actually happened.
+ * @{
+ */
+
+/** \brief Process/Thread binding flags.
+ *
+ * These bit flags can be used to refine the binding policy.
+ *
+ * The default (0) is to bind the current process, assumed to be
+ * single-threaded, in a non-strict way. This is the most portable
+ * way to bind as all operating systems usually provide it.
+ *
+ * \note Not all systems support all kinds of binding. See the
+ * "Detailed Description" section of \ref hwlocality_cpubinding for a
+ * description of errors that can occur.
+ */
+typedef enum {
+  /** \brief Bind all threads of the current (possibly) multithreaded process.
+   * \hideinitializer */
+  HWLOC_CPUBIND_PROCESS = (1<<0),
+
+  /** \brief Bind current thread of current process.
+   * \hideinitializer */
+  HWLOC_CPUBIND_THREAD = (1<<1),
+
+  /** \brief Request for strict binding from the OS.
+   *
+   * By default, when the designated CPUs are all busy while other
+   * CPUs are idle, operating systems may execute the thread/process
+   * on those other CPUs instead of the designated CPUs, to let them
+   * progress anyway. Strict binding means that the thread/process
+   * will _never_ execute on CPUs other than the designated CPUs, even
+   * when those are busy with other tasks and other CPUs are idle.
+   *
+   * \note Depending on the operating system, strict binding may not
+   * be possible (e.g., the OS does not implement it) or not allowed
+   * (e.g., for administrative reasons), and the function will fail
+   * in that case.
+   *
+   * When retrieving the binding of a process, this flag checks
+   * whether all its threads actually have the same binding. If the
+   * flag is not given, the binding of each thread will be
+   * accumulated.
+   *
+   * \note This flag is meaningless when retrieving the binding of a
+   * thread.
+   * \hideinitializer
+   */
+  HWLOC_CPUBIND_STRICT = (1<<2),
+
+  /** \brief Avoid any effect on memory binding
+   *
+   * On some operating systems, some CPU binding functions would also
+   * bind the memory on the corresponding NUMA node.
+   * It is often not a problem for the application, but if it is, setting this flag
+   * will make hwloc avoid using OS functions that would also bind
+   * memory. This will however reduce the support of CPU bindings,
+   * i.e. potentially return -1 with errno set to ENOSYS in some
+   * cases.
+   *
+   * This flag is only meaningful when used with functions that set
+   * the CPU binding. It is ignored when used with functions that get
+   * CPU binding information.
+   * \hideinitializer
+   */
+  HWLOC_CPUBIND_NOMEMBIND = (1<<3)
+} hwloc_cpubind_flags_t;
+
+/** \brief Bind current process or thread on CPUs given in physical bitmap \p set.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_cpubind(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags);
+
+/** \brief Get current process or thread binding.
+ *
+ * Writes into \p set the physical cpuset which the process or thread (according to \p
+ * flags) was last bound to.
+ */
+HWLOC_DECLSPEC int hwloc_get_cpubind(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+
+/** \brief Bind a process \p pid on CPUs given in physical bitmap \p set.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note As a special case on Linux, if a tid (thread ID) is supplied
+ * instead of a pid (process ID) and ::HWLOC_CPUBIND_THREAD is passed in flags,
+ * the binding is applied to that specific thread.
+ *
+ * \note On non-Linux systems, ::HWLOC_CPUBIND_THREAD cannot be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t set, int flags);
+
+/** \brief Get the current physical binding of process \p pid.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note As a special case on Linux, if a tid (thread ID) is supplied
+ * instead of a pid (process ID) and ::HWLOC_CPUBIND_THREAD is passed in flags,
+ * the binding for that specific thread is returned.
+ *
+ * \note On non-Linux systems, ::HWLOC_CPUBIND_THREAD cannot be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
+
+#ifdef hwloc_thread_t
+/** \brief Bind a thread \p thread on CPUs given in physical bitmap \p set.
+ *
+ * \note \p hwloc_thread_t is \p pthread_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note ::HWLOC_CPUBIND_PROCESS cannot be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_set_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t thread, hwloc_const_cpuset_t set, int flags);
+#endif
+
+#ifdef hwloc_thread_t
+/** \brief Get the current physical binding of thread \p thread.
+ *
+ * \note \p hwloc_thread_t is \p pthread_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note ::HWLOC_CPUBIND_PROCESS cannot be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t thread, hwloc_cpuset_t set, int flags);
+#endif
+
+/** \brief Get the last physical CPU where the current process or thread ran.
+ *
+ * The operating system may move some tasks from one processor
+ * to another at any time according to their binding,
+ * so this function may return something that is already
+ * outdated.
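+ *
+ * For instance (an illustrative sketch; error checking omitted), to query
+ * where the current thread last ran:
+ * \code
+ * hwloc_bitmap_t last = hwloc_bitmap_alloc();
+ * hwloc_get_last_cpu_location(topology, last, HWLOC_CPUBIND_THREAD);
+ * hwloc_bitmap_free(last);
+ * \endcode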
+ *
+ * \p flags can include either ::HWLOC_CPUBIND_PROCESS or ::HWLOC_CPUBIND_THREAD to
+ * specify whether the query should be for the whole process (union of all CPUs
+ * on which all threads are running), or only the current thread. If the
+ * process is single-threaded, flags can be set to zero to let hwloc use
+ * whichever method is available on the underlying OS.
+ */
+HWLOC_DECLSPEC int hwloc_get_last_cpu_location(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+
+/** \brief Get the last physical CPU where a process ran.
+ *
+ * The operating system may move some tasks from one processor
+ * to another at any time according to their binding,
+ * so this function may return something that is already
+ * outdated.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note As a special case on Linux, if a tid (thread ID) is supplied
+ * instead of a pid (process ID) and ::HWLOC_CPUBIND_THREAD is passed in flags,
+ * the last CPU location of that specific thread is returned.
+ *
+ * \note On non-Linux systems, ::HWLOC_CPUBIND_THREAD cannot be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_last_cpu_location(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_membinding Memory binding
+ *
+ * Memory binding can be done in three ways:
+ *
+ * - explicit memory allocation thanks to hwloc_alloc_membind() and friends:
+ *   the binding will have effect on the memory allocated by these functions.
+ * - implicit memory binding through binding policy: hwloc_set_membind() and
+ *   friends only define the current policy of the process, which will be
+ *   applied to the subsequent calls to malloc() and friends.
+ * - migration of existing memory ranges, thanks to hwloc_set_area_membind()
+ *   and friends, which move already-allocated data.
+ *
+ * Not all operating systems support all three ways.
+ * hwloc_topology_get_support() may be used to query the actual memory
+ * binding support in the currently used operating system.
+ *
+ * When the requested binding operation is not available and the
+ * ::HWLOC_MEMBIND_STRICT flag was passed, the function returns -1.
+ * \p errno will be set to \c ENOSYS when the system does not support
+ * the specified action or policy
+ * (e.g., some systems only allow binding memory on a per-thread
+ * basis, whereas other systems only allow binding memory for all
+ * threads in a process).
+ * \p errno will be set to EXDEV when the requested set cannot be enforced
+ * (e.g., some systems only allow binding memory to a single NUMA node).
+ *
+ * If ::HWLOC_MEMBIND_STRICT was not passed, the function may fail as well,
+ * or the operating system may use a slightly different operation
+ * (with side-effects, smaller binding set, etc.)
+ * when the requested operation is not exactly supported.
+ *
+ * The most portable form that should be preferred over the others
+ * whenever possible is as follows.
+ * It allocates some memory hopefully bound to the specified set.
+ * To do so, hwloc will possibly have to change the current memory
+ * binding policy in order to actually get the memory bound, if the OS
+ * does not provide any other way to simply allocate bound memory
+ * without changing the policy for all allocations. That is the
+ * difference with hwloc_alloc_membind(), which will never change the
+ * current memory binding policy.
+ *
+ * \code
+ * hwloc_alloc_membind_policy(topology, size, set,
+ *                            HWLOC_MEMBIND_BIND, 0);
+ * \endcode
+ *
+ * Each hwloc memory binding function takes a bitmap argument that
+ * is a CPU set by default, or a NUMA memory node set if the flag
+ * ::HWLOC_MEMBIND_BYNODESET is specified.
+ * See \ref hwlocality_object_sets and \ref hwlocality_bitmap for a
+ * discussion of CPU sets and NUMA memory node sets.
+ * It is also possible to convert between CPU set and node set using
+ * hwloc_cpuset_to_nodeset() or hwloc_cpuset_from_nodeset().
+ *
+ * Memory binding by CPU set cannot work for CPU-less NUMA memory nodes.
+ * Binding by nodeset should therefore be preferred whenever possible.
+ *
+ * \sa Some example codes are available under doc/examples/ in the source tree.
+ *
+ * \note On some operating systems, memory binding affects the CPU
+ * binding; see ::HWLOC_MEMBIND_NOCPUBIND
+ * @{
+ */
+
+/** \brief Memory binding policy.
+ *
+ * These constants can be used to choose the binding policy. Only one policy can
+ * be used at a time (i.e., the values cannot be OR'ed together).
+ *
+ * Not all systems support all kinds of binding.
+ * hwloc_topology_get_support() may be used to query the actual memory
+ * binding policy support in the currently used operating system.
+ * See the "Detailed Description" section of \ref hwlocality_membinding
+ * for a description of errors that can occur.
+ */
+typedef enum {
+  /** \brief Reset the memory allocation policy to the system default.
+   * Depending on the operating system, this may correspond to
+   * ::HWLOC_MEMBIND_FIRSTTOUCH (Linux, FreeBSD),
+   * or ::HWLOC_MEMBIND_BIND (AIX, HP-UX, Solaris, Windows).
+   * This policy is never returned by get membind functions.
+   * The nodeset argument is ignored.
+   * \hideinitializer */
+  HWLOC_MEMBIND_DEFAULT = 0,
+
+  /** \brief Allocate each memory page individually on the local NUMA
+   * node of the thread that touches it.
+   *
+   * The given nodeset should usually be hwloc_topology_get_topology_nodeset()
+   * so that the touching thread may run and allocate on any node in the system.
+   *
+   * On AIX, if the nodeset is smaller, pages are allocated locally (if the local
+   * node is in the nodeset) or from a random non-local node (otherwise).
+   * \hideinitializer */
+  HWLOC_MEMBIND_FIRSTTOUCH = 1,
+
+  /** \brief Allocate memory on the specified nodes.
+   * \hideinitializer */
+  HWLOC_MEMBIND_BIND = 2,
+
+  /** \brief Allocate memory on the given nodes in an interleaved
+   * / round-robin manner. The precise layout of the memory across
+   * multiple NUMA nodes is OS/system specific. Interleaving can be
+   * useful when threads distributed across the specified NUMA nodes
+   * will all be accessing the whole memory range concurrently, since
+   * the interleave will then balance the memory references.
+   * \hideinitializer */
+  HWLOC_MEMBIND_INTERLEAVE = 3,
+
+  /** \brief For each page bound with this policy, the next time
+   * it is touched (and next time only), it is moved from its current
+   * location to the local NUMA node of the thread where the memory
+   * reference occurred (if it needs to be moved at all).
+   * \hideinitializer */
+  HWLOC_MEMBIND_NEXTTOUCH = 4,
+
+  /** \brief Returned by get_membind() functions when multiple
+   * threads or parts of a memory area have differing memory binding
+   * policies.
+   * Also returned when binding is unknown because binding hooks are empty
+   * when the topology is loaded from XML without HWLOC_THISSYSTEM=1, etc.
+   * \hideinitializer */
+  HWLOC_MEMBIND_MIXED = -1
+} hwloc_membind_policy_t;
+
+/** \brief Memory binding flags.
+ *
+ * These flags can be used to refine the binding policy.
+ * All flags can be logically OR'ed together with the exception of
+ * ::HWLOC_MEMBIND_PROCESS and ::HWLOC_MEMBIND_THREAD;
+ * these two flags are mutually exclusive.
+ *
+ * Not all systems support all kinds of binding.
+ * hwloc_topology_get_support() may be used to query the actual memory
+ * binding support in the currently used operating system.
+ * See the "Detailed Description" section of \ref hwlocality_membinding
+ * for a description of errors that can occur.
+ */
+typedef enum {
+  /** \brief Set policy for all threads of the specified (possibly
+   * multithreaded) process. This flag is mutually exclusive with
+   * ::HWLOC_MEMBIND_THREAD.
+   * \hideinitializer */
+  HWLOC_MEMBIND_PROCESS = (1<<0),
+
+  /** \brief Set policy for a specific thread of the current process.
+   * This flag is mutually exclusive with ::HWLOC_MEMBIND_PROCESS.
+   * \hideinitializer */
+  HWLOC_MEMBIND_THREAD = (1<<1),
+
+  /** \brief Request strict binding from the OS. The function will fail if
+   * the binding cannot be guaranteed / completely enforced.
+   *
+   * This flag has slightly different meanings depending on which
+   * function it is used with.
+   * \hideinitializer */
+  HWLOC_MEMBIND_STRICT = (1<<2),
+
+  /** \brief Migrate existing allocated memory. If the memory cannot
+   * be migrated and the ::HWLOC_MEMBIND_STRICT flag is passed, an error
+   * will be returned.
+   * \hideinitializer */
+  HWLOC_MEMBIND_MIGRATE = (1<<3),
+
+  /** \brief Avoid any effect on CPU binding.
+   *
+   * On some operating systems, some underlying memory binding
+   * functions also bind the application to the corresponding CPU(s).
+   * Using this flag will cause hwloc to avoid using OS functions that
+   * could potentially affect CPU bindings. Note, however, that using
+   * NOCPUBIND may reduce hwloc's overall memory binding
+   * support. Specifically: some of hwloc's memory binding functions
+   * may fail with errno set to ENOSYS when used with NOCPUBIND.
+   * \hideinitializer
+   */
+  HWLOC_MEMBIND_NOCPUBIND = (1<<4),
+
+  /** \brief Consider the bitmap argument as a nodeset.
+   *
+   * The bitmap argument is considered a nodeset if this flag is given,
+   * or a cpuset otherwise by default.
+   *
+   * Memory binding by CPU set cannot work for CPU-less NUMA memory nodes.
+   * Binding by nodeset should therefore be preferred whenever possible.
+   * \hideinitializer
+   */
+  HWLOC_MEMBIND_BYNODESET = (1<<5)
+} hwloc_membind_flags_t;
+
+/** \brief Set the default memory binding policy of the current
+ * process or thread to prefer the NUMA node(s) specified by \p set
+ *
+ * If neither ::HWLOC_MEMBIND_PROCESS nor ::HWLOC_MEMBIND_THREAD is
+ * specified, the current process is assumed to be single-threaded.
+ * This is the most portable form as it permits hwloc to use either
+ * process-based OS functions or thread-based OS functions, depending
+ * on which are available.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_membind(hwloc_topology_t topology, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * current process or thread.
+ *
+ * This function has two output parameters: \p set and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the current memory binding policies and nodesets in
+ * the queried target.
+ *
+ * Passing the ::HWLOC_MEMBIND_PROCESS flag specifies that the query
+ * target is the current policies and nodesets for all the threads in
+ * the current process. Passing ::HWLOC_MEMBIND_THREAD specifies that
+ * the query target is the current policy and nodeset for only the
+ * thread invoking this function.
+ *
+ * If neither of these flags is passed (which is the most portable
+ * method), the process is assumed to be single threaded. This allows
+ * hwloc to use either process-based OS functions or thread-based OS
+ * functions, depending on which are available.
+ *
+ * ::HWLOC_MEMBIND_STRICT is only meaningful when ::HWLOC_MEMBIND_PROCESS
+ * is also specified. In this case, hwloc will check the default
+ * memory policies and nodesets for all threads in the process. If
+ * they are not identical, -1 is returned and errno is set to EXDEV.
+ * If they are identical, the values are returned in \p set and \p
+ * policy.
+ *
+ * Otherwise, if ::HWLOC_MEMBIND_PROCESS is specified (and
+ * ::HWLOC_MEMBIND_STRICT is \em not specified), the default set
+ * from each thread is logically OR'ed together.
+ * If all threads' default policies are the same, \p policy is set to
+ * that policy. If they are different, \p policy is set to
+ * ::HWLOC_MEMBIND_MIXED.
+ *
+ * In the ::HWLOC_MEMBIND_THREAD case (or when neither
+ * ::HWLOC_MEMBIND_PROCESS nor ::HWLOC_MEMBIND_THREAD is specified), there
+ * is only one set and policy; they are returned in \p set and
+ * \p policy, respectively.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ */
+HWLOC_DECLSPEC int hwloc_get_membind(hwloc_topology_t topology, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Set the default memory binding policy of the specified
+ * process to prefer the NUMA node(s) specified by \p set
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ */
+HWLOC_DECLSPEC int hwloc_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * specified process.
+ *
+ * This function has two output parameters: \p set and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the current memory binding policies and nodesets in
+ * the queried target.
+ *
+ * Passing the ::HWLOC_MEMBIND_PROCESS flag specifies that the query
+ * target is the current policies and nodesets for all the threads in
+ * the specified process. If ::HWLOC_MEMBIND_PROCESS is not specified
+ * (which is the most portable method), the process is assumed to be
+ * single threaded. This allows hwloc to use either process-based OS
+ * functions or thread-based OS functions, depending on which are
+ * available.
+ *
+ * Note that it does not make sense to pass ::HWLOC_MEMBIND_THREAD to
+ * this function.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is specified, hwloc will check the default
+ * memory policies and nodesets for all threads in the specified
+ * process. If they are not identical, -1 is returned and errno is
+ * set to EXDEV. If they are identical, the values are returned in \p
+ * set and \p policy.
+ *
+ * Otherwise, \p set is set to the logical OR of all threads'
+ * default set. If all threads' default policies
+ * are the same, \p policy is set to that policy. If they are
+ * different, \p policy is set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Bind the already-allocated memory identified by (\p addr, \p len)
+ * to the NUMA node(s) specified by \p set.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * \return 0 if \p len is 0.
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Query the CPUs near the physical NUMA node(s) and binding policy of
+ * the memory identified by (\p addr, \p len).
+ *
+ * This function has two output parameters: \p set and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the memory binding policies and nodesets of the pages
+ * in the address range.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is specified, the target pages are first
+ * checked to see if they all have the same memory binding policy and
+ * nodeset. If they do not, -1 is returned and errno is set to EXDEV.
+ * If they are identical across all pages, the set and policy are
+ * returned in \p set and \p policy, respectively.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is not specified, the union of all NUMA
+ * node(s) containing pages in the address range is calculated.
+ * If all pages in the target have the same policy, it is returned in
+ * \p policy. Otherwise, \p policy is set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ *
+ * If \p len is 0, -1 is returned and errno is set to EINVAL.
+ */
+HWLOC_DECLSPEC int hwloc_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Get the NUMA nodes where memory identified by (\p addr, \p len) is physically allocated.
+ *
+ * Fills \p set according to the NUMA nodes where the memory area pages
+ * are physically allocated. If no page is actually allocated yet,
+ * \p set may be empty.
+ *
+ * If pages spread to multiple nodes, it is not specified whether they spread
+ * equitably, or whether most of them are on a single node, etc.
+ *
+ * The operating system may move memory pages from one processor
+ * to another at any time according to their binding,
+ * so this function may return something that is already
+ * outdated.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified in \p flags, set is
+ * considered a nodeset. Otherwise it's a cpuset.
+ *
+ * If \p len is 0, \p set is emptied.
+ */
+HWLOC_DECLSPEC int hwloc_get_area_memlocation(hwloc_topology_t topology, const void *addr, size_t len, hwloc_bitmap_t set, int flags);
+
+/** \brief Allocate some memory
+ *
+ * This is equivalent to malloc(), except that it tries to allocate
+ * page-aligned memory from the OS.
+ *
+ * \note The allocated memory should be freed with hwloc_free().
+ */
+HWLOC_DECLSPEC void *hwloc_alloc(hwloc_topology_t topology, size_t len);
+
+/** \brief Allocate some memory on NUMA memory nodes specified by \p set
+ *
+ * \return NULL with errno set to ENOSYS if the action is not supported
+ * and ::HWLOC_MEMBIND_STRICT is given
+ * \return NULL with errno set to EXDEV if the binding cannot be enforced
+ * and ::HWLOC_MEMBIND_STRICT is given
+ * \return NULL with errno set to ENOMEM if the memory allocation failed
+ * even before trying to bind.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * \note The allocated memory should be freed with hwloc_free().
+ */
+HWLOC_DECLSPEC void *hwloc_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
+
+/** \brief Allocate some memory on NUMA memory nodes specified by \p set
+ *
+ * This is similar to hwloc_alloc_membind() except that it is allowed to change
+ * the current memory binding policy, thus providing more binding support, at
+ * the expense of changing the current state.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ */
+static __hwloc_inline void *
+hwloc_alloc_membind_policy(hwloc_topology_t topology, size_t len, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
+
+/** \brief Free memory that was previously allocated by hwloc_alloc()
+ * or hwloc_alloc_membind().
+ */
+HWLOC_DECLSPEC int hwloc_free(hwloc_topology_t topology, void *addr, size_t len);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_setsource Changing the Source of Topology Discovery
+ *
+ * If none of the functions below is called, the default is to detect all the objects
+ * of the machine that the caller is allowed to access.
+ *
+ * This default behavior may also be modified through environment variables
+ * if the application did not modify it already.
+ * Setting HWLOC_XMLFILE in the environment enforces the discovery from an XML
+ * file as if hwloc_topology_set_xml() had been called.
+ * Setting HWLOC_SYNTHETIC enforces a synthetic topology as if
+ * hwloc_topology_set_synthetic() had been called.
+ *
+ * Finally, HWLOC_THISSYSTEM enforces the return value of
+ * hwloc_topology_is_thissystem().
+ *
+ * @{
+ */
+
+/** \brief Change which process the topology is viewed from.
+ *
+ * On some systems, processes may have different views of the machine, for
+ * instance the set of allowed CPUs. By default, hwloc exposes the view from
+ * the current process. Calling hwloc_topology_set_pid() makes it
+ * expose the topology of the machine from the point of view of another
+ * process.
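+ *
+ * For instance (an illustrative sketch; error checks omitted, with \c pid
+ * standing for any process ID the caller is allowed to inspect):
+ * \code
+ * hwloc_topology_t topology;
+ * hwloc_topology_init(&topology);
+ * hwloc_topology_set_pid(topology, pid);
+ * hwloc_topology_load(topology);
+ * \endcode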
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note -1 is returned and errno is set to ENOSYS on platforms that do not
+ * support this feature.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_pid(hwloc_topology_t __hwloc_restrict topology, hwloc_pid_t pid);
+
+/** \brief Enable synthetic topology.
+ *
+ * Gather topology information from the given \p description,
+ * a space-separated string of <type:number> describing
+ * the object type and arity at each level.
+ * All types may be omitted (space-separated string of numbers) so that
+ * hwloc chooses all types according to usual topologies.
+ * See also the \ref synthetic.
+ *
+ * Setting the environment variable HWLOC_SYNTHETIC
+ * may also result in this behavior.
+ *
+ * If \p description was properly parsed and describes a valid topology
+ * configuration, this function returns 0.
+ * Otherwise -1 is returned and errno is set to EINVAL.
+ *
+ * Note that this function does not actually load topology
+ * information; it just tells hwloc where to load it from. You'll
+ * still need to invoke hwloc_topology_load() to actually load the
+ * topology information.
+ *
+ * \note For convenience, this backend provides empty binding hooks which just
+ * return success.
+ *
+ * \note On success, the synthetic component replaces the previously enabled
+ * component (if any), but the topology is not actually modified until
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_synthetic(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict description);
+
+/** \brief Enable XML-file based topology.
+ *
+ * Gather topology information from the XML file given at \p xmlpath.
+ * Setting the environment variable HWLOC_XMLFILE may also result in this behavior.
+ * This file may have been generated earlier with hwloc_topology_export_xml() in hwloc/export.h,
+ * or with lstopo file.xml.
+ *
+ * Note that this function does not actually load topology
+ * information; it just tells hwloc where to load it from. You'll
+ * still need to invoke hwloc_topology_load() to actually load the
+ * topology information.
+ *
+ * \return -1 with errno set to EINVAL on failure to read the XML file.
+ *
+ * \note See also hwloc_topology_set_userdata_import_callback()
+ * for importing application-specific object userdata.
+ *
+ * \note For convenience, this backend provides empty binding hooks which just
+ * return success. To have hwloc still actually call OS-specific hooks, the
+ * ::HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM flag has to be set to assert that the loaded
+ * file is really the underlying system.
+ *
+ * \note On success, the XML component replaces the previously enabled
+ * component (if any), but the topology is not actually modified until
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_xml(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict xmlpath);
+
+/** \brief Enable XML based topology using a memory buffer (instead of
+ * a file, as with hwloc_topology_set_xml()).
+ *
+ * Gather topology information from the XML memory buffer given at \p
+ * buffer and of length \p size. This buffer may have been filled
+ * earlier with hwloc_topology_export_xmlbuffer() in hwloc/export.h.
+ *
+ * Note that this function does not actually load topology
+ * information; it just tells hwloc where to load it from. You'll
+ * still need to invoke hwloc_topology_load() to actually load the
+ * topology information.
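+ *
+ * For instance (an illustrative sketch; error checks omitted), a buffer
+ * previously produced by hwloc_topology_export_xmlbuffer() may be re-imported
+ * into a fresh topology:
+ * \code
+ * char *xmlbuf; int xmllen;
+ * hwloc_topology_export_xmlbuffer(topology, &xmlbuf, &xmllen, 0);
+ * hwloc_topology_t copy;
+ * hwloc_topology_init(&copy);
+ * hwloc_topology_set_xmlbuffer(copy, xmlbuf, xmllen);
+ * hwloc_topology_load(copy);
+ * hwloc_free_xmlbuffer(topology, xmlbuf);
+ * \endcode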
+ *
+ * \return -1 with errno set to EINVAL on failure to read the XML buffer.
+ *
+ * \note See also hwloc_topology_set_userdata_import_callback()
+ * for importing application-specific object userdata.
+ *
+ * \note For convenience, this backend provides empty binding hooks which just
+ * return success. To have hwloc still actually call OS-specific hooks, the
+ * ::HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM flag has to be set to assert that the loaded
+ * file is really the underlying system.
+ *
+ * \note On success, the XML component replaces the previously enabled
+ * component (if any), but the topology is not actually modified until
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_xmlbuffer(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict buffer, int size);
+
+/** \brief Flags to be passed to hwloc_topology_set_components()
+ */
+enum hwloc_topology_components_flag_e {
+  /** \brief Blacklist the target component from being used.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_COMPONENTS_FLAG_BLACKLIST = (1UL<<0)
+};
+
+/** \brief Prevent a discovery component from being used for a topology.
+ *
+ * \p name is the name of the discovery component that should not be used
+ * when loading topology \p topology. The name is a string such as "cuda".
+ *
+ * For components with multiple phases, it may also be suffixed with the name
+ * of a phase, for instance "linux:io".
+ *
+ * \p flags should be ::HWLOC_TOPOLOGY_COMPONENTS_FLAG_BLACKLIST.
+ *
+ * This may be used to avoid expensive parts of the discovery process.
+ * For instance, CUDA-specific discovery may be expensive and unneeded
+ * while generic I/O discovery could still be useful.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_components(hwloc_topology_t __hwloc_restrict topology, unsigned long flags, const char * __hwloc_restrict name);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_configuration Topology Detection Configuration and Query
+ *
+ * Several functions can optionally be called between hwloc_topology_init() and
+ * hwloc_topology_load() to configure how the detection should be performed,
+ * e.g. to ignore some object types, define a synthetic topology, etc.
+ *
+ * @{
+ */
+
+/** \brief Flags to be set onto a topology context before load.
+ *
+ * Flags should be given to hwloc_topology_set_flags().
+ * They may also be returned by hwloc_topology_get_flags().
+ */
+enum hwloc_topology_flags_e {
+  /** \brief Detect the whole system, ignore reservations, include disallowed objects.
+   *
+   * Gather all resources, even if some were disabled by the administrator.
+   * For instance, ignore Linux Cgroup/Cpusets and gather all processors and memory nodes.
+   *
+   * When this flag is not set, PUs and NUMA nodes that are disallowed are not added to the topology.
+   * Parent objects (package, core, cache, etc.) are added only if some of their children are allowed.
+   * All existing PUs and NUMA nodes in the topology are allowed.
+   * hwloc_topology_get_allowed_cpuset() and hwloc_topology_get_allowed_nodeset()
+   * are equal to the root object cpuset and nodeset.
+   *
+   * When this flag is set, the actual sets of allowed PUs and NUMA nodes are given
+   * by hwloc_topology_get_allowed_cpuset() and hwloc_topology_get_allowed_nodeset().
+   * They may be smaller than the root object cpuset and nodeset.
+   *
+   * If the current topology is exported to XML and reimported later, this flag
+   * should be set again in the reimported topology so that disallowed resources
+   * are reimported as well.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED = (1UL<<0),
+
+  /** \brief Assume that the selected backend provides the topology for the
+   * system on which we are running.
+   *
+   * This forces hwloc_topology_is_thissystem() to return 1, i.e. makes hwloc assume that
+   * the selected backend provides the topology for the system on which we are running,
+   * even if it is not the OS-specific backend but the XML backend for instance.
+   * This means making the binding functions actually call the OS-specific
+   * system calls and really do binding, while the XML backend would otherwise
+   * provide empty hooks just returning success.
+   *
+   * Setting the environment variable HWLOC_THISSYSTEM may also result in the
+   * same behavior.
+   *
+   * This can be used for efficiency reasons to first detect the topology once,
+   * save it to an XML file, and quickly reload it later through the XML
+   * backend, while still having binding functions actually bind.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM = (1UL<<1),
+
+  /** \brief Get the set of allowed resources from the local operating system even if the topology was loaded from XML or synthetic description.
+   *
+   * If the topology was loaded from XML or from a synthetic string,
+   * restrict it by applying the current process restrictions such as
+   * Linux Cgroup/Cpuset.
+   *
+   * This is useful when the topology is not loaded directly from
+   * the local machine (e.g. for performance reasons) and it comes
+   * with all resources, while the running process is restricted
+   * to only parts of the machine.
+   *
+   * This flag is ignored unless ::HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM is
+   * also set since the loaded topology must match the underlying machine
+   * where restrictions will be gathered from.
+   *
+   * Setting the environment variable HWLOC_THISSYSTEM_ALLOWED_RESOURCES
+   * would result in the same behavior.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_THISSYSTEM_ALLOWED_RESOURCES = (1UL<<2),
+
+  /** \brief Import support from the imported topology.
+   *
+   * When importing an XML topology from a remote machine, binding is
+   * disabled by default (see ::HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM).
+   * This disabling is also marked by putting zeroes in the corresponding
+   * supported feature bits reported by hwloc_topology_get_support().
+   *
+   * The flag ::HWLOC_TOPOLOGY_FLAG_IMPORT_SUPPORT actually imports
+   * support bits from the remote machine. It also sets the flag
+   * \p imported_support in the struct hwloc_topology_misc_support array.
+   * If the imported XML did not contain any support information
+   * (exporter hwloc is too old), this flag is not set.
+   *
+   * Note that these supported features are only relevant for the hwloc
+   * installation that actually exported the XML topology
+   * (it may vary with the operating system, or with how hwloc was compiled).
+   *
+   * Note that setting this flag however does not enable binding for the
+   * locally imported hwloc topology; it only reports what the remote
+   * hwloc and machine support.
+   *
+   */
+  HWLOC_TOPOLOGY_FLAG_IMPORT_SUPPORT = (1UL<<3)
+};
+
+/** \brief Set OR'ed flags on a not-yet-loaded topology.
+ *
+ * Set an OR'ed set of ::hwloc_topology_flags_e onto a topology that was not yet loaded.
+ *
+ * If this function is called multiple times, the last invocation will erase
+ * and replace the set of flags that was previously set.
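+ *
+ * For instance (a minimal sketch):
+ * \code
+ * hwloc_topology_t topology;
+ * hwloc_topology_init(&topology);
+ * hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED
+ *                                    | HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM);
+ * hwloc_topology_load(topology);
+ * \endcode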
+ *
+ * The flags set in a topology may be retrieved with hwloc_topology_get_flags()
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_flags (hwloc_topology_t topology, unsigned long flags);
+
+/** \brief Get OR'ed flags of a topology.
+ *
+ * Get the OR'ed set of ::hwloc_topology_flags_e of a topology.
+ *
+ * \return the flags previously set with hwloc_topology_set_flags().
+ */
+HWLOC_DECLSPEC unsigned long hwloc_topology_get_flags (hwloc_topology_t topology);
+
+/** \brief Does the topology context come from this system?
+ *
+ * \return 1 if this topology context was built using the system
+ * running this program.
+ * \return 0 instead (for instance if using another file-system root,
+ * an XML topology file, or a synthetic topology).
+ */
+HWLOC_DECLSPEC int hwloc_topology_is_thissystem(hwloc_topology_t __hwloc_restrict topology) __hwloc_attribute_pure;
+
+/** \brief Flags describing actual discovery support for this topology. */
+struct hwloc_topology_discovery_support {
+  /** \brief Detecting the number of PU objects is supported. */
+  unsigned char pu;
+  /** \brief Detecting the number of NUMA nodes is supported. */
+  unsigned char numa;
+  /** \brief Detecting the amount of memory in NUMA nodes is supported. */
+  unsigned char numa_memory;
+  /** \brief Detecting and identifying PU objects that are not available to the current process is supported. */
+  unsigned char disallowed_pu;
+  /** \brief Detecting and identifying NUMA nodes that are not available to the current process is supported. */
+  unsigned char disallowed_numa;
+  /** \brief Detecting the efficiency of CPU kinds is supported, see \ref hwlocality_cpukinds. */
+  unsigned char cpukind_efficiency;
+};
+
+/** \brief Flags describing actual PU binding support for this topology.
+ *
+ * A flag may be set even if the feature isn't supported in all cases
+ * (e.g. binding to random sets of non-contiguous objects).
+ */
+struct hwloc_topology_cpubind_support {
+  /** Binding the whole current process is supported. */
+  unsigned char set_thisproc_cpubind;
+  /** Getting the binding of the whole current process is supported. */
+  unsigned char get_thisproc_cpubind;
+  /** Binding a whole given process is supported. */
+  unsigned char set_proc_cpubind;
+  /** Getting the binding of a whole given process is supported. */
+  unsigned char get_proc_cpubind;
+  /** Binding the current thread only is supported. */
+  unsigned char set_thisthread_cpubind;
+  /** Getting the binding of the current thread only is supported. */
+  unsigned char get_thisthread_cpubind;
+  /** Binding a given thread only is supported. */
+  unsigned char set_thread_cpubind;
+  /** Getting the binding of a given thread only is supported. */
+  unsigned char get_thread_cpubind;
+  /** Getting the last processors where the whole current process ran is supported */
+  unsigned char get_thisproc_last_cpu_location;
+  /** Getting the last processors where a whole process ran is supported */
+  unsigned char get_proc_last_cpu_location;
+  /** Getting the last processors where the current thread ran is supported */
+  unsigned char get_thisthread_last_cpu_location;
+};
+
+/** \brief Flags describing actual memory binding support for this topology.
+ *
+ * A flag may be set even if the feature isn't supported in all cases
+ * (e.g. binding to random sets of non-contiguous objects).
+ */
+struct hwloc_topology_membind_support {
+  /** Binding the whole current process is supported. */
+  unsigned char set_thisproc_membind;
+  /** Getting the binding of the whole current process is supported. */
+  unsigned char get_thisproc_membind;
+  /** Binding a whole given process is supported. */
+  unsigned char set_proc_membind;
+  /** Getting the binding of a whole given process is supported. */
+  unsigned char get_proc_membind;
+  /** Binding the current thread only is supported. */
+  unsigned char set_thisthread_membind;
+  /** Getting the binding of the current thread only is supported. */
+  unsigned char get_thisthread_membind;
+  /** Binding a given memory area is supported. */
+  unsigned char set_area_membind;
+  /** Getting the binding of a given memory area is supported. */
+  unsigned char get_area_membind;
+  /** Allocating a bound memory area is supported. */
+  unsigned char alloc_membind;
+  /** First-touch policy is supported. */
+  unsigned char firsttouch_membind;
+  /** Bind policy is supported. */
+  unsigned char bind_membind;
+  /** Interleave policy is supported. */
+  unsigned char interleave_membind;
+  /** Next-touch migration policy is supported. */
+  unsigned char nexttouch_membind;
+  /** Migration flags are supported. */
+  unsigned char migrate_membind;
+  /** Getting the last NUMA nodes where a memory area was allocated is supported */
+  unsigned char get_area_memlocation;
+};
+
+/** \brief Flags describing miscellaneous features.
+ */
+struct hwloc_topology_misc_support {
+  /** Support was imported when importing another topology, see ::HWLOC_TOPOLOGY_FLAG_IMPORT_SUPPORT. */
+  unsigned char imported_support;
+};
+
+/** \brief Set of flags describing actual support for this topology.
+ *
+ * This is retrieved with hwloc_topology_get_support() and will be valid until
+ * the topology object is destroyed. Note: the values are correct only after
+ * discovery.
+ */
+struct hwloc_topology_support {
+  struct hwloc_topology_discovery_support *discovery;
+  struct hwloc_topology_cpubind_support *cpubind;
+  struct hwloc_topology_membind_support *membind;
+  struct hwloc_topology_misc_support *misc;
+};
+
+/** \brief Retrieve the topology support.
+ *
+ * Each flag indicates whether a feature is supported.
+ * If set to 0, the feature is not supported.
+ * If set to 1, the feature is supported, but the corresponding
+ * call may still fail in some corner cases.
+ *
+ * These features are also listed by hwloc-info \--support
+ *
+ * The reported features are what the current topology supports
+ * on the current machine. If the topology was exported to XML
+ * from another machine and later imported here, support still
+ * describes what is supported for this imported topology after
+ * import. By default, binding will be reported as unsupported
+ * in this case (see ::HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM).
+ *
+ * Topology flag ::HWLOC_TOPOLOGY_FLAG_IMPORT_SUPPORT may be used
+ * to report the supported features of the original remote machine
+ * instead. If it was successfully imported, \p imported_support
+ * will be set in the struct hwloc_topology_misc_support array.
+ */
+HWLOC_DECLSPEC const struct hwloc_topology_support *hwloc_topology_get_support(hwloc_topology_t __hwloc_restrict topology);
+
+/** \brief Type filtering flags.
+ *
+ * By default, most objects are kept (::HWLOC_TYPE_FILTER_KEEP_ALL).
+ * Instruction caches, I/O and Misc objects are ignored by default (::HWLOC_TYPE_FILTER_KEEP_NONE).
+ * Die and Group levels are ignored unless they bring structure (::HWLOC_TYPE_FILTER_KEEP_STRUCTURE).
+ *
+ * Note that group objects are also ignored individually (without the entire level)
+ * when they do not bring structure.
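+ *
+ * For instance (an illustrative sketch), keeping the important I/O objects
+ * when loading, using hwloc_topology_set_io_types_filter() declared below:
+ * \code
+ * hwloc_topology_t topology;
+ * hwloc_topology_init(&topology);
+ * hwloc_topology_set_io_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_IMPORTANT);
+ * hwloc_topology_load(topology);
+ * \endcode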
+ */
+enum hwloc_type_filter_e {
+  /** \brief Keep all objects of this type.
+   *
+   * Cannot be set for ::HWLOC_OBJ_GROUP (groups are designed only to add more structure to the topology).
+   * \hideinitializer
+   */
+  HWLOC_TYPE_FILTER_KEEP_ALL = 0,
+
+  /** \brief Ignore all objects of this type.
+   *
+   * The bottom-level type ::HWLOC_OBJ_PU, the ::HWLOC_OBJ_NUMANODE type, and
+   * the top-level type ::HWLOC_OBJ_MACHINE may not be ignored.
+   * \hideinitializer
+   */
+  HWLOC_TYPE_FILTER_KEEP_NONE = 1,
+
+  /** \brief Only ignore objects if their entire level does not bring any structure.
+   *
+   * Keep the entire level of objects if at least one of these objects adds
+   * structure to the topology. An object brings structure when it has multiple
+   * children and it is not the only child of its parent.
+   *
+   * If all objects in the level are the only child of their parent, and if none
+   * of them has multiple children, the entire level is removed.
+   *
+   * Cannot be set for I/O and Misc objects since the topology structure does not matter there.
+   * \hideinitializer
+   */
+  HWLOC_TYPE_FILTER_KEEP_STRUCTURE = 2,
+
+  /** \brief Only keep likely-important objects of the given type.
+   *
+   * It is only useful for I/O object types.
+   * For ::HWLOC_OBJ_PCI_DEVICE and ::HWLOC_OBJ_OS_DEVICE, it means that only objects
+   * of major/common kinds are kept (storage, network, OpenFabrics, CUDA,
+   * OpenCL, RSMI, NVML, and displays).
+   * Also, only OS devices directly attached on PCI (e.g. no USB) are reported.
+   * For ::HWLOC_OBJ_BRIDGE, it means that bridges are kept only if they have children.
+   *
+   * This flag is equivalent to ::HWLOC_TYPE_FILTER_KEEP_ALL for Normal, Memory and Misc types
+   * since they are likely important.
+   * \hideinitializer
+   */
+  HWLOC_TYPE_FILTER_KEEP_IMPORTANT = 3
+};
+
+/** \brief Set the filtering for the given object type.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_type_filter(hwloc_topology_t topology, hwloc_obj_type_t type, enum hwloc_type_filter_e filter);
+
+/** \brief Get the current filtering for the given object type.
+ */
+HWLOC_DECLSPEC int hwloc_topology_get_type_filter(hwloc_topology_t topology, hwloc_obj_type_t type, enum hwloc_type_filter_e *filter);
+
+/** \brief Set the filtering for all object types.
+ *
+ * If some types do not support this filtering, they are silently ignored.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_all_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter);
+
+/** \brief Set the filtering for all CPU cache object types.
+ *
+ * Memory-side caches are not involved since they are not CPU caches.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_cache_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter);
+
+/** \brief Set the filtering for all CPU instruction cache object types.
+ *
+ * Memory-side caches are not involved since they are not CPU caches.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_icache_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter);
+
+/** \brief Set the filtering for all I/O object types.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_io_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter);
+
+/** \brief Set the topology-specific userdata pointer.
+ *
+ * Each topology may store one application-given private data pointer.
+ * It is initialized to \c NULL.
+ * hwloc will never modify it.
+ *
+ * Use it as you wish, after hwloc_topology_init() and until hwloc_topology_destroy().
+ *
+ * This pointer is not exported to XML.
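+ *
+ * For instance (a minimal sketch, where \c app_data is any
+ * application-defined pointer):
+ * \code
+ * hwloc_topology_set_userdata(topology, app_data);
+ * void *data = hwloc_topology_get_userdata(topology);
+ * \endcode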
+ */
+HWLOC_DECLSPEC void hwloc_topology_set_userdata(hwloc_topology_t topology, const void *userdata);
+
+/** \brief Retrieve the topology-specific userdata pointer.
+ *
+ * Retrieve the application-given private data pointer that was
+ * previously set with hwloc_topology_set_userdata().
+ */
+HWLOC_DECLSPEC void * hwloc_topology_get_userdata(hwloc_topology_t topology);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_tinker Modifying a loaded Topology
+ * @{
+ */
+
+/** \brief Flags to be given to hwloc_topology_restrict(). */
+enum hwloc_restrict_flags_e {
+  /** \brief Remove all objects that became CPU-less.
+   * By default, only objects that contain no PU and no memory are removed.
+   * This flag may not be used with ::HWLOC_RESTRICT_FLAG_BYNODESET.
+   * \hideinitializer
+   */
+  HWLOC_RESTRICT_FLAG_REMOVE_CPULESS = (1UL<<0),
+
+  /** \brief Restrict by nodeset instead of CPU set.
+   * Only keep objects whose nodeset is included or partially included in the given set.
+   * This flag may not be used with ::HWLOC_RESTRICT_FLAG_REMOVE_CPULESS.
+   * \hideinitializer
+   */
+  HWLOC_RESTRICT_FLAG_BYNODESET = (1UL<<3),
+
+  /** \brief Remove all objects that became Memory-less.
+   * By default, only objects that contain no PU and no memory are removed.
+   * This flag may only be used with ::HWLOC_RESTRICT_FLAG_BYNODESET.
+   * \hideinitializer
+   */
+  HWLOC_RESTRICT_FLAG_REMOVE_MEMLESS = (1UL<<4),
+
+  /** \brief Move Misc objects to ancestors if their parents are removed during restriction.
+   * If this flag is not set, Misc objects are removed when their parents are removed.
+   * \hideinitializer
+   */
+  HWLOC_RESTRICT_FLAG_ADAPT_MISC = (1UL<<1),
+
+  /** \brief Move I/O objects to ancestors if their parents are removed during restriction.
+   * If this flag is not set, I/O devices and bridges are removed when their parents are removed.
+   * \hideinitializer
+   */
+  HWLOC_RESTRICT_FLAG_ADAPT_IO = (1UL<<2)
+};
+
+/** \brief Restrict the topology to the given CPU set or nodeset.
+ *
+ * Topology \p topology is modified so as to remove all objects that
+ * are not included (or partially included) in the CPU set \p set.
+ * All objects' CPU and node sets are restricted accordingly.
+ *
+ * If ::HWLOC_RESTRICT_FLAG_BYNODESET is passed in \p flags,
+ * \p set is considered a nodeset instead of a CPU set.
+ *
+ * \p flags is an OR'ed set of ::hwloc_restrict_flags_e.
+ *
+ * \note This call may not be reverted by restricting back to a larger
+ * set. Once dropped during restriction, objects may not be brought
+ * back, except by loading another topology with hwloc_topology_load().
+ *
+ * \return 0 on success.
+ *
+ * \return -1 with errno set to EINVAL if the input set is invalid.
+ * The topology is not modified in this case.
+ *
+ * \return -1 with errno set to ENOMEM on failure to allocate internal data.
+ * The topology is reinitialized in this case. It should be either
+ * destroyed with hwloc_topology_destroy() or configured and loaded again.
+ */
+HWLOC_DECLSPEC int hwloc_topology_restrict(hwloc_topology_t __hwloc_restrict topology, hwloc_const_bitmap_t set, unsigned long flags);
+
+/** \brief Flags to be given to hwloc_topology_allow(). */
+enum hwloc_allow_flags_e {
+  /** \brief Mark all objects as allowed in the topology.
+   *
+   * \p cpuset and \p nodeset given to hwloc_topology_allow() must be \c NULL.
+   * \hideinitializer */
+  HWLOC_ALLOW_FLAG_ALL = (1UL<<0),
+
+  /** \brief Only allow objects that are available to the current process.
+   *
+   * The topology must have ::HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM so that the set
+   * of available resources can actually be retrieved from the operating system.
+   *
+   * \p cpuset and \p nodeset given to hwloc_topology_allow() must be \c NULL.
+   * \hideinitializer */
+  HWLOC_ALLOW_FLAG_LOCAL_RESTRICTIONS = (1UL<<1),
+
+  /** \brief Allow a custom set of objects, given to hwloc_topology_allow() as \p cpuset and/or \p nodeset parameters.
+   * \hideinitializer */
+  HWLOC_ALLOW_FLAG_CUSTOM = (1UL<<2)
+};
+
+/** \brief Change the sets of allowed PUs and NUMA nodes in the topology.
+ *
+ * This function only works if the ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED flag
+ * was set on the topology. It does not modify any object; it only changes
+ * the sets returned by hwloc_topology_get_allowed_cpuset() and
+ * hwloc_topology_get_allowed_nodeset().
+ *
+ * It is notably useful when importing a topology from another process
+ * running in a different Linux Cgroup.
+ *
+ * \p flags must be set to one flag among ::hwloc_allow_flags_e.
+ *
+ * \note Removing objects from a topology should rather be performed with
+ * hwloc_topology_restrict().
+ */
+HWLOC_DECLSPEC int hwloc_topology_allow(hwloc_topology_t __hwloc_restrict topology, hwloc_const_cpuset_t cpuset, hwloc_const_nodeset_t nodeset, unsigned long flags);
+
+/** \brief Add a MISC object as a leaf of the topology
+ *
+ * A new MISC object will be created and inserted into the topology at the
+ * position given by \p parent. It is appended to the list of existing Misc children,
+ * without ever adding any intermediate hierarchy level. This is useful for
+ * annotating the topology without actually changing the hierarchy.
+ *
+ * \p name is supposed to be unique across all Misc objects in the topology.
+ * It will be duplicated to setup the new object attributes.
+ *
+ * The new leaf object will not have any \p cpuset.
+ *
+ * \return The newly-created object.
+ *
+ * \return \c NULL on error.
+ *
+ * \return \c NULL if Misc objects are filtered-out of the topology (::HWLOC_TYPE_FILTER_KEEP_NONE).
+ *
+ * \note If \p name contains some non-printable characters, they will
+ * be dropped when exporting to XML, see hwloc_topology_export_xml() in hwloc/export.h.
+ */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_insert_misc_object(hwloc_topology_t topology, hwloc_obj_t parent, const char *name);
+
+/** \brief Allocate a Group object to insert later with hwloc_topology_insert_group_object().
+ *
+ * This function returns a new Group object.
+ *
+ * The caller should (at least) initialize its sets before inserting
+ * the object in the topology. See hwloc_topology_insert_group_object().
+ */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_alloc_group_object(hwloc_topology_t topology);
+
+/** \brief Add more structure to the topology by adding an intermediate Group
+ *
+ * The caller should first allocate a new Group object with hwloc_topology_alloc_group_object().
+ * Then it must setup at least one of its CPU or node sets to specify
+ * the final location of the Group in the topology.
+ * Then the object can be passed to this function for actual insertion in the topology.
+ *
+ * Either the cpuset or nodeset field (or both, if compatible) must be set
+ * to a non-empty bitmap. The complete_cpuset or complete_nodeset may be set
+ * instead if inserting with respect to the complete topology
+ * (including disallowed, offline or unknown objects).
+ * If grouping several objects, hwloc_obj_add_other_obj_sets() is an easy way
+ * to build the Group sets iteratively.
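+ *
+ * For instance, a short sketch (\c obj1 and \c obj2 are assumed to be
+ * objects of this topology):
+ * \code
+ * hwloc_obj_t group = hwloc_topology_alloc_group_object(topology);
+ * hwloc_obj_add_other_obj_sets(group, obj1);
+ * hwloc_obj_add_other_obj_sets(group, obj2);
+ * // may return an existing object, or NULL on failure
+ * group = hwloc_topology_insert_group_object(topology, group);
+ * \endcode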
+ * These sets cannot be larger than the current topology, or they would get
+ * restricted silently.
+ * The core will setup the other sets after actual insertion.
+ *
+ * The \p subtype object attribute may be defined (to a dynamically
+ * allocated string) to display something else than "Group" as the
+ * type name for this object in lstopo.
+ * Custom name/value info pairs may be added with hwloc_obj_add_info() after
+ * insertion.
+ *
+ * The group \p dont_merge attribute may be set to \c 1 to prevent
+ * the hwloc core from ever merging this object with another
+ * hierarchically-identical object.
+ * This is useful when the Group itself describes an important feature
+ * that cannot be exposed anywhere else in the hierarchy.
+ *
+ * The group \p kind attribute may be set to a high value such
+ * as \c 0xffffffff to tell hwloc that this new Group should always
+ * be discarded in favor of any existing Group with the same locality.
+ *
+ * \return The inserted object if it was properly inserted.
+ *
+ * \return An existing object if the Group was merged or discarded
+ * because the topology already contained an object at the same
+ * location (the Group did not add any hierarchy information).
+ *
+ * \return \c NULL if the insertion failed because of conflicting sets in the topology tree.
+ *
+ * \return \c NULL if Group objects are filtered-out of the topology (::HWLOC_TYPE_FILTER_KEEP_NONE).
+ *
+ * \return \c NULL if the object was discarded because no set was
+ * initialized in the Group before insert, or all of them were empty.
+ */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_insert_group_object(hwloc_topology_t topology, hwloc_obj_t group);
+
+/** \brief Setup object cpusets/nodesets by OR'ing another object's sets.
+ *
+ * For each defined cpuset or nodeset in \p src, allocate the corresponding set
+ * in \p dst and add \p src to it by OR'ing sets.
+ *
+ * This function is convenient between hwloc_topology_alloc_group_object()
+ * and hwloc_topology_insert_group_object(). It builds the sets of the new Group
+ * that will be inserted as a new intermediate parent of several objects.
+ */
+HWLOC_DECLSPEC int hwloc_obj_add_other_obj_sets(hwloc_obj_t dst, hwloc_obj_t src);
+
+/** \brief Refresh internal structures after topology modification.
+ *
+ * Modifying the topology (by restricting, adding objects, modifying structures
+ * such as distances or memory attributes, etc.) may cause some internal caches
+ * to become invalid. These caches are automatically refreshed when accessed,
+ * but this refreshing is not thread-safe.
+ *
+ * This function is not thread-safe either, but it is a good way to end a
+ * non-thread-safe phase of topology modification. Once this refresh is done,
+ * multiple threads may concurrently consult the topology, objects, distances,
+ * attributes, etc.
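+ *
+ * For instance, a sketch of ending a modification phase (assuming \p set
+ * is a valid cpuset; error checks omitted):
+ * \code
+ * hwloc_topology_restrict(topology, set, 0);
+ * hwloc_topology_refresh(topology);
+ * // the topology may now be consulted concurrently
+ * \endcode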
+ *
+ * See also \ref threadsafety
+ */
+HWLOC_DECLSPEC int hwloc_topology_refresh(hwloc_topology_t topology);
+
+/** @} */
+
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+/* high-level helpers */
+#include "hwloc/helper.h"
+
+/* inline code of some functions above */
+#include "hwloc/inlines.h"
+
+/* memory attributes */
+#include "hwloc/memattrs.h"
+
+/* kinds of CPU cores */
+#include "hwloc/cpukinds.h"
+
+/* exporting to XML or synthetic */
+#include "hwloc/export.h"
+
+/* distances */
+#include "hwloc/distances.h"
+
+/* topology diffs */
+#include "hwloc/diff.h"
+
+/* deprecated headers */
+#include "hwloc/deprecated.h"
+
+#endif /* HWLOC_H */
diff --git a/deps/hwloc/include/hwloc/autogen/config.h b/deps/hwloc/include/hwloc/autogen/config.h
new file mode 100644
index 000000000..951fec8c2
--- /dev/null
+++ b/deps/hwloc/include/hwloc/autogen/config.h
@@ -0,0 +1,233 @@
+/* include/hwloc/autogen/config.h. Generated from config.h.in by configure. */
+/* -*- c -*-
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2020 Inria. All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* The configuration file */
+
+#ifndef HWLOC_CONFIG_H
+#define HWLOC_CONFIG_H
+
+#define HWLOC_VERSION "2.5.0a1-git"
+#define HWLOC_VERSION_MAJOR 2
+#define HWLOC_VERSION_MINOR 5
+#define HWLOC_VERSION_RELEASE 0
+#define HWLOC_VERSION_GREEK "a1"
+
+#if (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95))
+# define __hwloc_restrict __restrict
+#else
+# if __STDC_VERSION__ >= 199901L
+#  define __hwloc_restrict restrict
+# else
+#  define __hwloc_restrict
+# endif
+#endif
+
+/* Note that if we're compiling C++, then just use the "inline"
+   keyword, since it's part of C++ */
+#if defined(c_plusplus) || defined(__cplusplus)
+# define __hwloc_inline inline
+#elif defined(_MSC_VER) || defined(__HP_cc)
+# define __hwloc_inline __inline
+#else
+# define __hwloc_inline __inline__
+#endif
+
+/*
+ * Note: this is public. We cannot assume anything from the compiler used
+ * by the application and thus the HWLOC_HAVE_* macros below are not
+ * fetched from the autoconf result here. We only automatically use a few
+ * well-known easy cases.
+ */
+
+/* Some handy constants to make the logic below a little more readable */
+#if defined(__cplusplus) && \
+    (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+#define GXX_ABOVE_3_4 1
+#else
+#define GXX_ABOVE_3_4 0
+#endif
+
+#if !defined(__cplusplus) && \
+    (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95))
+#define GCC_ABOVE_2_95 1
+#else
+#define GCC_ABOVE_2_95 0
+#endif
+
+#if !defined(__cplusplus) && \
+    (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96))
+#define GCC_ABOVE_2_96 1
+#else
+#define GCC_ABOVE_2_96 0
+#endif
+
+#if !defined(__cplusplus) && \
+    (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3))
+#define GCC_ABOVE_3_3 1
+#else
+#define GCC_ABOVE_3_3 0
+#endif
+
+#if !defined(__cplusplus) && \
+    (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+#define GCC_ABOVE_3_4 1
+#else
+#define GCC_ABOVE_3_4 0
+#endif
+
+/* Maybe before gcc 2.95 too */
+#ifdef HWLOC_HAVE_ATTRIBUTE_UNUSED
+#define __HWLOC_HAVE_ATTRIBUTE_UNUSED HWLOC_HAVE_ATTRIBUTE_UNUSED
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_UNUSED (GXX_ABOVE_3_4 || GCC_ABOVE_2_95)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_UNUSED 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_UNUSED
+# define __hwloc_attribute_unused __attribute__((__unused__))
+#else
+# define __hwloc_attribute_unused
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_MALLOC
+#define __HWLOC_HAVE_ATTRIBUTE_MALLOC HWLOC_HAVE_ATTRIBUTE_MALLOC
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_MALLOC (GXX_ABOVE_3_4 || GCC_ABOVE_2_96)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_MALLOC 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_MALLOC
+# define __hwloc_attribute_malloc __attribute__((__malloc__))
+#else
+# define __hwloc_attribute_malloc
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_CONST
+#define __HWLOC_HAVE_ATTRIBUTE_CONST HWLOC_HAVE_ATTRIBUTE_CONST
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_CONST (GXX_ABOVE_3_4 || GCC_ABOVE_2_95)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_CONST 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_CONST
+# define __hwloc_attribute_const __attribute__((__const__))
+#else
+# define __hwloc_attribute_const
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_PURE
+#define __HWLOC_HAVE_ATTRIBUTE_PURE HWLOC_HAVE_ATTRIBUTE_PURE
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_PURE (GXX_ABOVE_3_4 || GCC_ABOVE_2_96)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_PURE 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_PURE
+# define __hwloc_attribute_pure __attribute__((__pure__))
+#else
+# define __hwloc_attribute_pure
+#endif
+
+#ifndef __hwloc_attribute_deprecated /* allow the user to disable these warnings by defining this macro to nothing */
+#ifdef HWLOC_HAVE_ATTRIBUTE_DEPRECATED
+#define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED HWLOC_HAVE_ATTRIBUTE_DEPRECATED
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED (GXX_ABOVE_3_4 || GCC_ABOVE_3_3)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_DEPRECATED
+# define __hwloc_attribute_deprecated __attribute__((__deprecated__))
+#else
+# define __hwloc_attribute_deprecated
+#endif
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+#define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS (GXX_ABOVE_3_4 || GCC_ABOVE_3_3)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+# define __hwloc_attribute_may_alias __attribute__((__may_alias__))
+#else
+# define __hwloc_attribute_may_alias
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT
+#define __HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT (GXX_ABOVE_3_4 || GCC_ABOVE_3_4)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT
+# define __hwloc_attribute_warn_unused_result __attribute__((__warn_unused_result__))
+#else
+# define __hwloc_attribute_warn_unused_result
+#endif
+
+#ifdef HWLOC_C_HAVE_VISIBILITY
+# if HWLOC_C_HAVE_VISIBILITY
+#  define HWLOC_DECLSPEC __attribute__((__visibility__("default")))
+# else
+#  define HWLOC_DECLSPEC
+# endif
+#else
+# define HWLOC_DECLSPEC
+#endif
+
+/* Defined to 1 on Linux */
+#define HWLOC_LINUX_SYS 1
+
+/* Defined to 1 if the CPU_SET macro works */
+#define HWLOC_HAVE_CPU_SET 1
+
+/* Defined to 1 if you have the `windows.h' header. */
+/* #undef HWLOC_HAVE_WINDOWS_H */
+#define hwloc_pid_t pid_t
+#define hwloc_thread_t pthread_t
+
+#ifdef HWLOC_HAVE_WINDOWS_H
+
+#  include <windows.h>
+typedef DWORDLONG hwloc_uint64_t;
+
+#else /* HWLOC_HAVE_WINDOWS_H */
+
+#  ifdef hwloc_thread_t
+#    include <pthread.h>
+#  endif /* hwloc_thread_t */
+
+/* Defined to 1 if you have the <stdint.h> header file. */
+#  define HWLOC_HAVE_STDINT_H 1
+
+#  include <unistd.h>
+#  ifdef HWLOC_HAVE_STDINT_H
+#    include <stdint.h>
+#  endif
+typedef uint64_t hwloc_uint64_t;
+
+#endif /* HWLOC_HAVE_WINDOWS_H */
+
+/* Define to 1 if --enable-32bits-pci-domain is called. */
+/* #undef HWLOC_HAVE_32BITS_PCI_DOMAIN */
+
+/* Whether we need to re-define all the hwloc public symbols or not */
+#define HWLOC_SYM_TRANSFORM 0
+
+/* The hwloc symbol prefix */
+#define HWLOC_SYM_PREFIX hwloc_
+
+/* The hwloc symbol prefix in all caps */
+#define HWLOC_SYM_PREFIX_CAPS HWLOC_
+
+#endif /* HWLOC_CONFIG_H */
diff --git a/deps/hwloc/include/hwloc/bitmap.h b/deps/hwloc/include/hwloc/bitmap.h
new file mode 100644
index 000000000..8d9bb9c88
--- /dev/null
+++ b/deps/hwloc/include/hwloc/bitmap.h
@@ -0,0 +1,494 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2020 Inria. All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief The bitmap API, for use in hwloc itself.
+ */
+
+#ifndef HWLOC_BITMAP_H
+#define HWLOC_BITMAP_H
+
+#include "hwloc/autogen/config.h"
+
+#include <assert.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_bitmap The bitmap API
+ *
+ * The ::hwloc_bitmap_t type represents a set of integers (positive or null).
+ * A bitmap may be of infinite size (all bits are set after some point).
+ * A bitmap may even be full if all bits are set.
+ *
+ * Bitmaps are used by hwloc for sets of OS processors
+ * (which may actually be hardware threads) as by ::hwloc_cpuset_t
+ * (a typedef for ::hwloc_bitmap_t), or sets of NUMA memory nodes
+ * as ::hwloc_nodeset_t (also a typedef for ::hwloc_bitmap_t).
+ * Those are used for cpuset and nodeset fields in the ::hwloc_obj structure,
+ * see \ref hwlocality_object_sets.
+ *
+ * <em>Both CPU and node sets are always indexed by OS physical number.</em>
+ * However users should usually not build CPU and node sets manually
+ * (e.g. with hwloc_bitmap_set()).
+ * One should rather use existing object sets and combine them with
+ * hwloc_bitmap_or(), etc.
+ * For instance, binding the current thread on a pair of cores may be
+ * performed with:
+ * \code
+ * hwloc_obj_t core1 = ... , core2 = ... ;
+ * hwloc_bitmap_t set = hwloc_bitmap_alloc();
+ * hwloc_bitmap_or(set, core1->cpuset, core2->cpuset);
+ * hwloc_set_cpubind(topology, set, HWLOC_CPUBIND_THREAD);
+ * hwloc_bitmap_free(set);
+ * \endcode
+ *
+ * \note Most functions below return an int that may be negative in case of
+ * error. The usual error case would be an internal failure to realloc/extend
+ * the storage of the bitmap (\p errno would be set to \c ENOMEM).
+ *
+ * \note Several examples of using the bitmap API are available under the
+ * doc/examples/ directory in the source tree.
+ * Regression tests such as tests/hwloc/hwloc_bitmap*.c also make intensive use
+ * of this API.
+ * @{
+ */
+
+
+/** \brief
+ * Set of bits represented as an opaque pointer to an internal bitmap.
+ */
+typedef struct hwloc_bitmap_s * hwloc_bitmap_t;
+/** \brief a non-modifiable ::hwloc_bitmap_t */
+typedef const struct hwloc_bitmap_s * hwloc_const_bitmap_t;
+
+
+/*
+ * Bitmap allocation, freeing and copying.
+ */
+
+/** \brief Allocate a new empty bitmap.
+ *
+ * \returns A valid bitmap or \c NULL.
+ *
+ * The bitmap should be freed by a corresponding call to
+ * hwloc_bitmap_free().
+ */
+HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_alloc(void) __hwloc_attribute_malloc;
+
+/** \brief Allocate a new full bitmap. */
+HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_alloc_full(void) __hwloc_attribute_malloc;
+
+/** \brief Free bitmap \p bitmap.
+ *
+ * If \p bitmap is \c NULL, no operation is performed.
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_free(hwloc_bitmap_t bitmap);
+
+/** \brief Duplicate bitmap \p bitmap by allocating a new bitmap and copying \p bitmap contents.
+ *
+ * If \p bitmap is \c NULL, \c NULL is returned.
+ */
+HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_dup(hwloc_const_bitmap_t bitmap) __hwloc_attribute_malloc;
+
+/** \brief Copy the contents of bitmap \p src into the already allocated bitmap \p dst */
+HWLOC_DECLSPEC int hwloc_bitmap_copy(hwloc_bitmap_t dst, hwloc_const_bitmap_t src);
+
+
+/*
+ * Bitmap/String Conversion
+ */
+
+/** \brief Stringify a bitmap.
+ *
+ * Up to \p buflen characters may be written in buffer \p buf.
+ *
+ * If \p buflen is 0, \p buf may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
+
+/** \brief Stringify a bitmap into a newly allocated string.
+ *
+ * \return -1 on error.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
+
+/** \brief Parse a bitmap string and store it in bitmap \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string);
+
+/** \brief Stringify a bitmap in the list format.
+ *
+ * Lists are comma-separated indexes or ranges.
+ * Ranges are dash-separated indexes.
+ * The last range may not have an ending index if the bitmap is infinitely set.
+ *
+ * Up to \p buflen characters may be written in buffer \p buf.
+ *
+ * If \p buflen is 0, \p buf may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
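+ *
+ * For instance, a sketch of sizing the output buffer first (assuming \p set
+ * is a valid bitmap; error checks omitted, malloc() from <stdlib.h>):
+ * \code
+ * int len = hwloc_bitmap_list_snprintf(NULL, 0, set);
+ * char *buf = malloc(len + 1);
+ * hwloc_bitmap_list_snprintf(buf, len + 1, set);
+ * \endcode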
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_list_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
+
+/** \brief Stringify a bitmap into a newly allocated list string.
+ *
+ * \return -1 on error.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_list_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
+
+/** \brief Parse a list string and store it in bitmap \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_list_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string);
+
+/** \brief Stringify a bitmap in the taskset-specific format.
+ *
+ * The taskset command manipulates bitmap strings that contain a single
+ * (possibly very long) hexadecimal number starting with 0x.
+ *
+ * Up to \p buflen characters may be written in buffer \p buf.
+ *
+ * If \p buflen is 0, \p buf may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_taskset_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
+
+/** \brief Stringify a bitmap into a newly allocated taskset-specific string.
+ *
+ * \return -1 on error.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_taskset_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
+
+/** \brief Parse a taskset-specific bitmap string and store it in bitmap \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_taskset_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string);
+
+
+/*
+ * Building bitmaps.
+ */
+
+/** \brief Empty the bitmap \p bitmap */
+HWLOC_DECLSPEC void hwloc_bitmap_zero(hwloc_bitmap_t bitmap);
+
+/** \brief Fill bitmap \p bitmap with all possible indexes (even if those objects don't exist or are otherwise unavailable) */
+HWLOC_DECLSPEC void hwloc_bitmap_fill(hwloc_bitmap_t bitmap);
+
+/** \brief Empty the bitmap \p bitmap and add bit \p id */
+HWLOC_DECLSPEC int hwloc_bitmap_only(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Fill the bitmap \p bitmap and clear the index \p id */
+HWLOC_DECLSPEC int hwloc_bitmap_allbut(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Setup bitmap \p bitmap from unsigned long \p mask */
+HWLOC_DECLSPEC int hwloc_bitmap_from_ulong(hwloc_bitmap_t bitmap, unsigned long mask);
+
+/** \brief Setup bitmap \p bitmap from unsigned long \p mask used as \p i -th subset */
+HWLOC_DECLSPEC int hwloc_bitmap_from_ith_ulong(hwloc_bitmap_t bitmap, unsigned i, unsigned long mask);
+
+/** \brief Setup bitmap \p bitmap from unsigned longs \p masks used as first \p nr subsets */
+HWLOC_DECLSPEC int hwloc_bitmap_from_ulongs(hwloc_bitmap_t bitmap, unsigned nr, const unsigned long *masks);
+
+
+/*
+ * Modifying bitmaps.
+ */
+
+/** \brief Add index \p id in bitmap \p bitmap */
+HWLOC_DECLSPEC int hwloc_bitmap_set(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Add indexes from \p begin to \p end in bitmap \p bitmap.
+ *
+ * If \p end is \c -1, the range is infinite.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_set_range(hwloc_bitmap_t bitmap, unsigned begin, int end);
+
+/** \brief Replace \p i -th subset of bitmap \p bitmap with unsigned long \p mask */
+HWLOC_DECLSPEC int hwloc_bitmap_set_ith_ulong(hwloc_bitmap_t bitmap, unsigned i, unsigned long mask);
+
+/** \brief Remove index \p id from bitmap \p bitmap */
+HWLOC_DECLSPEC int hwloc_bitmap_clr(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Remove indexes from \p begin to \p end in bitmap \p bitmap.
+ *
+ * If \p end is \c -1, the range is infinite.
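+ *
+ * For instance, a sketch building the set {0..7} minus {2,3}:
+ * \code
+ * hwloc_bitmap_t b = hwloc_bitmap_alloc();
+ * hwloc_bitmap_set_range(b, 0, 7);
+ * hwloc_bitmap_clr_range(b, 2, 3);
+ * // b now contains indexes 0, 1, 4, 5, 6 and 7
+ * hwloc_bitmap_free(b);
+ * \endcode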
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_clr_range(hwloc_bitmap_t bitmap, unsigned begin, int end);
+
+/** \brief Keep a single index among those set in bitmap \p bitmap
+ *
+ * May be useful before binding so that the process does not
+ * have a chance of migrating between multiple processors
+ * in the original mask.
+ * Instead of running the task on any PU inside the given CPU set,
+ * the operating system scheduler will be forced to run it on a single
+ * one of these PUs.
+ * It avoids a migration overhead and cache-line ping-pongs between PUs.
+ *
+ * \note This function is NOT meant to distribute multiple processes
+ * within a single CPU set. It always returns the same single bit when
+ * called multiple times on the same input set. hwloc_distrib() may
+ * be used for generating CPU sets to distribute multiple tasks below
+ * a single multi-PU object.
+ *
+ * \note This function cannot be applied to an object set directly. It
+ * should be applied to a copy (which may be obtained with hwloc_bitmap_dup()).
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_singlify(hwloc_bitmap_t bitmap);
+
+
+/*
+ * Consulting bitmaps.
+ */
+
+/** \brief Convert the beginning part of bitmap \p bitmap into unsigned long \p mask */
+HWLOC_DECLSPEC unsigned long hwloc_bitmap_to_ulong(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Convert the \p i -th subset of bitmap \p bitmap into unsigned long mask */
+HWLOC_DECLSPEC unsigned long hwloc_bitmap_to_ith_ulong(hwloc_const_bitmap_t bitmap, unsigned i) __hwloc_attribute_pure;
+
+/** \brief Convert the first \p nr subsets of bitmap \p bitmap into the array of \p nr unsigned long \p masks
+ *
+ * \p nr may be determined earlier with hwloc_bitmap_nr_ulongs().
+ *
+ * \return 0
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_to_ulongs(hwloc_const_bitmap_t bitmap, unsigned nr, unsigned long *masks);
+
+/** \brief Return the number of unsigned longs required for storing bitmap \p bitmap entirely
+ *
+ * This is the number of contiguous unsigned longs from the very first bit of the bitmap
+ * (even if unset) up to the last set bit.
+ * This is useful for knowing the \p nr parameter to pass to hwloc_bitmap_to_ulongs()
+ * (or which calls to hwloc_bitmap_to_ith_ulong() are needed)
+ * to entirely convert a bitmap into multiple unsigned longs.
+ *
+ * When called on the output of hwloc_topology_get_topology_cpuset(),
+ * the returned number is large enough for all cpusets of the topology.
+ *
+ * \return -1 if \p bitmap is infinite.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_nr_ulongs(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Test whether index \p id is part of bitmap \p bitmap.
+ *
+ * \return 1 if the bit at index \p id is set in bitmap \p bitmap, 0 otherwise.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_isset(hwloc_const_bitmap_t bitmap, unsigned id) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p bitmap is empty
+ *
+ * \return 1 if bitmap is empty, 0 otherwise.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_iszero(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p bitmap is completely full
+ *
+ * \return 1 if bitmap is full, 0 otherwise.
+ *
+ * \note A full bitmap is always infinitely set.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_isfull(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the first index (least significant bit) in bitmap \p bitmap
+ *
+ * \return -1 if no index is set in \p bitmap.
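+ *
+ * For instance, a sketch walking all set indexes with this function and
+ * hwloc_bitmap_next() (assuming \p bitmap is not infinitely set):
+ * \code
+ * int id = hwloc_bitmap_first(bitmap);
+ * while (id != -1) {
+ *   // process index id here
+ *   id = hwloc_bitmap_next(bitmap, id);
+ * }
+ * \endcode
+ * The hwloc_bitmap_foreach_begin()/hwloc_bitmap_foreach_end() macros below
+ * wrap this pattern.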
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_first(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the next index in bitmap \p bitmap which is after index \p prev
+ *
+ * If \p prev is -1, the first index is returned.
+ *
+ * \return -1 if no index higher than \p prev is set in \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_next(hwloc_const_bitmap_t bitmap, int prev) __hwloc_attribute_pure;
+
+/** \brief Compute the last index (most significant bit) in bitmap \p bitmap
+ *
+ * \return -1 if no index is set in \p bitmap, or if \p bitmap is infinitely set.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_last(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the "weight" of bitmap \p bitmap (i.e., number of
+ * indexes that are in the bitmap).
+ *
+ * \return the number of indexes that are in the bitmap.
+ *
+ * \return -1 if \p bitmap is infinitely set.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_weight(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the first unset index (least significant bit) in bitmap \p bitmap
+ *
+ * \return -1 if no index is unset in \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_first_unset(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the next unset index in bitmap \p bitmap which is after index \p prev
+ *
+ * If \p prev is -1, the first unset index is returned.
+ *
+ * \return -1 if no index higher than \p prev is unset in \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_next_unset(hwloc_const_bitmap_t bitmap, int prev) __hwloc_attribute_pure;
+
+/** \brief Compute the last unset index (most significant bit) in bitmap \p bitmap
+ *
+ * \return -1 if no index is unset in \p bitmap, or if \p bitmap is infinitely set.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_last_unset(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Loop macro iterating on bitmap \p bitmap
+ *
+ * The loop must start with hwloc_bitmap_foreach_begin() and end
+ * with hwloc_bitmap_foreach_end() followed by a terminating ';'.
+ *
+ * \p index is the loop variable; it should be an unsigned int. The
+ * first iteration will set \p index to the lowest index in the bitmap.
+ * Successive iterations will iterate through, in order, all remaining
+ * indexes set in the bitmap. To be specific: each iteration will return a
+ * value for \p index such that hwloc_bitmap_isset(bitmap, index) is true.
+ *
+ * The assert prevents the loop from being infinite if the bitmap is infinitely set.
+ *
+ * \hideinitializer
+ */
+#define hwloc_bitmap_foreach_begin(id, bitmap) \
+do { \
+  assert(hwloc_bitmap_weight(bitmap) != -1); \
+  for (id = hwloc_bitmap_first(bitmap); \
+       (unsigned) id != (unsigned) -1; \
+       id = hwloc_bitmap_next(bitmap, id)) {
+
+/** \brief End of loop macro iterating on a bitmap.
+ *
+ * Needs a terminating ';'.
+ *
+ * \sa hwloc_bitmap_foreach_begin()
+ * \hideinitializer
+ */
+#define hwloc_bitmap_foreach_end() \
+  } \
+} while (0)
+
+
+/*
+ * Combining bitmaps.
+ */
+
+/** \brief Or bitmaps \p bitmap1 and \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_or (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief And bitmaps \p bitmap1 and \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_and (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief And bitmap \p bitmap1 and the negation of \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_andnot (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief Xor bitmaps \p bitmap1 and \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_xor (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief Negate bitmap \p bitmap and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_not (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap);
+
+
+/*
+ * Comparing bitmaps.
+ */
+
+/** \brief Test whether bitmaps \p bitmap1 and \p bitmap2 intersect.
+ *
+ * \return 1 if bitmaps intersect, 0 otherwise.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_intersects (hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p sub_bitmap is part of bitmap \p super_bitmap.
+ *
+ * \return 1 if \p sub_bitmap is included in \p super_bitmap, 0 otherwise.
+ *
+ * \note The empty bitmap is considered included in any other bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_isincluded (hwloc_const_bitmap_t sub_bitmap, hwloc_const_bitmap_t super_bitmap) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p bitmap1 is equal to bitmap \p bitmap2.
+ *
+ * \return 1 if bitmaps are equal, 0 otherwise.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_isequal (hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** \brief Compare bitmaps \p bitmap1 and \p bitmap2 using their lowest index.
+ *
+ * A bitmap is considered smaller if its least significant bit is smaller.
+ * The empty bitmap is considered higher than anything (because its least significant bit does not exist).
+ *
+ * \return -1 if \p bitmap1 is considered smaller than \p bitmap2.
+ * \return 1 if \p bitmap1 is considered larger than \p bitmap2.
+ *
+ * For instance comparing binary bitmaps 0011 and 0110 returns -1
+ * (hence 0011 is considered smaller than 0110)
+ * because the least significant bit of 0011 (0001) is smaller than the least significant bit of 0110 (0010).
+ * Comparing 01001 and 00110 would also return -1 for the same reason.
+ *
+ * \return 0 if bitmaps are considered equal, even if they are not strictly equal.
+ * They just need to have the same least significant bit.
+ * For instance, comparing binary bitmaps 0010 and 0110 returns 0 because they have the same least significant bit.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_compare_first(hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** \brief Compare bitmaps \p bitmap1 and \p bitmap2 in lexicographic order.
+ *
+ * Lexicographic comparison of bitmaps, starting from their highest indexes.
+ * Compare last indexes first, then second, etc.
+ * The empty bitmap is considered lower than anything.
+ *
+ * \return -1 if \p bitmap1 is considered smaller than \p bitmap2.
+ * \return 1 if \p bitmap1 is considered larger than \p bitmap2.
+ * \return 0 if bitmaps are equal (contrary to hwloc_bitmap_compare_first()).
+ *
+ * For instance comparing binary bitmaps 0011 and 0110 returns -1
+ * (hence 0011 is considered smaller than 0110).
+ * Comparing 00101 and 01010 returns -1 too.
+ *
+ * \note This is different from the non-existing hwloc_bitmap_compare_last()
+ * which would only compare the highest index of each bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_compare(hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_BITMAP_H */
diff --git a/deps/hwloc/include/hwloc/cpukinds.h b/deps/hwloc/include/hwloc/cpukinds.h
new file mode 100644
index 000000000..f240baf39
--- /dev/null
+++ b/deps/hwloc/include/hwloc/cpukinds.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright © 2020 Inria. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Kinds of CPU cores.
+ */
+
+#ifndef HWLOC_CPUKINDS_H
+#define HWLOC_CPUKINDS_H
+
+#include "hwloc.h"
+
+#ifdef __cplusplus
+extern "C" {
+#elif 0
+}
+#endif
+
+/** \defgroup hwlocality_cpukinds Kinds of CPU cores
+ *
+ * Platforms with heterogeneous CPUs may have some cores with
+ * different features or frequencies.
+ * This API exposes identical PUs in sets called CPU kinds.
+ * Each PU of the topology may only be in a single kind.
+ *
+ * The number of kinds may be obtained with hwloc_cpukinds_get_nr().
+ * If the platform is homogeneous, there may be a single kind
+ * with all PUs.
+ * If the platform or operating system does not expose any
+ * information about CPU cores, there may be no kind at all.
+ *
+ * The index of the kind that describes a given CPU set
+ * (if any, and not partially)
+ * may be obtained with hwloc_cpukinds_get_by_cpuset().
+ *
+ * From the index of a kind, it is possible to retrieve information
+ * with hwloc_cpukinds_get_info():
+ * an abstracted efficiency value,
+ * and an array of info attributes
+ * (for instance the "CoreType" and "FrequencyMaxMHz",
+ * see \ref topoattrs_cpukinds).
+ *
+ * A higher efficiency value means intrinsically greater performance
+ * (and possibly less performance/power efficiency).
+ * Kinds with lower efficiency are ranked first:
+ * Passing 0 as \p kind_index to hwloc_cpukinds_get_info() will
+ * return information about the least efficient CPU kind.
+ *
+ * When available, efficiency values are gathered from the operating
+ * system (when \p cpukind_efficiency is set in the
+ * struct hwloc_topology_discovery_support array, only on Windows 10 for now).
+ * Otherwise hwloc tries to compute efficiencies
+ * by comparing CPU kinds using frequencies (on ARM),
+ * or core types and frequencies (on other architectures).
+ * The environment variable HWLOC_CPUKINDS_RANKING may be used
+ * to change these heuristics, see \ref envvar.
+ *
+ * If hwloc fails to rank any kind, for instance because the operating
+ * system does not expose efficiencies and core frequencies,
+ * all kinds will have an unknown efficiency (\c -1),
+ * and they are not indexed/ordered in any specific way.
+ *
+ * @{
+ */
+
+/** \brief Get the number of different kinds of CPU cores in the topology.
+ *
+ * \p flags must be \c 0 for now.
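+ *
+ * For instance, a sketch enumerating all kinds with this function and
+ * hwloc_cpukinds_get_info() below (error checks omitted):
+ * \code
+ * int i, nr = hwloc_cpukinds_get_nr(topology, 0);
+ * for (i = 0; i < nr; i++) {
+ *   int efficiency;
+ *   hwloc_bitmap_t set = hwloc_bitmap_alloc();
+ *   hwloc_cpukinds_get_info(topology, i, set, &efficiency, NULL, NULL, 0);
+ *   // inspect set and efficiency here
+ *   hwloc_bitmap_free(set);
+ * }
+ * \endcode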
+ * + * \return The number of CPU kinds (positive integer) on success. + * \return \c 0 if no information about kinds was found. + * \return \c -1 with \p errno set to \c EINVAL if \p flags is invalid. + */ +HWLOC_DECLSPEC int +hwloc_cpukinds_get_nr(hwloc_topology_t topology, + unsigned long flags); + +/** \brief Get the index of the CPU kind that contains CPUs listed in \p cpuset. + * + * \p flags must be \c 0 for now. + * + * \return The index of the CPU kind (positive integer or 0) on success. + * \return \c -1 with \p errno set to \c EXDEV if \p cpuset is + * only partially included in the some kind. + * \return \c -1 with \p errno set to \c ENOENT if \p cpuset is + * not included in any kind, even partially. + * \return \c -1 with \p errno set to \c EINVAL if parameters are invalid. + */ +HWLOC_DECLSPEC int +hwloc_cpukinds_get_by_cpuset(hwloc_topology_t topology, + hwloc_const_bitmap_t cpuset, + unsigned long flags); + +/** \brief Get the CPU set and infos about a CPU kind in the topology. + * + * \p kind_index identifies one kind of CPU between 0 and the number + * of kinds returned by hwloc_cpukinds_get_nr() minus 1. + * + * If not \c NULL, the bitmap \p cpuset will be filled with + * the set of PUs of this kind. + * + * The integer pointed by \p efficiency, if not \c NULL will, be filled + * with the ranking of this kind of CPU in term of efficiency (see above). + * It ranges from \c 0 to the number of kinds + * (as reported by hwloc_cpukinds_get_nr()) minus 1. + * + * Kinds with lower efficiency are reported first. + * + * If there is a single kind in the topology, its efficiency \c 0. + * If the efficiency of some kinds of cores is unknown, + * the efficiency of all kinds is set to \c -1, + * and kinds are reported in no specific order. + * + * The array of info attributes (for instance the "CoreType", + * "FrequencyMaxMHz" or "FrequencyBaseMHz", see \ref topoattrs_cpukinds) + * and its length are returned in \p infos or \p nr_infos. + * The array belongs to the topology, it should not be freed or modified. + * + * If \p nr_infos or \p infos is \c NULL, no info is returned. + * + * \p flags must be \c 0 for now. + * + * \return \c 0 on success. + * \return \c -1 with \p errno set to \c ENOENT if \p kind_index does not match any CPU kind. + * \return \c -1 with \p errno set to \c EINVAL if parameters are invalid. + */ +HWLOC_DECLSPEC int +hwloc_cpukinds_get_info(hwloc_topology_t topology, + unsigned kind_index, + hwloc_bitmap_t cpuset, + int *efficiency, + unsigned *nr_infos, struct hwloc_info_s **infos, + unsigned long flags); + +/** \brief Register a kind of CPU in the topology. + * + * Mark the PUs listed in \p cpuset as being of the same kind + * with respect to the given attributes. + * + * \p forced_efficiency should be \c -1 if unknown. + * Otherwise it is an abstracted efficiency value to enforce + * the ranking of all kinds if all of them have valid (and + * different) efficiencies. + * + * The array \p infos of size \p nr_infos may be used to provide + * info names and values describing this kind of PUs. + * + * \p flags must be \c 0 for now. + * + * Parameters \p cpuset and \p infos will be duplicated internally, + * the caller is responsible for freeing them. + * + * If \p cpuset overlaps with some existing kinds, those might get + * modified or split. 
For instance if existing kind A contains + * PUs 0 and 1, and one registers another kind for PU 1 and 2, + * there will be 3 resulting kinds: + * existing kind A is restricted to only PU 0; + * new kind B contains only PU 1 and combines information from A + * and from the newly-registered kind; + * new kind C contains only PU 2 and only gets information from + * the newly-registered kind. + * + * \note The efficiency \p forced_efficiency provided to this function + * may be different from the one reported later by hwloc_cpukinds_get_info() + * because hwloc will scale efficiency values down to + * between 0 and the number of kinds minus 1. + * + * \return \c 0 on success. + * \return \c -1 with \p errno set to \c EINVAL if some parameters are invalid, + * for instance if \p cpuset is \c NULL or empty. + */ +HWLOC_DECLSPEC int +hwloc_cpukinds_register(hwloc_topology_t topology, + hwloc_bitmap_t cpuset, + int forced_efficiency, + unsigned nr_infos, struct hwloc_info_s *infos, + unsigned long flags); + +/** @} */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif + + +#endif /* HWLOC_CPUKINDS_H */ diff --git a/deps/hwloc/include/hwloc/deprecated.h b/deps/hwloc/include/hwloc/deprecated.h new file mode 100644 index 000000000..f2419dd48 --- /dev/null +++ b/deps/hwloc/include/hwloc/deprecated.h @@ -0,0 +1,217 @@ +/* + * Copyright © 2009 CNRS + * Copyright © 2009-2021 Inria. All rights reserved. + * Copyright © 2009-2012 Université Bordeaux + * Copyright © 2009-2010 Cisco Systems, Inc. All rights reserved. + * See COPYING in top-level directory. + */ + +/** + * This file contains the inline code of functions declared in hwloc.h + */ + +#ifndef HWLOC_DEPRECATED_H +#define HWLOC_DEPRECATED_H + +#ifndef HWLOC_H +#error Please include the main hwloc.h instead +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* backward compat with v2.0 before WHOLE_SYSTEM renaming */ +#define HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED +/* backward compat with v1.11 before System removal */ +#define HWLOC_OBJ_SYSTEM HWLOC_OBJ_MACHINE +/* backward compat with v1.10 before Socket->Package renaming */ +#define HWLOC_OBJ_SOCKET HWLOC_OBJ_PACKAGE +/* backward compat with v1.10 before Node->NUMANode clarification */ +#define HWLOC_OBJ_NODE HWLOC_OBJ_NUMANODE + +/** \brief Add a distances structure. + * + * Superseded by hwloc_distances_add_create()+hwloc_distances_add_values()+hwloc_distances_add_commit() + * in v2.5. + */ +HWLOC_DECLSPEC int hwloc_distances_add(hwloc_topology_t topology, + unsigned nbobjs, hwloc_obj_t *objs, hwloc_uint64_t *values, + unsigned long kind, unsigned long flags) __hwloc_attribute_deprecated; + +/** \brief Insert a misc object by parent. + * + * Identical to hwloc_topology_insert_misc_object(). + */ +static __hwloc_inline hwloc_obj_t +hwloc_topology_insert_misc_object_by_parent(hwloc_topology_t topology, hwloc_obj_t parent, const char *name) __hwloc_attribute_deprecated; +static __hwloc_inline hwloc_obj_t +hwloc_topology_insert_misc_object_by_parent(hwloc_topology_t topology, hwloc_obj_t parent, const char *name) +{ + return hwloc_topology_insert_misc_object(topology, parent, name); +} + +/** \brief Stringify the cpuset containing a set of objects. + * + * If \p size is 0, \p string may safely be \c NULL. + * + * \return the number of character that were actually written if not truncating, + * or that would have been written (not including the ending \\0). 
+ */ +static __hwloc_inline int +hwloc_obj_cpuset_snprintf(char *str, size_t size, size_t nobj, struct hwloc_obj * const *objs) __hwloc_attribute_deprecated; +static __hwloc_inline int +hwloc_obj_cpuset_snprintf(char *str, size_t size, size_t nobj, struct hwloc_obj * const *objs) +{ + hwloc_bitmap_t set = hwloc_bitmap_alloc(); + int res; + unsigned i; + + hwloc_bitmap_zero(set); + for(i=0; i<nobj; i++) + if (objs[i]->cpuset) + hwloc_bitmap_or(set, set, objs[i]->cpuset); + + res = hwloc_bitmap_snprintf(str, size, set); + hwloc_bitmap_free(set); + return res; +} + +/** \brief Convert a type string into a type and some attributes. + * + * Deprecated by hwloc_type_sscanf() + */ +static __hwloc_inline int +hwloc_obj_type_sscanf(const char *string, hwloc_obj_type_t *typep, int *depthattrp, void *typeattrp, size_t typeattrsize) __hwloc_attribute_deprecated; +static __hwloc_inline int +hwloc_obj_type_sscanf(const char *string, hwloc_obj_type_t *typep, int *depthattrp, void *typeattrp, size_t typeattrsize) +{ + union hwloc_obj_attr_u attr; + int err = hwloc_type_sscanf(string, typep, &attr, sizeof(attr)); + if (err < 0) + return err; + if (hwloc_obj_type_is_cache(*typep)) { + if (depthattrp) + *depthattrp = (int) attr.cache.depth; + if (typeattrp && typeattrsize >= sizeof(hwloc_obj_cache_type_t)) + memcpy(typeattrp, &attr.cache.type, sizeof(hwloc_obj_cache_type_t)); + } else if (*typep == HWLOC_OBJ_GROUP) { + if (depthattrp) + *depthattrp = (int) attr.group.depth; + } + return 0; +} + +/** \brief Set the default memory binding policy of the current + * process or thread to prefer the NUMA node(s) specified by physical \p nodeset + */ +static __hwloc_inline int +hwloc_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_deprecated; +static __hwloc_inline int +hwloc_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) +{ + return hwloc_set_membind(topology, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET); +} + +/** \brief Query the default memory binding policy and physical locality of the + * current process or thread. + */ +static __hwloc_inline int +hwloc_get_membind_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags) __hwloc_attribute_deprecated; +static __hwloc_inline int +hwloc_get_membind_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags) +{ + return hwloc_get_membind(topology, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET); +} + +/** \brief Set the default memory binding policy of the specified + * process to prefer the NUMA node(s) specified by physical \p nodeset + */ +static __hwloc_inline int +hwloc_set_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_deprecated; +static __hwloc_inline int +hwloc_set_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) +{ + return hwloc_set_proc_membind(topology, pid, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET); +} + +/** \brief Query the default memory binding policy and physical locality of the + * specified process. 
+ */ +static __hwloc_inline int +hwloc_get_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags) __hwloc_attribute_deprecated; +static __hwloc_inline int +hwloc_get_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags) +{ + return hwloc_get_proc_membind(topology, pid, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET); +} + +/** \brief Bind the already-allocated memory identified by (addr, len) + * to the NUMA node(s) in physical \p nodeset. + */ +static __hwloc_inline int +hwloc_set_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_deprecated; +static __hwloc_inline int +hwloc_set_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) +{ + return hwloc_set_area_membind(topology, addr, len, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET); +} + +/** \brief Query the physical NUMA node(s) and binding policy of the memory + * identified by (\p addr, \p len ). + */ +static __hwloc_inline int +hwloc_get_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags) __hwloc_attribute_deprecated; +static __hwloc_inline int +hwloc_get_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags) +{ + return hwloc_get_area_membind(topology, addr, len, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET); +} + +/** \brief Allocate some memory on the given physical nodeset \p nodeset + */ +static __hwloc_inline void * +hwloc_alloc_membind_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc __hwloc_attribute_deprecated; +static __hwloc_inline void * +hwloc_alloc_membind_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) +{ + return hwloc_alloc_membind(topology, len, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET); +} + +/** \brief Allocate some memory on the given nodeset \p nodeset. 
+ */ +static __hwloc_inline void * +hwloc_alloc_membind_policy_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc __hwloc_attribute_deprecated; +static __hwloc_inline void * +hwloc_alloc_membind_policy_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) +{ + return hwloc_alloc_membind_policy(topology, len, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET); +} + +/** \brief Convert a CPU set into a NUMA node set and handle non-NUMA cases + */ +static __hwloc_inline void +hwloc_cpuset_to_nodeset_strict(hwloc_topology_t topology, hwloc_const_cpuset_t _cpuset, hwloc_nodeset_t nodeset) __hwloc_attribute_deprecated; +static __hwloc_inline void +hwloc_cpuset_to_nodeset_strict(hwloc_topology_t topology, hwloc_const_cpuset_t _cpuset, hwloc_nodeset_t nodeset) +{ + hwloc_cpuset_to_nodeset(topology, _cpuset, nodeset); +} + +/** \brief Convert a NUMA node set into a CPU set and handle non-NUMA cases + */ +static __hwloc_inline void +hwloc_cpuset_from_nodeset_strict(hwloc_topology_t topology, hwloc_cpuset_t _cpuset, hwloc_const_nodeset_t nodeset) __hwloc_attribute_deprecated; +static __hwloc_inline void +hwloc_cpuset_from_nodeset_strict(hwloc_topology_t topology, hwloc_cpuset_t _cpuset, hwloc_const_nodeset_t nodeset) +{ + hwloc_cpuset_from_nodeset(topology, _cpuset, nodeset); +} + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + + +#endif /* HWLOC_DEPRECATED_H */ diff --git a/deps/hwloc/include/hwloc/diff.h b/deps/hwloc/include/hwloc/diff.h new file mode 100644 index 000000000..0ad0486be --- /dev/null +++ b/deps/hwloc/include/hwloc/diff.h @@ -0,0 +1,289 @@ +/* + * Copyright © 2013-2020 Inria. All rights reserved. + * See COPYING in top-level directory. + */ + +/** \file + * \brief Topology differences. + */ + +#ifndef HWLOC_DIFF_H +#define HWLOC_DIFF_H + +#ifndef HWLOC_H +#error Please include the main hwloc.h instead +#endif + + +#ifdef __cplusplus +extern "C" { +#elif 0 +} +#endif + + +/** \defgroup hwlocality_diff Topology differences + * + * Applications that manipulate many similar topologies, for instance + * one for each node of a homogeneous cluster, may want to compress + * topologies to reduce the memory footprint. + * + * This file offers a way to manipulate the difference between topologies + * and export/import it to/from XML. + * Compression may therefore be achieved by storing one topology + * entirely while the others are only described by their differences + * with the former. + * The actual topology can be reconstructed when actually needed by + * applying the precomputed difference to the reference topology. + * + * This interface targets very similar nodes. + * Only very simple differences between topologies are actually + * supported, for instance a change in the memory size, the name + * of the object, or some info attribute. + * More complex differences such as adding or removing objects cannot + * be represented in the difference structures and therefore return + * errors. + * Differences between object sets or topology-wide allowed sets, + * cannot be represented either. + * + * It means that there is no need to apply the difference when + * looking at the tree organization (how many levels, how many + * objects per level, what kind of objects, CPU and node sets, etc) + * and when binding to objects. 
+ * However the difference must be applied when looking at object + * attributes such as the name, the memory size or info attributes. + * + * @{ + */ + + +/** \brief Type of one object attribute difference. + */ +typedef enum hwloc_topology_diff_obj_attr_type_e { + /** \brief The object local memory is modified. + * The union is a hwloc_topology_diff_obj_attr_u::hwloc_topology_diff_obj_attr_uint64_s + * (and the index field is ignored). + */ + HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE, + + /** \brief The object name is modified. + * The union is a hwloc_topology_diff_obj_attr_u::hwloc_topology_diff_obj_attr_string_s + * (and the name field is ignored). + */ + + HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME, + /** \brief the value of an info attribute is modified. + * The union is a hwloc_topology_diff_obj_attr_u::hwloc_topology_diff_obj_attr_string_s. + */ + HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO +} hwloc_topology_diff_obj_attr_type_t; + +/** \brief One object attribute difference. + */ +union hwloc_topology_diff_obj_attr_u { + struct hwloc_topology_diff_obj_attr_generic_s { + /* each part of the union must start with these */ + hwloc_topology_diff_obj_attr_type_t type; + } generic; + + /** \brief Integer attribute modification with an optional index. */ + struct hwloc_topology_diff_obj_attr_uint64_s { + /* used for storing integer attributes */ + hwloc_topology_diff_obj_attr_type_t type; + hwloc_uint64_t index; /* not used for SIZE */ + hwloc_uint64_t oldvalue; + hwloc_uint64_t newvalue; + } uint64; + + /** \brief String attribute modification with an optional name */ + struct hwloc_topology_diff_obj_attr_string_s { + /* used for storing name and info pairs */ + hwloc_topology_diff_obj_attr_type_t type; + char *name; /* not used for NAME */ + char *oldvalue; + char *newvalue; + } string; +}; + + +/** \brief Type of one element of a difference list. + */ +typedef enum hwloc_topology_diff_type_e { + /** \brief An object attribute was changed. + * The union is a hwloc_topology_diff_u::hwloc_topology_diff_obj_attr_s. + */ + HWLOC_TOPOLOGY_DIFF_OBJ_ATTR, + + /** \brief The difference is too complex, + * it cannot be represented. The difference below + * this object has not been checked. + * hwloc_topology_diff_build() will return 1. + * + * The union is a hwloc_topology_diff_u::hwloc_topology_diff_too_complex_s. + */ + HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX +} hwloc_topology_diff_type_t; + +/** \brief One element of a difference list between two topologies. + */ +typedef union hwloc_topology_diff_u { + struct hwloc_topology_diff_generic_s { + /* each part of the union must start with these */ + hwloc_topology_diff_type_t type; + union hwloc_topology_diff_u * next; /* pointer to the next element of the list, or NULL */ + } generic; + + /* A difference in an object attribute. */ + struct hwloc_topology_diff_obj_attr_s { + hwloc_topology_diff_type_t type; /* must be ::HWLOC_TOPOLOGY_DIFF_OBJ_ATTR */ + union hwloc_topology_diff_u * next; + /* List of attribute differences for a single object */ + int obj_depth; + unsigned obj_index; + union hwloc_topology_diff_obj_attr_u diff; + } obj_attr; + + /* A difference that is too complex. */ + struct hwloc_topology_diff_too_complex_s { + hwloc_topology_diff_type_t type; /* must be ::HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX */ + union hwloc_topology_diff_u * next; + /* Where we had to stop computing the diff in the first topology */ + int obj_depth; + unsigned obj_index; + } too_complex; +} * hwloc_topology_diff_t; + + +/** \brief Compute the difference between 2 topologies. 
+ *
+ * The difference is stored as a list of ::hwloc_topology_diff_t entries
+ * starting at \p diff.
+ * It is computed by doing a depth-first traversal of both topology trees
+ * simultaneously.
+ *
+ * If the difference between 2 objects is too complex to be represented
+ * (for instance if some objects have different types, or different numbers
+ * of children), a special diff entry of type ::HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX
+ * is queued.
+ * The computation of the diff does not continue below these objects.
+ * So each such diff entry means that the difference between two subtrees
+ * could not be computed.
+ *
+ * \return 0 if the difference can be represented properly.
+ *
+ * \return 0 with \p diff pointing to NULL if there is no difference
+ * between the topologies.
+ *
+ * \return 1 if the difference is too complex (see above). Some entries in
+ * the list will be of type ::HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX.
+ *
+ * \return -1 on any other error.
+ *
+ * \note \p flags is currently not used. It should be 0.
+ *
+ * \note The output diff has to be freed with hwloc_topology_diff_destroy().
+ *
+ * \note The output diff can only be exported to XML or passed to
+ * hwloc_topology_diff_apply() if 0 was returned, i.e. if no entry of type
+ * ::HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX is listed.
+ *
+ * \note The output diff may be modified by removing some entries from
+ * the list. The removed entries should be freed by passing them to
+ * hwloc_topology_diff_destroy() (possibly as another list).
+*/
+HWLOC_DECLSPEC int hwloc_topology_diff_build(hwloc_topology_t topology, hwloc_topology_t newtopology, unsigned long flags, hwloc_topology_diff_t *diff);
+
+/** \brief Flags to be given to hwloc_topology_diff_apply().
+ */
+enum hwloc_topology_diff_apply_flags_e {
+  /** \brief Apply topology diff in reverse direction.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE = (1UL<<0)
+};
+
+/** \brief Apply a topology diff to an existing topology.
+ *
+ * \p flags is an OR'ed set of ::hwloc_topology_diff_apply_flags_e.
+ *
+ * The new topology is modified in place. hwloc_topology_dup()
+ * may be used to duplicate it before patching.
+ *
+ * If the difference cannot be applied entirely, all previously applied
+ * elements are unapplied before returning.
+ *
+ * \return 0 on success.
+ *
+ * \return -N if applying the difference failed while trying
+ * to apply the N-th part of the difference. For instance -1
+ * is returned if the very first difference element could not
+ * be applied.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_apply(hwloc_topology_t topology, hwloc_topology_diff_t diff, unsigned long flags);
+
+/** \brief Destroy a list of topology differences.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_destroy(hwloc_topology_diff_t diff);
+
+/** \brief Load a list of topology differences from an XML file.
+ *
+ * If not \c NULL, \p refname will be filled with the identifier
+ * string of the reference topology for the difference file,
+ * if any was specified in the XML file.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ *
+ * \note The pointer returned in \p refname should later be freed
+ * by the caller.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_load_xml(const char *xmlpath, hwloc_topology_diff_t *diff, char **refname);
+
+/** \brief Export a list of topology differences to an XML file.
+ *
+ * If not \c NULL, \p refname defines an identifier string
+ * for the reference topology which was used as a base when
+ * computing this difference.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ * This attribute is given back when reading the diff from XML.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_export_xml(hwloc_topology_diff_t diff, const char *refname, const char *xmlpath);
+
+/** \brief Load a list of topology differences from an XML buffer.
+ *
+ * If not \c NULL, \p refname will be filled with the identifier
+ * string of the reference topology for the difference file,
+ * if any was specified in the XML file.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ *
+ * \note The pointer returned in \p refname should later be freed
+ * by the caller.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_load_xmlbuffer(const char *xmlbuffer, int buflen, hwloc_topology_diff_t *diff, char **refname);
+
+/** \brief Export a list of topology differences to an XML buffer.
+ *
+ * If not \c NULL, \p refname defines an identifier string
+ * for the reference topology which was used as a base when
+ * computing this difference.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ * This attribute is given back when reading the diff from XML.
+ *
+ * The returned buffer ends with a \0 that is included in the returned
+ * length.
+ *
+ * \note The XML buffer should later be freed with hwloc_free_xmlbuffer().
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_export_xmlbuffer(hwloc_topology_diff_t diff, const char *refname, char **xmlbuffer, int *buflen);
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_DIFF_H */
diff --git a/deps/hwloc/include/hwloc/distances.h b/deps/hwloc/include/hwloc/distances.h
new file mode 100644
index 000000000..c12856cd8
--- /dev/null
+++ b/deps/hwloc/include/hwloc/distances.h
@@ -0,0 +1,447 @@
+/*
+ * Copyright © 2010-2021 Inria. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Object distances.
+ */
+
+#ifndef HWLOC_DISTANCES_H
+#define HWLOC_DISTANCES_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#elif 0
+}
+#endif
+
+
+/** \defgroup hwlocality_distances_get Retrieve distances between objects
+ * @{
+ */
+
+/** \brief Matrix of distances between a set of objects.
+ *
+ * This matrix often contains latencies between NUMA nodes
+ * (as reported in the System Locality Distance Information Table (SLIT)
+ * in the ACPI specification), which may or may not be physically accurate.
+ * It corresponds to the latency for accessing the memory of one node
+ * from a core in another node.
+ * The corresponding kind is ::HWLOC_DISTANCES_KIND_FROM_OS | ::HWLOC_DISTANCES_KIND_FROM_USER.
+ * The name of this distances structure is "NUMALatency".
+ * Other distances structures include "XGMIBandwidth" and "NVLinkBandwidth".
+ *
+ * The matrix may also contain bandwidths between random sets of objects,
+ * possibly provided by the user, as specified in the \p kind attribute.
+ *
+ * Pointers \p objs and \p values should not be replaced, reallocated, freed, etc.
+ * However callers are allowed to modify \p kind as well as the contents
+ * of \p objs and \p values arrays.
+ * For instance, if there is a single NUMA node per Package, + * hwloc_get_obj_with_same_locality() may be used to convert between them + * and replace NUMA nodes in the \p objs array with the corresponding Packages. + */ +struct hwloc_distances_s { + unsigned nbobjs; /**< \brief Number of objects described by the distance matrix. */ + hwloc_obj_t *objs; /**< \brief Array of objects described by the distance matrix. + * These objects are not in any particular order, + * see hwloc_distances_obj_index() and hwloc_distances_obj_pair_values() + * for easy ways to find objects in this array and their corresponding values. + */ + unsigned long kind; /**< \brief OR'ed set of ::hwloc_distances_kind_e. */ + hwloc_uint64_t *values; /**< \brief Matrix of distances between objects, stored as a one-dimension array. + * + * Distance from i-th to j-th object is stored in slot i*nbobjs+j. + * The meaning of the value depends on the \p kind attribute. + */ +}; + +/** \brief Kinds of distance matrices. + * + * The \p kind attribute of struct hwloc_distances_s is a OR'ed set + * of kinds. + * + * A kind of format HWLOC_DISTANCES_KIND_FROM_* specifies where the + * distance information comes from, if known. + * + * A kind of format HWLOC_DISTANCES_KIND_MEANS_* specifies whether + * values are latencies or bandwidths, if applicable. + */ +enum hwloc_distances_kind_e { + /** \brief These distances were obtained from the operating system or hardware. + * \hideinitializer + */ + HWLOC_DISTANCES_KIND_FROM_OS = (1UL<<0), + /** \brief These distances were provided by the user. + * \hideinitializer + */ + HWLOC_DISTANCES_KIND_FROM_USER = (1UL<<1), + + /** \brief Distance values are similar to latencies between objects. + * Values are smaller for closer objects, hence minimal on the diagonal + * of the matrix (distance between an object and itself). + * It could also be the number of network hops between objects, etc. + * \hideinitializer + */ + HWLOC_DISTANCES_KIND_MEANS_LATENCY = (1UL<<2), + /** \brief Distance values are similar to bandwidths between objects. + * Values are higher for closer objects, hence maximal on the diagonal + * of the matrix (distance between an object and itself). + * Such values are currently ignored for distance-based grouping. + * \hideinitializer + */ + HWLOC_DISTANCES_KIND_MEANS_BANDWIDTH = (1UL<<3), + + /** \brief This distances structure covers objects of different types. + * This may apply to the "NVLinkBandwidth" structure in presence + * of a NVSwitch or POWER processor NVLink port. + * \hideinitializer + */ + HWLOC_DISTANCES_KIND_HETEROGENEOUS_TYPES = (1UL<<4) +}; + +/** \brief Retrieve distance matrices. + * + * Retrieve distance matrices from the topology into the \p distances array. + * + * \p flags is currently unused, should be \c 0. + * + * \p kind serves as a filter. If \c 0, all distance matrices are returned. + * If it contains some HWLOC_DISTANCES_KIND_FROM_*, only distance matrices + * whose kind matches one of these are returned. + * If it contains some HWLOC_DISTANCES_KIND_MEANS_*, only distance matrices + * whose kind matches one of these are returned. + * + * On input, \p nr points to the number of distance matrices that may be stored + * in \p distances. + * On output, \p nr points to the number of distance matrices that were actually + * found, even if some of them couldn't be stored in \p distances. + * Distance matrices that couldn't be stored are ignored, but the function still + * returns success (\c 0). 
+ * The caller may find out by comparing the value pointed to by \p nr
+ * before and after the function call.
+ *
+ * Each distance matrix returned in the \p distances array should be released
+ * by the caller using hwloc_distances_release().
+ */
+HWLOC_DECLSPEC int
+hwloc_distances_get(hwloc_topology_t topology,
+                    unsigned *nr, struct hwloc_distances_s **distances,
+                    unsigned long kind, unsigned long flags);
+
+/** \brief Retrieve distance matrices for objects at a specific depth in the topology.
+ *
+ * Identical to hwloc_distances_get() with the additional \p depth filter.
+ */
+HWLOC_DECLSPEC int
+hwloc_distances_get_by_depth(hwloc_topology_t topology, int depth,
+                             unsigned *nr, struct hwloc_distances_s **distances,
+                             unsigned long kind, unsigned long flags);
+
+/** \brief Retrieve distance matrices for objects of a specific type.
+ *
+ * Identical to hwloc_distances_get() with the additional \p type filter.
+ */
+HWLOC_DECLSPEC int
+hwloc_distances_get_by_type(hwloc_topology_t topology, hwloc_obj_type_t type,
+                            unsigned *nr, struct hwloc_distances_s **distances,
+                            unsigned long kind, unsigned long flags);
+
+/** \brief Retrieve a distance matrix with the given name.
+ *
+ * Usually only one distances structure may match a given name.
+ *
+ * The name of the most common structure is "NUMALatency".
+ * Others include "XGMIBandwidth" and "NVLinkBandwidth".
+ */
+HWLOC_DECLSPEC int
+hwloc_distances_get_by_name(hwloc_topology_t topology, const char *name,
+                            unsigned *nr, struct hwloc_distances_s **distances,
+                            unsigned long flags);
+
+/** \brief Get a description of what a distances structure contains.
+ *
+ * For instance "NUMALatency" for hardware-provided NUMA distances (ACPI SLIT),
+ * or NULL if unknown.
+ */
+HWLOC_DECLSPEC const char *
+hwloc_distances_get_name(hwloc_topology_t topology, struct hwloc_distances_s *distances);
+
+/** \brief Release a distance matrix structure previously returned by hwloc_distances_get().
+ *
+ * \note This function is not required if the structure is removed with hwloc_distances_release_remove().
+ */
+HWLOC_DECLSPEC void
+hwloc_distances_release(hwloc_topology_t topology, struct hwloc_distances_s *distances);
+
+/** \brief Transformations of distances structures. */
+enum hwloc_distances_transform_e {
+  /** \brief Remove \c NULL objects from the distances structure.
+   *
+   * Every object that was replaced with \c NULL in the \p objs array
+   * is removed and the \p values array is updated accordingly.
+   *
+   * At least \c 2 objects must remain, otherwise hwloc_distances_transform()
+   * will return \c -1 with \p errno set to \c EINVAL.
+   *
+   * \p kind will be updated with or without ::HWLOC_DISTANCES_KIND_HETEROGENEOUS_TYPES
+   * according to the remaining objects.
+   *
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_TRANSFORM_REMOVE_NULL = 0,
+
+  /** \brief Replace bandwidth values with a number of links.
+   *
+   * Usually all values will be either \c 0 (no link) or \c 1 (one link).
+   * However some matrices could get larger values if some pairs of
+   * peers are connected by different numbers of links.
+   *
+   * Values on the diagonal are set to \c 0.
+   *
+   * This transformation only applies to bandwidth matrices.
+   *
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_TRANSFORM_LINKS = 1
+};
+
+/** \brief Apply a transformation to a distances structure.
+ *
+ * Modify a distances structure that was previously obtained with
+ * hwloc_distances_get() or one of its variants.
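+ *
+ * For instance (a minimal sketch based only on the functions declared in
+ * this file; \c dist is a structure previously returned by
+ * hwloc_distances_get() and error handling is omitted), one may drop the
+ * object at index \c i and renumber the remaining values:
+ * \code
+ * dist->objs[i] = NULL;
+ * hwloc_distances_transform(topology, dist, HWLOC_DISTANCES_TRANSFORM_REMOVE_NULL, NULL, 0);
+ * \endcode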
+ *
+ * This modifies the local copy of the distances structure but does
+ * not modify the distances information stored inside the topology
+ * (retrieved by another call to hwloc_distances_get() or exported to XML).
+ * To do so, one should add a new distances structure with the same
+ * name, kind, objects and values (see \ref hwlocality_distances_add)
+ * and then remove this old one with hwloc_distances_release_remove().
+ *
+ * \p transform must be one of the transformations listed
+ * in ::hwloc_distances_transform_e.
+ *
+ * These transformations may modify the contents of the \p objs or \p values arrays.
+ *
+ * \p transform_attr must be \c NULL for now.
+ *
+ * \p flags must be \c 0 for now.
+ */
+HWLOC_DECLSPEC int hwloc_distances_transform(hwloc_topology_t topology, struct hwloc_distances_s *distances,
+                                             enum hwloc_distances_transform_e transform,
+                                             void *transform_attr,
+                                             unsigned long flags);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_distances_consult Helpers for consulting distance matrices
+ * @{
+ */
+
+/** \brief Find the index of an object in a distances structure.
+ *
+ * \return -1 if object \p obj is not involved in structure \p distances.
+ */
+static __hwloc_inline int
+hwloc_distances_obj_index(struct hwloc_distances_s *distances, hwloc_obj_t obj)
+{
+  unsigned i;
+  for(i=0; i<distances->nbobjs; i++)
+    if (distances->objs[i] == obj)
+      return (int)i;
+  return -1;
+}
+
+/** \brief Find the values between two objects in a distances structure.
+ *
+ * The distance from \p obj1 to \p obj2 is stored in the value pointed to by
+ * \p value1to2 and reciprocally.
+ *
+ * \return -1 if object \p obj1 or \p obj2 is not involved in structure \p distances.
+ */
+static __hwloc_inline int
+hwloc_distances_obj_pair_values(struct hwloc_distances_s *distances,
+                                hwloc_obj_t obj1, hwloc_obj_t obj2,
+                                hwloc_uint64_t *value1to2, hwloc_uint64_t *value2to1)
+{
+  int i1 = hwloc_distances_obj_index(distances, obj1);
+  int i2 = hwloc_distances_obj_index(distances, obj2);
+  if (i1 < 0 || i2 < 0)
+    return -1;
+  *value1to2 = distances->values[i1 * distances->nbobjs + i2];
+  *value2to1 = distances->values[i2 * distances->nbobjs + i1];
+  return 0;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_distances_add Add distances between objects
+ *
+ * The usual way to add distances is:
+ * \code
+ * hwloc_distances_add_handle_t handle;
+ * int err = -1;
+ * handle = hwloc_distances_add_create(topology, "name", kind, 0);
+ * if (handle) {
+ *   err = hwloc_distances_add_values(topology, handle, nbobjs, objs, values, 0);
+ *   if (!err)
+ *     err = hwloc_distances_add_commit(topology, handle, flags);
+ * }
+ * \endcode
+ * If \p err is \c 0 at the end, then addition was successful.
+ *
+ * @{
+ */
+
+/** \brief Handle to a new distances structure during its addition to the topology. */
+typedef void * hwloc_distances_add_handle_t;
+
+/** \brief Create a new empty distances structure.
+ *
+ * Create an empty distances structure
+ * to be filled with hwloc_distances_add_values()
+ * and then committed with hwloc_distances_add_commit().
+ *
+ * Parameter \p name is optional, it may be \c NULL.
+ * Otherwise, it will be copied internally and may later be freed by the caller.
+ *
+ * \p kind specifies the kind of distance as an OR'ed set of ::hwloc_distances_kind_e.
+ * Kind ::HWLOC_DISTANCES_KIND_HETEROGENEOUS_TYPES will be automatically set
+ * according to objects having different types in hwloc_distances_add_values().
+ *
+ * \p flags must be \c 0 for now.
+ *
+ * \return A hwloc_distances_add_handle_t that should then be passed
+ * to hwloc_distances_add_values() and hwloc_distances_add_commit().
+ *
+ * \return \c NULL on error.
+ */
+HWLOC_DECLSPEC hwloc_distances_add_handle_t
+hwloc_distances_add_create(hwloc_topology_t topology,
+                           const char *name, unsigned long kind,
+                           unsigned long flags);
+
+/** \brief Specify the objects and values in a new empty distances structure.
+ *
+ * Specify the objects and values for a new distances structure
+ * that was returned as a handle by hwloc_distances_add_create().
+ * The structure must then be committed with hwloc_distances_add_commit().
+ *
+ * The number of objects is \p nbobjs and the array of objects is \p objs.
+ * Distance values are stored as a one-dimension array in \p values.
+ * The distance from object i to object j is in slot i*nbobjs+j.
+ *
+ * \p nbobjs must be at least 2.
+ *
+ * Arrays \p objs and \p values will be copied internally,
+ * they may later be freed by the caller.
+ *
+ * On error, the temporary distances structure and its content are destroyed.
+ *
+ * \p flags must be \c 0 for now.
+ *
+ * \return \c 0 on success.
+ * \return \c -1 on error.
+ */
+HWLOC_DECLSPEC int hwloc_distances_add_values(hwloc_topology_t topology,
+                                              hwloc_distances_add_handle_t handle,
+                                              unsigned nbobjs, hwloc_obj_t *objs,
+                                              hwloc_uint64_t *values,
+                                              unsigned long flags);
+
+/** \brief Flags for adding a new distances structure to a topology. */
+enum hwloc_distances_add_flag_e {
+  /** \brief Try to group objects based on the newly provided distance information.
+   * This is ignored for distances between objects of different types.
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_ADD_FLAG_GROUP = (1UL<<0),
+  /** \brief If grouping, consider the distance values as inaccurate and relax the
+   * comparisons during the grouping algorithms. The actual accuracy may be modified
+   * through the HWLOC_GROUPING_ACCURACY environment variable (see \ref envvar).
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_ADD_FLAG_GROUP_INACCURATE = (1UL<<1)
+};
+
+/** \brief Commit a new distances structure.
+ *
+ * This function finalizes the distances structure and inserts it in the topology.
+ *
+ * Parameter \p handle was previously returned by hwloc_distances_add_create().
+ * Then objects and values were specified with hwloc_distances_add_values().
+ *
+ * \p flags configures the behavior of the function using an optional OR'ed set of
+ * ::hwloc_distances_add_flag_e.
+ * It may be used to request the grouping of existing objects based on distances.
+ *
+ * On error, the temporary distances structure and its content are destroyed.
+ *
+ * \return \c 0 on success.
+ * \return \c -1 on error.
+ */
+HWLOC_DECLSPEC int hwloc_distances_add_commit(hwloc_topology_t topology,
+                                              hwloc_distances_add_handle_t handle,
+                                              unsigned long flags);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_distances_remove Remove distances between objects
+ * @{
+ */
+
+/** \brief Remove all distance matrices from a topology.
+ *
+ * Remove all distance matrices, either provided by the user or
+ * gathered through the OS.
+ *
+ * If these distances were used to group objects, the additional
+ * Group objects are not removed from the topology.
+ */
+HWLOC_DECLSPEC int hwloc_distances_remove(hwloc_topology_t topology);
+
+/** \brief Remove distance matrices for objects at a specific depth in the topology.
+ *
+ * Identical to hwloc_distances_remove() but only applies to one level of the topology.
+ */
+HWLOC_DECLSPEC int hwloc_distances_remove_by_depth(hwloc_topology_t topology, int depth);
+
+/** \brief Remove distance matrices for objects of a specific type in the topology.
+ *
+ * Identical to hwloc_distances_remove() but only applies to one level of the topology.
+ */
+static __hwloc_inline int
+hwloc_distances_remove_by_type(hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return 0;
+  return hwloc_distances_remove_by_depth(topology, depth);
+}
+
+/** \brief Release and remove the given distance matrix from the topology.
+ *
+ * This function includes a call to hwloc_distances_release().
+ */
+HWLOC_DECLSPEC int hwloc_distances_release_remove(hwloc_topology_t topology, struct hwloc_distances_s *distances);
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_DISTANCES_H */
diff --git a/deps/hwloc/include/hwloc/export.h b/deps/hwloc/include/hwloc/export.h
new file mode 100644
index 000000000..b178b77e5
--- /dev/null
+++ b/deps/hwloc/include/hwloc/export.h
@@ -0,0 +1,278 @@
+/*
+ * Copyright © 2009-2018 Inria. All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Exporting Topologies to XML or to Synthetic strings.
+ */
+
+#ifndef HWLOC_EXPORT_H
+#define HWLOC_EXPORT_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#elif 0
+}
+#endif
+
+
+/** \defgroup hwlocality_xmlexport Exporting Topologies to XML
+ * @{
+ */
+
+/** \brief Flags for exporting XML topologies.
+ *
+ * Flags to be given as an OR'ed set to hwloc_topology_export_xml().
+ */
+enum hwloc_topology_export_xml_flags_e {
+  /** \brief Export XML that is loadable by hwloc v1.x.
+   * However, the export may miss some details about the topology.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1 = (1UL<<0)
+};
+
+/** \brief Export the topology into an XML file.
+ *
+ * This file may be loaded later through hwloc_topology_set_xml().
+ *
+ * By default, the latest export format is used, which means older hwloc
+ * releases (e.g. v1.x) will not be able to import it.
+ * Exporting to v1.x specific XML format is possible using flag
+ * ::HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1 but it may miss some details
+ * about the topology.
+ * If there is any chance that the exported file may ever be imported
+ * back by a process using hwloc 1.x, one should consider detecting
+ * it at runtime and using the corresponding export format.
+ *
+ * \p flags is an OR'ed set of ::hwloc_topology_export_xml_flags_e.
+ *
+ * \return -1 if a failure occurred.
+ *
+ * \note See also hwloc_topology_set_userdata_export_callback()
+ * for exporting application-specific object userdata.
+ *
+ * \note The topology-specific userdata pointer is ignored when exporting to XML.
+ *
+ * \note Only printable characters may be exported to XML string attributes.
+ * Any other character, especially any non-ASCII character, will be silently
+ * dropped.
+ *
+ * \note If \p xmlpath is "-", the XML output is sent to the standard output.
+ */
+HWLOC_DECLSPEC int hwloc_topology_export_xml(hwloc_topology_t topology, const char *xmlpath, unsigned long flags);
+
+/** \brief Export the topology into a newly-allocated XML memory buffer.
+ *
+ * \p xmlbuffer is allocated by the callee and should be freed with
+ * hwloc_free_xmlbuffer() later by the caller.
+ *
+ * This memory buffer may be loaded later through hwloc_topology_set_xmlbuffer().
+ *
+ * By default, the latest export format is used, which means older hwloc
+ * releases (e.g. v1.x) will not be able to import it.
+ * Exporting to v1.x specific XML format is possible using flag
+ * ::HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1 but it may miss some details
+ * about the topology.
+ * If there is any chance that the exported buffer may ever be imported
+ * back by a process using hwloc 1.x, one should consider detecting
+ * it at runtime and using the corresponding export format.
+ *
+ * The returned buffer ends with a \0 that is included in the returned
+ * length.
+ *
+ * \p flags is an OR'ed set of ::hwloc_topology_export_xml_flags_e.
+ *
+ * \return -1 if a failure occurred.
+ *
+ * \note See also hwloc_topology_set_userdata_export_callback()
+ * for exporting application-specific object userdata.
+ *
+ * \note The topology-specific userdata pointer is ignored when exporting to XML.
+ *
+ * \note Only printable characters may be exported to XML string attributes.
+ * Any other character, especially any non-ASCII character, will be silently
+ * dropped.
+ */
+HWLOC_DECLSPEC int hwloc_topology_export_xmlbuffer(hwloc_topology_t topology, char **xmlbuffer, int *buflen, unsigned long flags);
+
+/** \brief Free a buffer allocated by hwloc_topology_export_xmlbuffer() */
+HWLOC_DECLSPEC void hwloc_free_xmlbuffer(hwloc_topology_t topology, char *xmlbuffer);
+
+/** \brief Set the application-specific callback for exporting object userdata
+ *
+ * The object userdata pointer is not exported to XML by default because hwloc
+ * does not know what it contains.
+ *
+ * This function lets applications set \p export_cb to a callback function
+ * that converts this opaque userdata into an exportable string.
+ *
+ * \p export_cb is invoked during XML export for each object whose
+ * \p userdata pointer is not \c NULL.
+ * The callback should use hwloc_export_obj_userdata() or
+ * hwloc_export_obj_userdata_base64() to actually export
+ * something to XML (possibly multiple times per object).
+ *
+ * \p export_cb may be set to \c NULL if userdata should not be exported to XML.
+ *
+ * \note The topology-specific userdata pointer is ignored when exporting to XML.
+ */
+HWLOC_DECLSPEC void hwloc_topology_set_userdata_export_callback(hwloc_topology_t topology,
+                                                                void (*export_cb)(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj));
+
+/** \brief Export some object userdata to XML
+ *
+ * This function may only be called from within the export() callback passed
+ * to hwloc_topology_set_userdata_export_callback().
+ * It may be invoked one or multiple times to export some userdata to XML.
+ * The \p buffer content of length \p length is stored with optional name
+ * \p name.
+ *
+ * When importing this XML file, the import() callback (if set) will be
+ * called exactly as many times as hwloc_export_obj_userdata() was called
+ * during export(). It will receive the corresponding \p name, \p buffer
+ * and \p length arguments.
+ *
+ * \p reserved, \p topology and \p obj must be the first three parameters
+ * that were given to the export callback.
+ *
+ * Only printable characters may be exported to XML string attributes.
+ * If a non-printable character is passed in \p name or \p buffer,
+ * the function returns -1 with errno set to EINVAL.
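+ *
+ * As an illustration only (the userdata layout is application-defined and
+ * \c my_export_cb is a hypothetical name), an export callback registered
+ * with hwloc_topology_set_userdata_export_callback() might look like:
+ * \code
+ * static void my_export_cb(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj)
+ * {
+ *   const char *s = (const char *) obj->userdata; // assuming a string was stored there
+ *   hwloc_export_obj_userdata(reserved, topology, obj, "mykey", s, strlen(s));
+ * }
+ * \endcode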
+ * + * If exporting binary data, the application should first encode into + * printable characters only (or use hwloc_export_obj_userdata_base64()). + * It should also take care of portability issues if the export may + * be reimported on a different architecture. + */ +HWLOC_DECLSPEC int hwloc_export_obj_userdata(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj, const char *name, const void *buffer, size_t length); + +/** \brief Encode and export some object userdata to XML + * + * This function is similar to hwloc_export_obj_userdata() but it encodes + * the input buffer into printable characters before exporting. + * On import, decoding is automatically performed before the data is given + * to the import() callback if any. + * + * This function may only be called from within the export() callback passed + * to hwloc_topology_set_userdata_export_callback(). + * + * The function does not take care of portability issues if the export + * may be reimported on a different architecture. + */ +HWLOC_DECLSPEC int hwloc_export_obj_userdata_base64(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj, const char *name, const void *buffer, size_t length); + +/** \brief Set the application-specific callback for importing userdata + * + * On XML import, userdata is ignored by default because hwloc does not know + * how to store it in memory. + * + * This function lets applications set \p import_cb to a callback function + * that will get the XML-stored userdata and store it in the object as expected + * by the application. + * + * \p import_cb is called during hwloc_topology_load() as many times as + * hwloc_export_obj_userdata() was called during export. The topology + * is not entirely setup yet. Object attributes are ready to consult, + * but links between objects are not. + * + * \p import_cb may be \c NULL if userdata should be ignored during import. + * + * \note \p buffer contains \p length characters followed by a null byte ('\0'). + * + * \note This function should be called before hwloc_topology_load(). + * + * \note The topology-specific userdata pointer is ignored when importing from XML. + */ +HWLOC_DECLSPEC void hwloc_topology_set_userdata_import_callback(hwloc_topology_t topology, + void (*import_cb)(hwloc_topology_t topology, hwloc_obj_t obj, const char *name, const void *buffer, size_t length)); + +/** @} */ + + +/** \defgroup hwlocality_syntheticexport Exporting Topologies to Synthetic + * @{ + */ + +/** \brief Flags for exporting synthetic topologies. + * + * Flags to be given as a OR'ed set to hwloc_topology_export_synthetic(). + */ +enum hwloc_topology_export_synthetic_flags_e { + /** \brief Export extended types such as L2dcache as basic types such as Cache. + * + * This is required if loading the synthetic description with hwloc < 1.9. + * \hideinitializer + */ + HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES = (1UL<<0), + + /** \brief Do not export level attributes. + * + * Ignore level attributes such as memory/cache sizes or PU indexes. + * This is required if loading the synthetic description with hwloc < 1.10. + * \hideinitializer + */ + HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS = (1UL<<1), + + /** \brief Export the memory hierarchy as expected in hwloc 1.x. + * + * Instead of attaching memory children to levels, export single NUMA node child + * as normal intermediate levels, when possible. + * This is required if loading the synthetic description with hwloc 1.x. + * However this may fail if some objects have multiple local NUMA nodes. 
+ * \hideinitializer + */ + HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_V1 = (1UL<<2), + + /** \brief Do not export memory information. + * + * Only export the actual hierarchy of normal CPU-side objects and ignore + * where memory is attached. + * This is useful for when the hierarchy of CPUs is what really matters, + * but it behaves as if there was a single machine-wide NUMA node. + * \hideinitializer + */ + HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_IGNORE_MEMORY = (1UL<<3) +}; + +/** \brief Export the topology as a synthetic string. + * + * At most \p buflen characters will be written in \p buffer, + * including the terminating \0. + * + * This exported string may be given back to hwloc_topology_set_synthetic(). + * + * \p flags is a OR'ed set of ::hwloc_topology_export_synthetic_flags_e. + * + * \return The number of characters that were written, + * not including the terminating \0. + * + * \return -1 if the topology could not be exported, + * for instance if it is not symmetric. + * + * \note I/O and Misc children are ignored, the synthetic string only + * describes normal children. + * + * \note A 1024-byte buffer should be large enough for exporting + * topologies in the vast majority of cases. + */ + HWLOC_DECLSPEC int hwloc_topology_export_synthetic(hwloc_topology_t topology, char *buffer, size_t buflen, unsigned long flags); + +/** @} */ + + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + + +#endif /* HWLOC_EXPORT_H */ diff --git a/deps/hwloc/include/hwloc/helper.h b/deps/hwloc/include/hwloc/helper.h new file mode 100644 index 000000000..f918d8163 --- /dev/null +++ b/deps/hwloc/include/hwloc/helper.h @@ -0,0 +1,1231 @@ +/* + * Copyright © 2009 CNRS + * Copyright © 2009-2021 Inria. All rights reserved. + * Copyright © 2009-2012 Université Bordeaux + * Copyright © 2009-2010 Cisco Systems, Inc. All rights reserved. + * See COPYING in top-level directory. + */ + +/** \file + * \brief High-level hwloc traversal helpers. + */ + +#ifndef HWLOC_HELPER_H +#define HWLOC_HELPER_H + +#ifndef HWLOC_H +#error Please include the main hwloc.h instead +#endif + +#include <stdlib.h> +#include <errno.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + +/** \defgroup hwlocality_helper_find_inside Finding Objects inside a CPU set + * @{ + */ + +/** \brief Get the first largest object included in the given cpuset \p set. + * + * \return the first object that is included in \p set and whose parent is not. + * + * This is convenient for iterating over all largest objects within a CPU set + * by doing a loop getting the first largest object and clearing its CPU set + * from the remaining CPU set. + */ +static __hwloc_inline hwloc_obj_t +hwloc_get_first_largest_obj_inside_cpuset(hwloc_topology_t topology, hwloc_const_cpuset_t set) +{ + hwloc_obj_t obj = hwloc_get_root_obj(topology); + if (!hwloc_bitmap_intersects(obj->cpuset, set)) + return NULL; + while (!hwloc_bitmap_isincluded(obj->cpuset, set)) { + /* while the object intersects without being included, look at its children */ + hwloc_obj_t child = obj->first_child; + while (child) { + if (hwloc_bitmap_intersects(child->cpuset, set)) + break; + child = child->next_sibling; + } + if (!child) + /* no child intersects, return their father */ + return obj; + /* found one intersecting child, look at its children */ + obj = child; + } + /* obj is included, return it */ + return obj; +} + +/** \brief Get the set of largest objects covering exactly a given cpuset \p set + * + * \return the number of objects returned in \p objs. 
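+ *
+ * The iteration pattern described above for
+ * hwloc_get_first_largest_obj_inside_cpuset() may be sketched as follows
+ * (a minimal example; hwloc_bitmap_dup(), hwloc_bitmap_andnot() and
+ * hwloc_bitmap_free() are standard hwloc bitmap helpers, and error
+ * handling is omitted):
+ * \code
+ * hwloc_bitmap_t remaining = hwloc_bitmap_dup(set);
+ * hwloc_obj_t obj;
+ * while ((obj = hwloc_get_first_largest_obj_inside_cpuset(topology, remaining)) != NULL) {
+ *   // ... use obj ...
+ *   hwloc_bitmap_andnot(remaining, remaining, obj->cpuset);
+ * }
+ * hwloc_bitmap_free(remaining);
+ * \endcode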
+ */
+HWLOC_DECLSPEC int hwloc_get_largest_objs_inside_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+                                                         hwloc_obj_t * __hwloc_restrict objs, int max);
+
+/** \brief Return the next object at depth \p depth included in CPU set \p set.
+ *
+ * If \p prev is \c NULL, return the first object at depth \p depth
+ * included in \p set. The next invocation should pass the previous
+ * return value in \p prev so as to obtain the next object in \p set.
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+                                           int depth, hwloc_obj_t prev)
+{
+  hwloc_obj_t next = hwloc_get_next_obj_by_depth(topology, depth, prev);
+  if (!next)
+    return NULL;
+  while (next && (hwloc_bitmap_iszero(next->cpuset) || !hwloc_bitmap_isincluded(next->cpuset, set)))
+    next = next->next_cousin;
+  return next;
+}
+
+/** \brief Return the next object of type \p type included in CPU set \p set.
+ *
+ * If there are multiple or no depths for the given type, return \c NULL
+ * and let the caller fall back to
+ * hwloc_get_next_obj_inside_cpuset_by_depth().
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+                                          hwloc_obj_type_t type, hwloc_obj_t prev)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return NULL;
+  return hwloc_get_next_obj_inside_cpuset_by_depth(topology, set, depth, prev);
+}
+
+/** \brief Return the (logically) \p idx -th object at depth \p depth included in CPU set \p set.
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+                                      int depth, unsigned idx) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+                                      int depth, unsigned idx)
+{
+  hwloc_obj_t obj = hwloc_get_obj_by_depth (topology, depth, 0);
+  unsigned count = 0;
+  if (!obj)
+    return NULL;
+  while (obj) {
+    if (!hwloc_bitmap_iszero(obj->cpuset) && hwloc_bitmap_isincluded(obj->cpuset, set)) {
+      if (count == idx)
+        return obj;
+      count++;
+    }
+    obj = obj->next_cousin;
+  }
+  return NULL;
+}
+
+/** \brief Return the \p idx -th object of type \p type included in CPU set \p set.
+ *
+ * If there are multiple or no depths for the given type, return \c NULL
+ * and let the caller fall back to
+ * hwloc_get_obj_inside_cpuset_by_depth().
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O or Misc objects).
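+ *
+ * For instance (a minimal sketch), the second Core inside \p set, if any:
+ * \code
+ * hwloc_obj_t core = hwloc_get_obj_inside_cpuset_by_type(topology, set, HWLOC_OBJ_CORE, 1);
+ * \endcode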
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+                                     hwloc_obj_type_t type, unsigned idx) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+                                     hwloc_obj_type_t type, unsigned idx)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return NULL;
+  return hwloc_get_obj_inside_cpuset_by_depth(topology, set, depth, idx);
}

+/** \brief Return the number of objects at depth \p depth included in CPU set \p set.
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline unsigned
+hwloc_get_nbobjs_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+                                         int depth) __hwloc_attribute_pure;
+static __hwloc_inline unsigned
+hwloc_get_nbobjs_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+                                         int depth)
+{
+  hwloc_obj_t obj = hwloc_get_obj_by_depth (topology, depth, 0);
+  unsigned count = 0;
+  if (!obj)
+    return 0;
+  while (obj) {
+    if (!hwloc_bitmap_iszero(obj->cpuset) && hwloc_bitmap_isincluded(obj->cpuset, set))
+      count++;
+    obj = obj->next_cousin;
+  }
+  return count;
+}
+
+/** \brief Return the number of objects of type \p type included in CPU set \p set.
+ *
+ * If no object for that type exists inside CPU set \p set, 0 is
+ * returned. If there are several levels with objects of that type
+ * inside CPU set \p set, -1 is returned.
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline int
+hwloc_get_nbobjs_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+                                        hwloc_obj_type_t type) __hwloc_attribute_pure;
+static __hwloc_inline int
+hwloc_get_nbobjs_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+                                        hwloc_obj_type_t type)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+    return 0;
+  if (depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return -1; /* FIXME: aggregate nbobjs from different levels? */
+  return (int) hwloc_get_nbobjs_inside_cpuset_by_depth(topology, set, depth);
+}
+
+/** \brief Return the logical index among the objects included in CPU set \p set.
+ *
+ * Consult all objects in the same level as \p obj and inside CPU set \p set
+ * in the logical order, and return the index of \p obj within them.
+ * If \p set covers the entire topology, this is the logical index of \p obj.
+ * Otherwise, this is similar to a logical index within the part of the topology
+ * defined by CPU set \p set.
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if \p obj does not have a CPU set (I/O objects).
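+ *
+ * For instance (a sketch, where \c pu is a hypothetical PU object obtained
+ * elsewhere), the rank of that PU within the part of the topology covered
+ * by \p set:
+ * \code
+ * int rank = hwloc_get_obj_index_inside_cpuset(topology, set, pu);
+ * \endcode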
+ */
+static __hwloc_inline int
+hwloc_get_obj_index_inside_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+                                   hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline int
+hwloc_get_obj_index_inside_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+                                   hwloc_obj_t obj)
+{
+  int idx = 0;
+  if (!hwloc_bitmap_isincluded(obj->cpuset, set))
+    return -1;
+  /* count how many objects are inside the cpuset on the way from us to the beginning of the level */
+  while ((obj = obj->prev_cousin) != NULL)
+    if (!hwloc_bitmap_iszero(obj->cpuset) && hwloc_bitmap_isincluded(obj->cpuset, set))
+      idx++;
+  return idx;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_find_covering Finding Objects covering at least CPU set
+ * @{
+ */
+
+/** \brief Get the child covering at least CPU set \p set.
+ *
+ * \return \c NULL if no child matches or if \p set is empty.
+ *
+ * \note This function cannot work if parent does not have a CPU set (I/O or Misc objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_child_covering_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+                                 hwloc_obj_t parent) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_child_covering_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+                                 hwloc_obj_t parent)
+{
+  hwloc_obj_t child;
+  if (hwloc_bitmap_iszero(set))
+    return NULL;
+  child = parent->first_child;
+  while (child) {
+    if (child->cpuset && hwloc_bitmap_isincluded(set, child->cpuset))
+      return child;
+    child = child->next_sibling;
+  }
+  return NULL;
+}
+
+/** \brief Get the lowest object covering at least CPU set \p set
+ *
+ * \return \c NULL if no object matches or if \p set is empty.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set)
+{
+  struct hwloc_obj *current = hwloc_get_root_obj(topology);
+  if (hwloc_bitmap_iszero(set) || !hwloc_bitmap_isincluded(set, current->cpuset))
+    return NULL;
+  while (1) {
+    hwloc_obj_t child = hwloc_get_child_covering_cpuset(topology, set, current);
+    if (!child)
+      return current;
+    current = child;
+  }
+}
+
+/** \brief Iterate through same-depth objects covering at least CPU set \p set
+ *
+ * If object \p prev is \c NULL, return the first object at depth \p depth
+ * covering at least part of CPU set \p set. The next invocation should
+ * pass the previous return value in \p prev so as to obtain the next
+ * object covering at least another part of \p set.
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_covering_cpuset_by_depth(hwloc_topology_t topology, hwloc_const_cpuset_t set,
+                                            int depth, hwloc_obj_t prev)
+{
+  hwloc_obj_t next = hwloc_get_next_obj_by_depth(topology, depth, prev);
+  if (!next)
+    return NULL;
+  while (next && !hwloc_bitmap_intersects(set, next->cpuset))
+    next = next->next_cousin;
+  return next;
+}
+
+/** \brief Iterate through same-type objects covering at least CPU set \p set
+ *
+ * If object \p prev is \c NULL, return the first object of type \p type
+ * covering at least part of CPU set \p set.
+ * The next invocation should pass the previous return value in \p prev
+ * so as to obtain the next object of type \p type covering at least
+ * another part of \p set.
+ *
+ * If there are no or multiple depths for type \p type, \c NULL is returned.
+ * The caller may fall back to hwloc_get_next_obj_covering_cpuset_by_depth()
+ * for each depth.
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_covering_cpuset_by_type(hwloc_topology_t topology, hwloc_const_cpuset_t set,
+                                           hwloc_obj_type_t type, hwloc_obj_t prev)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return NULL;
+  return hwloc_get_next_obj_covering_cpuset_by_depth(topology, set, depth, prev);
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_ancestors Looking at Ancestor and Child Objects
+ * @{
+ *
+ * Be sure to see the figure in \ref termsanddefs that shows a
+ * complete topology tree, including depths, child/sibling/cousin
+ * relationships, and an example of an asymmetric topology where one
+ * package has fewer caches than its peers.
+ */
+
+/** \brief Returns the ancestor object of \p obj at depth \p depth.
+ *
+ * \note \p depth should not be the depth of PU or NUMA objects
+ * since they are ancestors of no objects (except Misc or I/O).
+ * This function rather expects an intermediate level depth,
+ * such as the depth of Packages, Cores, or Caches.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_depth (hwloc_topology_t topology __hwloc_attribute_unused, int depth, hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_depth (hwloc_topology_t topology __hwloc_attribute_unused, int depth, hwloc_obj_t obj)
+{
+  hwloc_obj_t ancestor = obj;
+  if (obj->depth < depth)
+    return NULL;
+  while (ancestor && ancestor->depth > depth)
+    ancestor = ancestor->parent;
+  return ancestor;
+}
+
+/** \brief Returns the ancestor object of \p obj with type \p type.
+ *
+ * \note \p type should not be ::HWLOC_OBJ_PU or ::HWLOC_OBJ_NUMANODE
+ * since these objects are ancestors of no objects (except Misc or I/O).
+ * This function rather expects an intermediate object type,
+ * such as ::HWLOC_OBJ_PACKAGE, ::HWLOC_OBJ_CORE, etc.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_type (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_type_t type, hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_type (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_type_t type, hwloc_obj_t obj)
+{
+  hwloc_obj_t ancestor = obj->parent;
+  while (ancestor && ancestor->type != type)
+    ancestor = ancestor->parent;
+  return ancestor;
+}
+
+/** \brief Returns the common parent object to objects \p obj1 and \p obj2 */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_common_ancestor_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj1, hwloc_obj_t obj2) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_common_ancestor_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+  /* the loop isn't so easy since intermediate ancestors may have
+   * different depth, causing us to alternate between using obj1->parent
+   * and obj2->parent.
+   * Also, even if at some point we find ancestors
+   * of the same depth, their ancestors may have different depth again.
+   */
+  while (obj1 != obj2) {
+    while (obj1->depth > obj2->depth)
+      obj1 = obj1->parent;
+    while (obj2->depth > obj1->depth)
+      obj2 = obj2->parent;
+    if (obj1 != obj2 && obj1->depth == obj2->depth) {
+      obj1 = obj1->parent;
+      obj2 = obj2->parent;
+    }
+  }
+  return obj1;
+}
+
+/** \brief Returns true if \p obj is inside the subtree beginning with ancestor object \p subtree_root.
+ *
+ * \note This function cannot work if \p obj and \p subtree_root objects do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline int
+hwloc_obj_is_in_subtree (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj, hwloc_obj_t subtree_root) __hwloc_attribute_pure;
+static __hwloc_inline int
+hwloc_obj_is_in_subtree (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj, hwloc_obj_t subtree_root)
+{
+  return obj->cpuset && subtree_root->cpuset && hwloc_bitmap_isincluded(obj->cpuset, subtree_root->cpuset);
+}
+
+/** \brief Return the next child.
+ *
+ * Return the next child among the normal children list,
+ * then among the memory children list, then among the I/O
+ * children list, then among the Misc children list.
+ *
+ * If \p prev is \c NULL, return the first child.
+ *
+ * Return \c NULL when there is no next child.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_child (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t parent, hwloc_obj_t prev)
+{
+  hwloc_obj_t obj;
+  int state = 0;
+  if (prev) {
+    if (prev->type == HWLOC_OBJ_MISC)
+      state = 3;
+    else if (prev->type == HWLOC_OBJ_BRIDGE || prev->type == HWLOC_OBJ_PCI_DEVICE || prev->type == HWLOC_OBJ_OS_DEVICE)
+      state = 2;
+    else if (prev->type == HWLOC_OBJ_NUMANODE)
+      state = 1;
+    obj = prev->next_sibling;
+  } else {
+    obj = parent->first_child;
+  }
+  if (!obj && state == 0) {
+    obj = parent->memory_first_child;
+    state = 1;
+  }
+  if (!obj && state == 1) {
+    obj = parent->io_first_child;
+    state = 2;
+  }
+  if (!obj && state == 2) {
+    obj = parent->misc_first_child;
+    state = 3;
+  }
+  return obj;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_types Kinds of object Type
+ * @{
+ *
+ * Each object type is
+ * either Normal (i.e. hwloc_obj_type_is_normal() returns 1),
+ * or Memory (i.e. hwloc_obj_type_is_memory() returns 1)
+ * or I/O (i.e. hwloc_obj_type_is_io() returns 1)
+ * or Misc (i.e. equal to ::HWLOC_OBJ_MISC).
+ * It cannot be of more than one of these kinds.
+ */
+
+/** \brief Check whether an object type is Normal.
+ *
+ * Normal objects are objects of the main CPU hierarchy
+ * (Machine, Package, Core, PU, CPU caches, etc.),
+ * but they are not NUMA nodes, I/O devices or Misc objects.
+ *
+ * They are attached to parent as Normal children,
+ * not as Memory, I/O or Misc children.
+ *
+ * \return 1 if an object of type \p type is a Normal object, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_normal(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is I/O.
+ *
+ * I/O objects are objects attached to their parents
+ * in the I/O children list.
+ * This currently includes Bridges, PCI and OS devices.
+ *
+ * \return 1 if an object of type \p type is an I/O object, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_io(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is Memory.
+ *
+ * Memory objects are objects attached to their parents
+ * in the Memory children list.
+ * This currently includes NUMA nodes and Memory-side caches.
+ *
+ * \return 1 if an object of type \p type is a Memory object, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_memory(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is a CPU Cache (Data, Unified or Instruction).
+ *
+ * Memory-side caches are not CPU caches.
+ *
+ * \return 1 if an object of type \p type is a Cache, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_cache(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is a CPU Data or Unified Cache.
+ *
+ * Memory-side caches are not CPU caches.
+ *
+ * \return 1 if an object of type \p type is a CPU Data or Unified Cache, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_dcache(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is a CPU Instruction Cache.
+ *
+ * Memory-side caches are not CPU caches.
+ *
+ * \return 1 if an object of type \p type is a CPU Instruction Cache, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_icache(hwloc_obj_type_t type);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_find_cache Looking at Cache Objects
+ * @{
+ */
+
+/** \brief Find the depth of cache objects matching cache level and type.
+ *
+ * Return the depth of the topology level that contains cache objects
+ * whose attributes match \p cachelevel and \p cachetype.
+ *
+ * This function is identical to calling hwloc_get_type_depth() with the
+ * corresponding type such as ::HWLOC_OBJ_L1ICACHE, except that it may
+ * also return a Unified cache when looking for an instruction cache.
+ *
+ * If no cache level matches, ::HWLOC_TYPE_DEPTH_UNKNOWN is returned.
+ *
+ * If \p cachetype is ::HWLOC_OBJ_CACHE_UNIFIED, the depth of the
+ * unique matching unified cache level is returned.
+ *
+ * If \p cachetype is ::HWLOC_OBJ_CACHE_DATA or ::HWLOC_OBJ_CACHE_INSTRUCTION,
+ * either a matching cache, or a unified cache is returned.
+ *
+ * If \p cachetype is \c -1, it is ignored and multiple levels may
+ * match. The function returns either the depth of a uniquely matching
+ * level or ::HWLOC_TYPE_DEPTH_MULTIPLE.
+ */
+static __hwloc_inline int
+hwloc_get_cache_type_depth (hwloc_topology_t topology,
+                            unsigned cachelevel, hwloc_obj_cache_type_t cachetype)
+{
+  int depth;
+  int found = HWLOC_TYPE_DEPTH_UNKNOWN;
+  for (depth=0; ; depth++) {
+    hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, 0);
+    if (!obj)
+      break;
+    if (!hwloc_obj_type_is_dcache(obj->type) || obj->attr->cache.depth != cachelevel)
+      /* doesn't match, try next depth */
+      continue;
+    if (cachetype == (hwloc_obj_cache_type_t) -1) {
+      if (found != HWLOC_TYPE_DEPTH_UNKNOWN) {
+        /* second match, return MULTIPLE */
+        return HWLOC_TYPE_DEPTH_MULTIPLE;
+      }
+      /* first match, mark it as found */
+      found = depth;
+      continue;
+    }
+    if (obj->attr->cache.type == cachetype || obj->attr->cache.type == HWLOC_OBJ_CACHE_UNIFIED)
+      /* exact match (either unified is alone, or we match instruction or data), return immediately */
+      return depth;
+  }
+  /* went to the bottom, return what we found */
+  return found;
+}
+
+/** \brief Get the first data (or unified) cache covering a cpuset \p set
+ *
+ * \return \c NULL if no cache matches.
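+ *
+ * For instance (a hedged sketch; hwloc_get_cpubind() belongs to the core
+ * binding API, and error handling is omitted), the first data cache
+ * covering the current thread binding:
+ * \code
+ * hwloc_bitmap_t bound = hwloc_bitmap_alloc();
+ * hwloc_get_cpubind(topology, bound, HWLOC_CPUBIND_THREAD);
+ * hwloc_obj_t cache = hwloc_get_cache_covering_cpuset(topology, bound);
+ * hwloc_bitmap_free(bound);
+ * \endcode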
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_cache_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_cache_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set)
+{
+  hwloc_obj_t current = hwloc_get_obj_covering_cpuset(topology, set);
+  while (current) {
+    if (hwloc_obj_type_is_dcache(current->type))
+      return current;
+    current = current->parent;
+  }
+  return NULL;
+}
+
+/** \brief Get the first data (or unified) cache shared between an object and somebody else.
+ *
+ * \return \c NULL if no cache matches or if an invalid object is given.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_shared_cache_covering_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_shared_cache_covering_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj)
+{
+  hwloc_obj_t current = obj->parent;
+  if (!obj->cpuset)
+    return NULL;
+  while (current) {
+    if (!hwloc_bitmap_isequal(current->cpuset, obj->cpuset)
+        && hwloc_obj_type_is_dcache(current->type))
+      return current;
+    current = current->parent;
+  }
+  return NULL;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_find_misc Finding objects, miscellaneous helpers
+ * @{
+ *
+ * Be sure to see the figure in \ref termsanddefs that shows a
+ * complete topology tree, including depths, child/sibling/cousin
+ * relationships, and an example of an asymmetric topology where one
+ * package has fewer caches than its peers.
+ */
+
+/** \brief Remove simultaneous multithreading PUs from a CPU set.
+ *
+ * For each core in \p topology, if \p cpuset contains some PUs of that core,
+ * modify \p cpuset to only keep a single PU for that core.
+ *
+ * \p which specifies which PU will be kept.
+ * PUs are considered in physical index order.
+ * If 0, for each core, the function keeps the first PU that was originally set in \p cpuset.
+ *
+ * If \p which is larger than the number of PUs in a core that were originally set in \p cpuset,
+ * no PU is kept for that core.
+ *
+ * \note PUs that are not below a Core object are ignored
+ * (for instance if the topology does not contain any Core object).
+ * None of them is removed from \p cpuset.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_singlify_per_core(hwloc_topology_t topology, hwloc_bitmap_t cpuset, unsigned which);
+
+/** \brief Returns the object of type ::HWLOC_OBJ_PU with \p os_index.
+ *
+ * This function is useful for converting a CPU set into the PU
+ * objects it contains.
+ * When retrieving the current binding (e.g. with hwloc_get_cpubind()),
+ * one may iterate over the bits of the resulting CPU set with
+ * hwloc_bitmap_foreach_begin(), and find the corresponding PUs
+ * with this function.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pu_obj_by_os_index(hwloc_topology_t topology, unsigned os_index) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pu_obj_by_os_index(hwloc_topology_t topology, unsigned os_index)
+{
+  hwloc_obj_t obj = NULL;
+  while ((obj = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PU, obj)) != NULL)
+    if (obj->os_index == os_index)
+      return obj;
+  return NULL;
+}
+
+/** \brief Returns the object of type ::HWLOC_OBJ_NUMANODE with \p os_index.
+ *
+ * This function is useful for converting a nodeset into the NUMA node
+ * objects it contains.
+ * When retrieving the current binding
with hwloc_get_membind() with HWLOC_MEMBIND_BYNODESET), + * one may iterate over the bits of the resulting nodeset with + * hwloc_bitmap_foreach_begin(), and find the corresponding NUMA nodes + * with this function. + */ +static __hwloc_inline hwloc_obj_t +hwloc_get_numanode_obj_by_os_index(hwloc_topology_t topology, unsigned os_index) __hwloc_attribute_pure; +static __hwloc_inline hwloc_obj_t +hwloc_get_numanode_obj_by_os_index(hwloc_topology_t topology, unsigned os_index) +{ + hwloc_obj_t obj = NULL; + while ((obj = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, obj)) != NULL) + if (obj->os_index == os_index) + return obj; + return NULL; +} + +/** \brief Do a depth-first traversal of the topology to find and sort + * all objects that are at the same depth as \p src. + * + * Report in \p objs up to \p max physically closest ones to \p src. + * + * \return the number of objects returned in \p objs. + * + * \return 0 if \p src is an I/O object. + * + * \note This function requires the \p src object to have a CPU set. + */ +/* TODO: rather provide an iterator? Provide a way to know how much should be allocated? By returning the total number of objects instead? */ +HWLOC_DECLSPEC unsigned hwloc_get_closest_objs (hwloc_topology_t topology, hwloc_obj_t src, hwloc_obj_t * __hwloc_restrict objs, unsigned max); + +/** \brief Find an object below another object, both specified by types and indexes. + * + * Start from the top system object and find object of type \p type1 + * and logical index \p idx1. Then look below this object and find another + * object of type \p type2 and logical index \p idx2. Indexes are specified + * within the parent, not within the entire system. + * + * For instance, if type1 is PACKAGE, idx1 is 2, type2 is CORE and idx2 + * is 3, return the fourth core object below the third package. + * + * \note This function requires these objects to have a CPU set. + */ +static __hwloc_inline hwloc_obj_t +hwloc_get_obj_below_by_type (hwloc_topology_t topology, + hwloc_obj_type_t type1, unsigned idx1, + hwloc_obj_type_t type2, unsigned idx2) __hwloc_attribute_pure; +static __hwloc_inline hwloc_obj_t +hwloc_get_obj_below_by_type (hwloc_topology_t topology, + hwloc_obj_type_t type1, unsigned idx1, + hwloc_obj_type_t type2, unsigned idx2) +{ + hwloc_obj_t obj; + obj = hwloc_get_obj_by_type (topology, type1, idx1); + if (!obj) + return NULL; + return hwloc_get_obj_inside_cpuset_by_type(topology, obj->cpuset, type2, idx2); +} + +/** \brief Find an object below a chain of objects specified by types and indexes. + * + * This is a generalized version of hwloc_get_obj_below_by_type(). + * + * Arrays \p typev and \p idxv must contain \p nr types and indexes. + * + * Start from the top system object and walk the arrays \p typev and \p idxv. + * For each type and logical index couple in the arrays, look under the previously found + * object to find the index-th object of the given type. + * Indexes are specified within the parent, not within the entire system. + * + * For instance, if nr is 3, typev contains NODE, PACKAGE and CORE, + * and idxv contains 0, 1 and 2, return the third core object below + * the second package below the first NUMA node. + * + * \note This function requires all these objects and the root object + * to have a CPU set.
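+ * + * A short sketch of both helpers (an editorial illustration, not upstream hwloc documentation), assuming \c topo is an already-loaded topology: + * \code + * // the fourth Core below the third Package, as in the example above + * hwloc_obj_t core = hwloc_get_obj_below_by_type(topo, HWLOC_OBJ_PACKAGE, 2, + *                                                HWLOC_OBJ_CORE, 3); + * // the third Core below the second Package below the first NUMA node + * hwloc_obj_type_t typev[3] = { HWLOC_OBJ_NUMANODE, HWLOC_OBJ_PACKAGE, HWLOC_OBJ_CORE }; + * unsigned idxv[3] = { 0, 1, 2 }; + * hwloc_obj_t core2 = hwloc_get_obj_below_array_by_type(topo, 3, typev, idxv); + * \endcode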
+ */ +static __hwloc_inline hwloc_obj_t +hwloc_get_obj_below_array_by_type (hwloc_topology_t topology, int nr, hwloc_obj_type_t *typev, unsigned *idxv) __hwloc_attribute_pure; +static __hwloc_inline hwloc_obj_t +hwloc_get_obj_below_array_by_type (hwloc_topology_t topology, int nr, hwloc_obj_type_t *typev, unsigned *idxv) +{ + hwloc_obj_t obj = hwloc_get_root_obj(topology); + int i; + for(i=0; i<nr; i++) { + if (!obj) + return NULL; + obj = hwloc_get_obj_inside_cpuset_by_type(topology, obj->cpuset, typev[i], idxv[i]); + } + return obj; +} + +/** \brief Return an object of a different type with same locality. + * + * If the source object \p src is a normal or memory type, + * this function returns an object of type \p type with same + * CPU and node sets, either below or above in the hierarchy. + * + * If the source object \p src is a PCI or an OS device within a PCI + * device, the function may either return that PCI device, or another + * OS device in the same PCI parent. + * This may for instance be useful for converting OS devices + * such as "nvml0" or "rsmi1" used in distance structures into + * the PCI device, or the CUDA or OpenCL OS device that corresponds + * to the same physical card. + * + * If not \c NULL, parameter \p subtype only selects objects whose + * subtype attribute exists and is \p subtype (case-insensitively), + * for instance "OpenCL" or "CUDA". + * + * If not \c NULL, parameter \p nameprefix only selects objects whose + * name attribute exists and starts with \p nameprefix (case-insensitively), + * for instance "rsmi" for matching "rsmi0". + * + * If multiple objects match, the first one is returned. + * + * This function will not walk the hierarchy across bridges since + * the PCI locality may become different. + * This function also cannot convert between normal/memory objects + * and I/O or Misc objects. + * + * \p flags must be \c 0 for now. + * + * \return An object with identical locality, + * matching \p subtype and \p nameprefix if any. + * + * \return \c NULL if no matching object could be found, + * or if the source object and target type are incompatible, + * for instance if converting between CPU and I/O objects. + */ +HWLOC_DECLSPEC hwloc_obj_t +hwloc_get_obj_with_same_locality(hwloc_topology_t topology, hwloc_obj_t src, + hwloc_obj_type_t type, const char *subtype, const char *nameprefix, + unsigned long flags); + +/** @} */ + + + +/** \defgroup hwlocality_helper_distribute Distributing items over a topology + * @{ + */ + +/** \brief Flags to be given to hwloc_distrib(). + */ +enum hwloc_distrib_flags_e { + /** \brief Distrib in reverse order, starting from the last objects. + * \hideinitializer + */ + HWLOC_DISTRIB_FLAG_REVERSE = (1UL<<0) +}; + +/** \brief Distribute \p n items over the topology under \p roots. + * + * Array \p set will be filled with \p n cpusets recursively distributed + * linearly over the topology under objects \p roots, down to depth \p until + * (which can be INT_MAX to distribute down to the finest level). + * + * \p n_roots is usually 1 and \p roots only contains the topology root object + * so as to distribute over the entire topology. + * + * This is typically useful when an application wants to distribute \p n + * threads over a machine, giving each of them as much private cache as + * possible and keeping them locally in number order. + * + * The caller may typically want to also call hwloc_bitmap_singlify() + * before binding a thread so that it does not move at all.
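+ * + * A minimal sketch (an editorial illustration, not upstream hwloc documentation): distribute 4 threads over the whole machine, assuming \c topo is an already-loaded topology and INT_MAX comes from <limits.h>: + * \code + * hwloc_obj_t root = hwloc_get_root_obj(topo); + * hwloc_cpuset_t sets[4]; + * if (!hwloc_distrib(topo, &root, 1, sets, 4, INT_MAX, 0)) { + *   unsigned i; + *   for (i = 0; i < 4; i++) { + *     hwloc_bitmap_singlify(sets[i]); // keep a single PU so the thread cannot move + *     // ... bind the i-th thread to sets[i] here ... + *     hwloc_bitmap_free(sets[i]); + *   } + * } + * \endcode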
+ * + * \p flags should be 0 or an OR'ed set of ::hwloc_distrib_flags_e. + * + * \note This function requires the \p roots objects to have a CPU set. + * + * \note This function replaces the now deprecated hwloc_distribute() + * and hwloc_distributev() functions. + */ +static __hwloc_inline int +hwloc_distrib(hwloc_topology_t topology, + hwloc_obj_t *roots, unsigned n_roots, + hwloc_cpuset_t *set, + unsigned n, + int until, unsigned long flags) +{ + unsigned i; + unsigned tot_weight; + unsigned given, givenweight; + hwloc_cpuset_t *cpusetp = set; + + if (flags & ~HWLOC_DISTRIB_FLAG_REVERSE) { + errno = EINVAL; + return -1; + } + + tot_weight = 0; + for (i = 0; i < n_roots; i++) + tot_weight += (unsigned) hwloc_bitmap_weight(roots[i]->cpuset); + + for (i = 0, given = 0, givenweight = 0; i < n_roots; i++) { + unsigned chunk, weight; + hwloc_obj_t root = roots[flags & HWLOC_DISTRIB_FLAG_REVERSE ? n_roots-1-i : i]; + hwloc_cpuset_t cpuset = root->cpuset; + while (!hwloc_obj_type_is_normal(root->type)) + /* If memory/io/misc, walk up to normal parent */ + root = root->parent; + weight = (unsigned) hwloc_bitmap_weight(cpuset); + if (!weight) + continue; + /* Give to root a chunk proportional to its weight. + * If previous chunks got rounded-up, we may get a bit less. */ + chunk = (( (givenweight+weight) * n + tot_weight-1) / tot_weight) + - (( givenweight * n + tot_weight-1) / tot_weight); + if (!root->arity || chunk <= 1 || root->depth >= until) { + /* We can't split any more, put everything there. */ + if (chunk) { + /* Fill cpusets with ours */ + unsigned j; + for (j=0; j < chunk; j++) + cpusetp[j] = hwloc_bitmap_dup(cpuset); + } else { + /* We got no chunk, just merge our cpuset to a previous one + * (the first chunk cannot be empty) + * so that this root doesn't get ignored. + */ + assert(given); + hwloc_bitmap_or(cpusetp[-1], cpusetp[-1], cpuset); + } + } else { + /* Still more to distribute, recurse into children */ + hwloc_distrib(topology, root->children, root->arity, cpusetp, chunk, until, flags); + } + cpusetp += chunk; + given += chunk; + givenweight += weight; + } + + return 0; +} + +/** @} */ + + + +/** \defgroup hwlocality_helper_topology_sets CPU and node sets of entire topologies + * @{ + */ + +/** \brief Get complete CPU set + * + * \return the complete CPU set of processors of the system. + * + * \note The returned cpuset is not newly allocated and should thus not be + * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy. + * + * \note This is equivalent to retrieving the root object complete CPU-set. + */ +HWLOC_DECLSPEC hwloc_const_cpuset_t +hwloc_topology_get_complete_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure; + +/** \brief Get topology CPU set + * + * \return the CPU set of processors of the system for which hwloc + * provides topology information. This is equivalent to the cpuset of the + * system object. + * + * \note The returned cpuset is not newly allocated and should thus not be + * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy. + * + * \note This is equivalent to retrieving the root object CPU-set. + */ +HWLOC_DECLSPEC hwloc_const_cpuset_t +hwloc_topology_get_topology_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure; + +/** \brief Get allowed CPU set + * + * \return the CPU set of allowed processors of the system.
+ * + * \note If the topology flag ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED was not set, + * this is identical to hwloc_topology_get_topology_cpuset(), which means + * all PUs are allowed. + * + * \note If ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED was set, applying + * hwloc_bitmap_intersects() on the result of this function and on an object + * cpuset checks whether there are allowed PUs inside that object. + * Applying hwloc_bitmap_and() returns the list of these allowed PUs. + * + * \note The returned cpuset is not newly allocated and should thus not be + * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy. + */ +HWLOC_DECLSPEC hwloc_const_cpuset_t +hwloc_topology_get_allowed_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure; + +/** \brief Get complete node set + * + * \return the complete node set of memory of the system. + * + * \note The returned nodeset is not newly allocated and should thus not be + * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy. + * + * \note This is equivalent to retrieving the root object complete nodeset. + */ +HWLOC_DECLSPEC hwloc_const_nodeset_t +hwloc_topology_get_complete_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure; + +/** \brief Get topology node set + * + * \return the node set of memory of the system for which hwloc + * provides topology information. This is equivalent to the nodeset of the + * system object. + * + * \note The returned nodeset is not newly allocated and should thus not be + * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy. + * + * \note This is equivalent to retrieving the root object nodeset. + */ +HWLOC_DECLSPEC hwloc_const_nodeset_t +hwloc_topology_get_topology_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure; + +/** \brief Get allowed node set + * + * \return the node set of allowed memory of the system. + * + * \note If the topology flag ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED was not set, + * this is identical to hwloc_topology_get_topology_nodeset(), which means + * all NUMA nodes are allowed. + * + * \note If ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED was set, applying + * hwloc_bitmap_intersects() on the result of this function and on an object + * nodeset checks whether there are allowed NUMA nodes inside that object. + * Applying hwloc_bitmap_and() returns the list of these allowed NUMA nodes. + * + * \note The returned nodeset is not newly allocated and should thus not be + * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy. + */ +HWLOC_DECLSPEC hwloc_const_nodeset_t +hwloc_topology_get_allowed_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure; + +/** @} */ + + + +/** \defgroup hwlocality_helper_nodeset_convert Converting between CPU sets and node sets + * + * @{ + */ + +/** \brief Convert a CPU set into a NUMA node set + * + * For each PU included in the input \p _cpuset, set the corresponding + * local NUMA node(s) in the output \p nodeset. + * + * If some NUMA nodes have no CPUs at all, this function never sets their + * indexes in the output node set, even if a full CPU set is given in input. + * + * Hence the entire topology CPU set is converted into the set of all nodes + * that have some local CPUs.
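+ * + * A minimal sketch (an editorial illustration, not upstream hwloc documentation), assuming \c topo is an already-loaded topology: + * \code + * hwloc_cpuset_t set = hwloc_bitmap_alloc(); + * hwloc_nodeset_t nodes = hwloc_bitmap_alloc(); + * if (!hwloc_get_cpubind(topo, set, HWLOC_CPUBIND_PROCESS)) + *   // NUMA nodes local to the current process binding + *   hwloc_cpuset_to_nodeset(topo, set, nodes); + * hwloc_bitmap_free(set); + * hwloc_bitmap_free(nodes); + * \endcode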
+ */ +static __hwloc_inline int +hwloc_cpuset_to_nodeset(hwloc_topology_t topology, hwloc_const_cpuset_t _cpuset, hwloc_nodeset_t nodeset) +{ + int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE); + hwloc_obj_t obj = NULL; + assert(depth != HWLOC_TYPE_DEPTH_UNKNOWN); + hwloc_bitmap_zero(nodeset); + while ((obj = hwloc_get_next_obj_covering_cpuset_by_depth(topology, _cpuset, depth, obj)) != NULL) + if (hwloc_bitmap_set(nodeset, obj->os_index) < 0) + return -1; + return 0; +} + +/** \brief Convert a NUMA node set into a CPU set + * + * For each NUMA node included in the input \p nodeset, set the corresponding + * local PUs in the output \p _cpuset. + * + * If some CPUs have no local NUMA nodes, this function never sets their + * indexes in the output CPU set, even if a full node set is given in input. + * + * Hence the entire topology node set is converted into the set of all CPUs + * that have some local NUMA nodes. + */ +static __hwloc_inline int +hwloc_cpuset_from_nodeset(hwloc_topology_t topology, hwloc_cpuset_t _cpuset, hwloc_const_nodeset_t nodeset) +{ + int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE); + hwloc_obj_t obj = NULL; + assert(depth != HWLOC_TYPE_DEPTH_UNKNOWN); + hwloc_bitmap_zero(_cpuset); + while ((obj = hwloc_get_next_obj_by_depth(topology, depth, obj)) != NULL) { + if (hwloc_bitmap_isset(nodeset, obj->os_index)) + /* no need to check obj->cpuset because objects in levels always have a cpuset */ + if (hwloc_bitmap_or(_cpuset, _cpuset, obj->cpuset) < 0) + return -1; + } + return 0; +} + +/** @} */ + + + +/** \defgroup hwlocality_advanced_io Finding I/O objects + * @{ + */ + +/** \brief Get the first non-I/O ancestor object. + * + * Given the I/O object \p ioobj, find the smallest non-I/O ancestor + * object. This object (normal or memory) may then be used for binding + * because it has non-NULL CPU and node sets + * and because its locality is the same as \p ioobj. + * + * \note The resulting object is usually a normal object but it could also + * be a memory object (e.g. NUMA node) in future platforms if I/O objects + * ever get attached to memory instead of CPUs. + */ +static __hwloc_inline hwloc_obj_t +hwloc_get_non_io_ancestor_obj(hwloc_topology_t topology __hwloc_attribute_unused, + hwloc_obj_t ioobj) +{ + hwloc_obj_t obj = ioobj; + while (obj && !obj->cpuset) { + obj = obj->parent; + } + return obj; +} + +/** \brief Get the next PCI device in the system. + * + * \return the first PCI device if \p prev is \c NULL. + */ +static __hwloc_inline hwloc_obj_t +hwloc_get_next_pcidev(hwloc_topology_t topology, hwloc_obj_t prev) +{ + return hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PCI_DEVICE, prev); +} + +/** \brief Find the PCI device object matching the PCI bus id + * given domain, bus, device and function PCI bus id. + */ +static __hwloc_inline hwloc_obj_t +hwloc_get_pcidev_by_busid(hwloc_topology_t topology, + unsigned domain, unsigned bus, unsigned dev, unsigned func) +{ + hwloc_obj_t obj = NULL; + while ((obj = hwloc_get_next_pcidev(topology, obj)) != NULL) { + if (obj->attr->pcidev.domain == domain + && obj->attr->pcidev.bus == bus + && obj->attr->pcidev.dev == dev + && obj->attr->pcidev.func == func) + return obj; + } + return NULL; +} + +/** \brief Find the PCI device object matching the PCI bus id + * given as a string xxxx:yy:zz.t or yy:zz.t.
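+ * + * A minimal sketch (an editorial illustration, not upstream hwloc documentation; the bus id below is a made-up placeholder), assuming \c topo is an already-loaded topology: + * \code + * hwloc_obj_t pcidev = hwloc_get_pcidev_by_busidstring(topo, "0000:03:00.0"); + * if (pcidev) + *   printf("vendor %04x device %04x\n", + *          pcidev->attr->pcidev.vendor_id, pcidev->attr->pcidev.device_id); + * \endcode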
+ */ +static __hwloc_inline hwloc_obj_t +hwloc_get_pcidev_by_busidstring(hwloc_topology_t topology, const char *busid) +{ + unsigned domain = 0; /* default */ + unsigned bus, dev, func; + + if (sscanf(busid, "%x:%x.%x", &bus, &dev, &func) != 3 + && sscanf(busid, "%x:%x:%x.%x", &domain, &bus, &dev, &func) != 4) { + errno = EINVAL; + return NULL; + } + + return hwloc_get_pcidev_by_busid(topology, domain, bus, dev, func); +} + +/** \brief Get the next OS device in the system. + * + * \return the first OS device if \p prev is \c NULL. + */ +static __hwloc_inline hwloc_obj_t +hwloc_get_next_osdev(hwloc_topology_t topology, hwloc_obj_t prev) +{ + return hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_OS_DEVICE, prev); +} + +/** \brief Get the next bridge in the system. + * + * \return the first bridge if \p prev is \c NULL. + */ +static __hwloc_inline hwloc_obj_t +hwloc_get_next_bridge(hwloc_topology_t topology, hwloc_obj_t prev) +{ + return hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_BRIDGE, prev); +} + +/** \brief Checks whether a given bridge covers a given PCI bus. + */ +static __hwloc_inline int +hwloc_bridge_covers_pcibus(hwloc_obj_t bridge, + unsigned domain, unsigned bus) +{ + return bridge->type == HWLOC_OBJ_BRIDGE + && bridge->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI + && bridge->attr->bridge.downstream.pci.domain == domain + && bridge->attr->bridge.downstream.pci.secondary_bus <= bus + && bridge->attr->bridge.downstream.pci.subordinate_bus >= bus; +} + +/** @} */ + + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + + +#endif /* HWLOC_HELPER_H */ diff --git a/deps/hwloc/include/hwloc/inlines.h b/deps/hwloc/include/hwloc/inlines.h new file mode 100644 index 000000000..494209ea6 --- /dev/null +++ b/deps/hwloc/include/hwloc/inlines.h @@ -0,0 +1,146 @@ +/* + * Copyright © 2009 CNRS + * Copyright © 2009-2018 Inria. All rights reserved. + * Copyright © 2009-2012 Université Bordeaux + * Copyright © 2009-2010 Cisco Systems, Inc. All rights reserved. + * See COPYING in top-level directory. + */ + +/** + * This file contains the inline code of functions declared in hwloc.h + */ + +#ifndef HWLOC_INLINES_H +#define HWLOC_INLINES_H + +#ifndef HWLOC_H +#error Please include the main hwloc.h instead +#endif + +#include <stdlib.h> +#include <errno.h> + + +#ifdef __cplusplus +extern "C" { +#endif + +static __hwloc_inline int +hwloc_get_type_or_below_depth (hwloc_topology_t topology, hwloc_obj_type_t type) +{ + int depth = hwloc_get_type_depth(topology, type); + + if (depth != HWLOC_TYPE_DEPTH_UNKNOWN) + return depth; + + /* find the highest existing level with type order >= */ + for(depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU); ; depth--) + if (hwloc_compare_types(hwloc_get_depth_type(topology, depth), type) < 0) + return depth+1; + + /* Shouldn't ever happen, as there is always a Machine level with lower order and known depth. */ + /* abort(); */ +} + +static __hwloc_inline int +hwloc_get_type_or_above_depth (hwloc_topology_t topology, hwloc_obj_type_t type) +{ + int depth = hwloc_get_type_depth(topology, type); + + if (depth != HWLOC_TYPE_DEPTH_UNKNOWN) + return depth; + + /* find the lowest existing level with type order <= */ + for(depth = 0; ; depth++) + if (hwloc_compare_types(hwloc_get_depth_type(topology, depth), type) > 0) + return depth-1; + + /* Shouldn't ever happen, as there is always a PU level with higher order and known depth.
*/ + /* abort(); */ +} + +static __hwloc_inline int +hwloc_get_nbobjs_by_type (hwloc_topology_t topology, hwloc_obj_type_t type) +{ + int depth = hwloc_get_type_depth(topology, type); + if (depth == HWLOC_TYPE_DEPTH_UNKNOWN) + return 0; + if (depth == HWLOC_TYPE_DEPTH_MULTIPLE) + return -1; /* FIXME: aggregate nbobjs from different levels? */ + return (int) hwloc_get_nbobjs_by_depth(topology, depth); +} + +static __hwloc_inline hwloc_obj_t +hwloc_get_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type, unsigned idx) +{ + int depth = hwloc_get_type_depth(topology, type); + if (depth == HWLOC_TYPE_DEPTH_UNKNOWN) + return NULL; + if (depth == HWLOC_TYPE_DEPTH_MULTIPLE) + return NULL; + return hwloc_get_obj_by_depth(topology, depth, idx); +} + +static __hwloc_inline hwloc_obj_t +hwloc_get_next_obj_by_depth (hwloc_topology_t topology, int depth, hwloc_obj_t prev) +{ + if (!prev) + return hwloc_get_obj_by_depth (topology, depth, 0); + if (prev->depth != depth) + return NULL; + return prev->next_cousin; +} + +static __hwloc_inline hwloc_obj_t +hwloc_get_next_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type, + hwloc_obj_t prev) +{ + int depth = hwloc_get_type_depth(topology, type); + if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE) + return NULL; + return hwloc_get_next_obj_by_depth (topology, depth, prev); +} + +static __hwloc_inline hwloc_obj_t +hwloc_get_root_obj (hwloc_topology_t topology) +{ + return hwloc_get_obj_by_depth (topology, 0, 0); +} + +static __hwloc_inline const char * +hwloc_obj_get_info_by_name(hwloc_obj_t obj, const char *name) +{ + unsigned i; + for(i=0; i<obj->infos_count; i++) { + struct hwloc_info_s *info = &obj->infos[i]; + if (!strcmp(info->name, name)) + return info->value; + } + return NULL; +} + +static __hwloc_inline void * +hwloc_alloc_membind_policy(hwloc_topology_t topology, size_t len, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags) +{ + void *p = hwloc_alloc_membind(topology, len, set, policy, flags); + if (p) + return p; + + if (hwloc_set_membind(topology, set, policy, flags) < 0) + /* hwloc_set_membind() takes care of ignoring errors if non-STRICT */ + return NULL; + + p = hwloc_alloc(topology, len); + if (p && policy != HWLOC_MEMBIND_FIRSTTOUCH) + /* Enforce the binding by touching the data */ + memset(p, 0, len); + return p; +} + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + + +#endif /* HWLOC_INLINES_H */ diff --git a/deps/hwloc/include/hwloc/memattrs.h b/deps/hwloc/include/hwloc/memattrs.h new file mode 100644 index 000000000..2494abb08 --- /dev/null +++ b/deps/hwloc/include/hwloc/memattrs.h @@ -0,0 +1,455 @@ +/* + * Copyright © 2019-2020 Inria. All rights reserved. + * See COPYING in top-level directory. + */ + +/** \file + * \brief Memory node attributes. + */ + +#ifndef HWLOC_MEMATTR_H +#define HWLOC_MEMATTR_H + +#include "hwloc.h" + +#ifdef __cplusplus +extern "C" { +#elif 0 +} +#endif + +/** \defgroup hwlocality_memattrs Comparing memory node attributes for finding where to allocate on + * + * Platforms with heterogeneous memory require ways to decide whether + * a buffer should be allocated on "fast" memory (such as HBM), + * "normal" memory (DDR) or even "slow" but large-capacity memory + * (non-volatile memory). + * These memory nodes are called "Targets" while the CPU accessing them + * is called the "Initiator". Access performance depends on their + * locality (NUMA platforms) as well as the intrinsic performance + * of the targets (heterogeneous platforms).
+ * + * The following attributes describe the performance of memory accesses + * from an Initiator to a memory Target, for instance their latency + * or bandwidth. + * Initiators performing these memory accesses are usually some PUs or Cores + * (described as a CPU set). + * Hence a Core may choose where to allocate a memory buffer by comparing + * the attributes of different target memory nodes nearby. + * + * There are also some attributes that are system-wide. + * Their value does not depend on a specific initiator performing + * an access. + * The memory node Capacity is an example of such attribute without + * initiator. + * + * One way to use this API is to start with a cpuset describing the Cores where + * a program is bound. The best target NUMA node for allocating memory in this + * program on these Cores may be obtained by passing this cpuset as an initiator + * to hwloc_memattr_get_best_target() with the relevant memory attribute. + * For instance, if the code is latency limited, use the Latency attribute. + * + * A more flexible approach consists in getting the list of local NUMA nodes + * by passing this cpuset to hwloc_get_local_numanode_objs(). + * Attribute values for these nodes, if any, may then be obtained with + * hwloc_memattr_get_value() and manually compared with the desired criteria. + * + * \note The API also supports specific objects as initiator, + * but it is currently not used internally by hwloc. + * Users may for instance use it to provide custom performance + * values for host memory accesses performed by GPUs. + * + * \note The interface actually also accepts targets that are not NUMA nodes. + * @{ + */ + +/** \brief Memory node attributes. */ +enum hwloc_memattr_id_e { + /** \brief "Capacity". + * The capacity is returned in bytes + * (local_memory attribute in objects). + * + * Best capacity nodes are nodes with <b>higher capacity</b>. + * + * No initiator is involved when looking at this attribute. + * The corresponding attribute flags are ::HWLOC_MEMATTR_FLAG_HIGHER_FIRST. + */ + HWLOC_MEMATTR_ID_CAPACITY = 0, + + /** \brief "Locality". + * The locality is returned as the number of PUs in that locality + * (e.g. the weight of its cpuset). + * + * Best locality nodes are nodes with <b>smaller locality</b> + * (nodes that are local to very few PUs). + * Poor locality nodes are nodes with larger locality + * (nodes that are local to the entire machine). + * + * No initiator is involved when looking at this attribute. + * The corresponding attribute flags are ::HWLOC_MEMATTR_FLAG_LOWER_FIRST. + */ + HWLOC_MEMATTR_ID_LOCALITY = 1, + + /** \brief "Bandwidth". + * The bandwidth is returned in MiB/s, as seen from the given initiator location. + * Best bandwidth nodes are nodes with <b>higher bandwidth</b>. + * The corresponding attribute flags are ::HWLOC_MEMATTR_FLAG_HIGHER_FIRST + * and ::HWLOC_MEMATTR_FLAG_NEED_INITIATOR. + */ + HWLOC_MEMATTR_ID_BANDWIDTH = 2, + + /** \brief "Latency". + * The latency is returned in nanoseconds, as seen from the given initiator location. + * Best latency nodes are nodes with <b>smaller latency</b>. + * The corresponding attribute flags are ::HWLOC_MEMATTR_FLAG_LOWER_FIRST + * and ::HWLOC_MEMATTR_FLAG_NEED_INITIATOR. + */ + HWLOC_MEMATTR_ID_LATENCY = 3 + + /* TODO read vs write, persistence? */ +}; + +/** \brief A memory attribute identifier. + * May be either one of ::hwloc_memattr_id_e or a new id returned by hwloc_memattr_register().
+ */ +typedef unsigned hwloc_memattr_id_t; + +/** \brief Return the identifier of the memory attribute with the given name. + */ +HWLOC_DECLSPEC int +hwloc_memattr_get_by_name(hwloc_topology_t topology, + const char *name, + hwloc_memattr_id_t *id); + + +/** \brief Type of location. */ +enum hwloc_location_type_e { + /** \brief Location is given as a cpuset, in the location cpuset union field. \hideinitializer */ + HWLOC_LOCATION_TYPE_CPUSET = 1, + /** \brief Location is given as an object, in the location object union field. \hideinitializer */ + HWLOC_LOCATION_TYPE_OBJECT = 0 +}; + +/** \brief Where to measure attributes from. */ +struct hwloc_location { + /** \brief Type of location. */ + enum hwloc_location_type_e type; + /** \brief Actual location. */ + union hwloc_location_u { + /** \brief Location as a cpuset, when the location type is ::HWLOC_LOCATION_TYPE_CPUSET. */ + hwloc_cpuset_t cpuset; + /** \brief Location as an object, when the location type is ::HWLOC_LOCATION_TYPE_OBJECT. */ + hwloc_obj_t object; + } location; +}; + + +/** \brief Flags for selecting target NUMA nodes. */ +enum hwloc_local_numanode_flag_e { + /** \brief Select NUMA nodes whose locality is larger than the given cpuset. + * For instance, if a single PU (or its cpuset) is given in \p location, + * select all nodes close to the package that contains this PU. + * \hideinitializer + */ + HWLOC_LOCAL_NUMANODE_FLAG_LARGER_LOCALITY = (1UL<<0), + + /** \brief Select NUMA nodes whose locality is smaller than the given cpuset. + * For instance, if a package (or its cpuset) is given in \p location, + * also select nodes that are attached to only a half of that package. + * \hideinitializer + */ + HWLOC_LOCAL_NUMANODE_FLAG_SMALLER_LOCALITY = (1UL<<1), + + /** \brief Select all NUMA nodes in the topology. + * The given \p location is ignored. + * \hideinitializer + */ + HWLOC_LOCAL_NUMANODE_FLAG_ALL = (1UL<<2) +}; + +/** \brief Return an array of local NUMA nodes. + * + * By default only select the NUMA nodes whose locality is exactly + * the given \p location. More nodes may be selected if additional flags + * are given as an OR'ed set of ::hwloc_local_numanode_flag_e. + * + * If \p location is given as an explicit object, its CPU set is used + * to find NUMA nodes with the corresponding locality. + * If the object does not have a CPU set (e.g. I/O object), the CPU + * parent (where the I/O object is attached) is used. + * + * On input, \p nr points to the number of nodes that may be stored + * in the \p nodes array. + * On output, \p nr will be changed to the number of stored nodes, + * or the number of nodes that would have been stored if there were + * enough room. + * + * \note Some of these NUMA nodes may not have any memory attribute + * values and hence not be reported as actual targets in other functions. + * + * \note The number of NUMA nodes in the topology (obtained by + * hwloc_bitmap_weight() on the root object nodeset) may be used + * to allocate the \p nodes array. + * + * \note When an object CPU set is given as locality, for instance a Package, + * and when flags contain both ::HWLOC_LOCAL_NUMANODE_FLAG_LARGER_LOCALITY + * and ::HWLOC_LOCAL_NUMANODE_FLAG_SMALLER_LOCALITY, + * the returned array corresponds to the nodeset of that object. + */ +HWLOC_DECLSPEC int +hwloc_get_local_numanode_objs(hwloc_topology_t topology, + struct hwloc_location *location, + unsigned *nr, + hwloc_obj_t *nodes, + unsigned long flags); + + + +/** \brief Return an attribute value for a specific target NUMA node.
+ * + * If the attribute does not relate to a specific initiator + * (it does not have the flag ::HWLOC_MEMATTR_FLAG_NEED_INITIATOR), + * location \p initiator is ignored and may be \c NULL. + * + * \p flags must be \c 0 for now. + * + * \note The initiator \p initiator should be of type ::HWLOC_LOCATION_TYPE_CPUSET + * when referring to accesses performed by CPU cores. + * ::HWLOC_LOCATION_TYPE_OBJECT is currently unused internally by hwloc, + * but users may for instance use it to provide custom information about + * host memory accesses performed by GPUs. + */ +HWLOC_DECLSPEC int +hwloc_memattr_get_value(hwloc_topology_t topology, + hwloc_memattr_id_t attribute, + hwloc_obj_t target_node, + struct hwloc_location *initiator, + unsigned long flags, + hwloc_uint64_t *value); + +/** \brief Return the best target NUMA node for the given attribute and initiator. + * + * If the attribute does not relate to a specific initiator + * (it does not have the flag ::HWLOC_MEMATTR_FLAG_NEED_INITIATOR), + * location \p initiator is ignored and may be \c NULL. + * + * If \p value is non \c NULL, the corresponding value is returned there. + * + * If multiple targets have the same attribute values, only one is + * returned (and there is no way to clarify how that one is chosen). + * Applications that want to detect targets with identical/similar + * values, or that want to look at values for multiple attributes, + * should rather get all values using hwloc_memattr_get_value() + * and manually select the target they consider the best. + * + * \p flags must be \c 0 for now. + * + * If there are no matching targets, \c -1 is returned with \p errno set to \c ENOENT. + * + * \note The initiator \p initiator should be of type ::HWLOC_LOCATION_TYPE_CPUSET + * when referring to accesses performed by CPU cores. + * ::HWLOC_LOCATION_TYPE_OBJECT is currently unused internally by hwloc, + * but users may for instance use it to provide custom information about + * host memory accesses performed by GPUs. + */ +HWLOC_DECLSPEC int +hwloc_memattr_get_best_target(hwloc_topology_t topology, + hwloc_memattr_id_t attribute, + struct hwloc_location *initiator, + unsigned long flags, + hwloc_obj_t *best_target, hwloc_uint64_t *value); + +/** \brief Return the best initiator for the given attribute and target NUMA node. + * + * If the attribute does not relate to a specific initiator + * (it does not have the flag ::HWLOC_MEMATTR_FLAG_NEED_INITIATOR), + * \c -1 is returned and \p errno is set to \c EINVAL. + * + * If \p value is non \c NULL, the corresponding value is returned there. + * + * If multiple initiators have the same attribute values, only one is + * returned (and there is no way to clarify how that one is chosen). + * Applications that want to detect initiators with identical/similar + * values, or that want to look at values for multiple attributes, + * should rather get all values using hwloc_memattr_get_value() + * and manually select the initiator they consider the best. + * + * The returned initiator should not be modified or freed, + * it belongs to the topology. + * + * \p flags must be \c 0 for now.
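+ * + * A minimal sketch of the symmetric hwloc_memattr_get_best_target() query (an editorial illustration, not upstream hwloc documentation), assuming \c topo is an already-loaded topology and \c set is a cpuset describing where the program runs: + * \code + * struct hwloc_location initiator; + * hwloc_obj_t best; + * initiator.type = HWLOC_LOCATION_TYPE_CPUSET; + * initiator.location.cpuset = set; + * if (!hwloc_memattr_get_best_target(topo, HWLOC_MEMATTR_ID_BANDWIDTH, + *                                    &initiator, 0, &best, NULL)) + *   printf("allocate on NUMA node L#%u\n", best->logical_index); + * \endcode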
+ * + * If there are no matching initiators, \c -1 is returned with \p errno set to \c ENOENT. + */ +HWLOC_DECLSPEC int +hwloc_memattr_get_best_initiator(hwloc_topology_t topology, + hwloc_memattr_id_t attribute, + hwloc_obj_t target, + unsigned long flags, + struct hwloc_location *best_initiator, hwloc_uint64_t *value); + +/** @} */ + + +/** \defgroup hwlocality_memattrs_manage Managing memory attributes + * @{ + */ + +/** \brief Return the name of a memory attribute. + */ +HWLOC_DECLSPEC int +hwloc_memattr_get_name(hwloc_topology_t topology, + hwloc_memattr_id_t attribute, + const char **name); + +/** \brief Return the flags of the given attribute. + * + * Flags are an OR'ed set of ::hwloc_memattr_flag_e. + */ +HWLOC_DECLSPEC int +hwloc_memattr_get_flags(hwloc_topology_t topology, + hwloc_memattr_id_t attribute, + unsigned long *flags); + +/** \brief Memory attribute flags. + * Given to hwloc_memattr_register() and returned by hwloc_memattr_get_flags(). + */ +enum hwloc_memattr_flag_e { + /** \brief The best nodes for this memory attribute are those with the higher values. + * For instance Bandwidth. + */ + HWLOC_MEMATTR_FLAG_HIGHER_FIRST = (1UL<<0), + /** \brief The best nodes for this memory attribute are those with the lower values. + * For instance Latency. + */ + HWLOC_MEMATTR_FLAG_LOWER_FIRST = (1UL<<1), + /** \brief The value returned for this memory attribute depends on the given initiator. + * For instance Bandwidth and Latency, but not Capacity. + */ + HWLOC_MEMATTR_FLAG_NEED_INITIATOR = (1UL<<2) +}; + +/** \brief Register a new memory attribute. + * + * Add a specific memory attribute that is not defined in ::hwloc_memattr_id_e. + * Flags are an OR'ed set of ::hwloc_memattr_flag_e. It must contain at least + * one of ::HWLOC_MEMATTR_FLAG_HIGHER_FIRST or ::HWLOC_MEMATTR_FLAG_LOWER_FIRST. + */ +HWLOC_DECLSPEC int +hwloc_memattr_register(hwloc_topology_t topology, + const char *name, + unsigned long flags, + hwloc_memattr_id_t *id); + +/** \brief Set an attribute value for a specific target NUMA node. + * + * If the attribute does not relate to a specific initiator + * (it does not have the flag ::HWLOC_MEMATTR_FLAG_NEED_INITIATOR), + * location \p initiator is ignored and may be \c NULL. + * + * The initiator will be copied into the topology, + * the caller should free anything allocated to store the initiator, + * for instance the cpuset. + * + * \p flags must be \c 0 for now. + * + * \note The initiator \p initiator should be of type ::HWLOC_LOCATION_TYPE_CPUSET + * when referring to accesses performed by CPU cores. + * ::HWLOC_LOCATION_TYPE_OBJECT is currently unused internally by hwloc, + * but users may for instance use it to provide custom information about + * host memory accesses performed by GPUs. + */ +HWLOC_DECLSPEC int +hwloc_memattr_set_value(hwloc_topology_t topology, + hwloc_memattr_id_t attribute, + hwloc_obj_t target_node, + struct hwloc_location *initiator, + unsigned long flags, + hwloc_uint64_t value); + +/** \brief Return the target NUMA nodes that have some values for a given attribute. + * + * Return targets for the given attribute in the \p targets array + * (for the given initiator if any). + * If \p values is not \c NULL, the corresponding attribute values + * are stored in the array it points to. + * + * On input, \p nr points to the number of targets that may be stored + * in the array \p targets (and \p values).
+ * On output, \p nr points to the number of targets (and values) that + * were actually found, even if some of them couldn't be stored in the array. + * Targets that couldn't be stored are ignored, but the function still + * returns success (\c 0). The caller may find out by comparing the value pointed + * to by \p nr before and after the function call. + * + * The returned targets should not be modified or freed, + * they belong to the topology. + * + * Argument \p initiator is ignored if the attribute does not relate to a specific + * initiator (it does not have the flag ::HWLOC_MEMATTR_FLAG_NEED_INITIATOR). + * Otherwise \p initiator may be non \c NULL to report only targets + * that have a value for that initiator. + * + * \p flags must be \c 0 for now. + * + * \note This function is meant for tools and debugging (listing internal information) + * rather than for application queries. Applications should rather select useful + * NUMA nodes with hwloc_get_local_numanode_objs() and then look at their attribute + * values. + * + * \note The initiator \p initiator should be of type ::HWLOC_LOCATION_TYPE_CPUSET + * when referring to accesses performed by CPU cores. + * ::HWLOC_LOCATION_TYPE_OBJECT is currently unused internally by hwloc, + * but users may for instance use it to provide custom information about + * host memory accesses performed by GPUs. + */ +HWLOC_DECLSPEC int +hwloc_memattr_get_targets(hwloc_topology_t topology, + hwloc_memattr_id_t attribute, + struct hwloc_location *initiator, + unsigned long flags, + unsigned *nr, hwloc_obj_t *targets, hwloc_uint64_t *values); + +/** \brief Return the initiators that have values for a given attribute for a specific target NUMA node. + * + * Return initiators for the given attribute and target node in the + * \p initiators array. + * If \p values is not \c NULL, the corresponding attribute values + * are stored in the array it points to. + * + * On input, \p nr points to the number of initiators that may be stored + * in the array \p initiators (and \p values). + * On output, \p nr points to the number of initiators (and values) that + * were actually found, even if some of them couldn't be stored in the array. + * Initiators that couldn't be stored are ignored, but the function still + * returns success (\c 0). The caller may find out by comparing the value pointed + * to by \p nr before and after the function call. + * + * The returned initiators should not be modified or freed, + * they belong to the topology. + * + * \p flags must be \c 0 for now. + * + * If the attribute does not relate to a specific initiator + * (it does not have the flag ::HWLOC_MEMATTR_FLAG_NEED_INITIATOR), + * no initiator is returned. + * + * \note This function is meant for tools and debugging (listing internal information) + * rather than for application queries. Applications should rather select useful + * NUMA nodes with hwloc_get_local_numanode_objs() and then look at their attribute + * values for some relevant initiators.
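+ * + * A minimal sketch of the in/out \p nr convention shared by hwloc_memattr_get_targets() and this function (an editorial illustration, not upstream hwloc documentation), assuming \c topo is an already-loaded topology: + * \code + * unsigned nr = 0; + * // first call with nr=0: nr is updated to the total number of targets + * hwloc_memattr_get_targets(topo, HWLOC_MEMATTR_ID_BANDWIDTH, NULL, 0, + *                           &nr, NULL, NULL); + * hwloc_obj_t *targets = malloc(nr * sizeof(*targets)); + * hwloc_uint64_t *values = malloc(nr * sizeof(*values)); + * // second call: up to nr targets and their values are stored + * hwloc_memattr_get_targets(topo, HWLOC_MEMATTR_ID_BANDWIDTH, NULL, 0, + *                           &nr, targets, values); + * \endcode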
+ */ +HWLOC_DECLSPEC int +hwloc_memattr_get_initiators(hwloc_topology_t topology, + hwloc_memattr_id_t attribute, + hwloc_obj_t target_node, + unsigned long flags, + unsigned *nr, struct hwloc_location *initiators, hwloc_uint64_t *values); +/** @} */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif + + +#endif /* HWLOC_MEMATTR_H */ diff --git a/deps/hwloc/include/hwloc/rename.h b/deps/hwloc/include/hwloc/rename.h new file mode 100644 index 000000000..9f3d5f60c --- /dev/null +++ b/deps/hwloc/include/hwloc/rename.h @@ -0,0 +1,896 @@ +/* + * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved. + * Copyright © 2010-2021 Inria. All rights reserved. + * See COPYING in top-level directory. + */ + +#ifndef HWLOC_RENAME_H +#define HWLOC_RENAME_H + +#include "hwloc/autogen/config.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Only enact these defines if we're actually renaming the symbols + (i.e., avoid trying to have no-op defines if we're *not* + renaming). */ + +#if HWLOC_SYM_TRANSFORM + +/* Use a preprocessor two-step in order to get the prefixing right. + Make 2 macros: HWLOC_NAME and HWLOC_NAME_CAPS for renaming + things. */ + +#define HWLOC_MUNGE_NAME(a, b) HWLOC_MUNGE_NAME2(a, b) +#define HWLOC_MUNGE_NAME2(a, b) a ## b +#define HWLOC_NAME(name) HWLOC_MUNGE_NAME(HWLOC_SYM_PREFIX, hwloc_ ## name) +/* FIXME: should be "HWLOC_ ## name" below, unchanged because it doesn't matter much and could break some embedders hacks */ +#define HWLOC_NAME_CAPS(name) HWLOC_MUNGE_NAME(HWLOC_SYM_PREFIX_CAPS, hwloc_ ## name) + +/* Now define all the "real" names to be the prefixed names. This + allows us to use the real names throughout the code base (i.e., + "hwloc_<foo>"); the preprocessor will adjust to have the prefixed + name under the covers. */ + +/* Names from hwloc.h */ + +#define hwloc_get_api_version HWLOC_NAME(get_api_version) + +#define hwloc_topology HWLOC_NAME(topology) +#define hwloc_topology_t HWLOC_NAME(topology_t) + +#define hwloc_cpuset_t HWLOC_NAME(cpuset_t) +#define hwloc_const_cpuset_t HWLOC_NAME(const_cpuset_t) +#define hwloc_nodeset_t HWLOC_NAME(nodeset_t) +#define hwloc_const_nodeset_t HWLOC_NAME(const_nodeset_t) + +#define HWLOC_OBJ_MACHINE HWLOC_NAME_CAPS(OBJ_MACHINE) +#define HWLOC_OBJ_NUMANODE HWLOC_NAME_CAPS(OBJ_NUMANODE) +#define HWLOC_OBJ_MEMCACHE HWLOC_NAME_CAPS(OBJ_MEMCACHE) +#define HWLOC_OBJ_PACKAGE HWLOC_NAME_CAPS(OBJ_PACKAGE) +#define HWLOC_OBJ_DIE HWLOC_NAME_CAPS(OBJ_DIE) +#define HWLOC_OBJ_CORE HWLOC_NAME_CAPS(OBJ_CORE) +#define HWLOC_OBJ_PU HWLOC_NAME_CAPS(OBJ_PU) +#define HWLOC_OBJ_L1CACHE HWLOC_NAME_CAPS(OBJ_L1CACHE) +#define HWLOC_OBJ_L2CACHE HWLOC_NAME_CAPS(OBJ_L2CACHE) +#define HWLOC_OBJ_L3CACHE HWLOC_NAME_CAPS(OBJ_L3CACHE) +#define HWLOC_OBJ_L4CACHE HWLOC_NAME_CAPS(OBJ_L4CACHE) +#define HWLOC_OBJ_L5CACHE HWLOC_NAME_CAPS(OBJ_L5CACHE) +#define HWLOC_OBJ_L1ICACHE HWLOC_NAME_CAPS(OBJ_L1ICACHE) +#define HWLOC_OBJ_L2ICACHE HWLOC_NAME_CAPS(OBJ_L2ICACHE) +#define HWLOC_OBJ_L3ICACHE HWLOC_NAME_CAPS(OBJ_L3ICACHE) +#define HWLOC_OBJ_MISC HWLOC_NAME_CAPS(OBJ_MISC) +#define HWLOC_OBJ_GROUP HWLOC_NAME_CAPS(OBJ_GROUP) +#define HWLOC_OBJ_BRIDGE HWLOC_NAME_CAPS(OBJ_BRIDGE) +#define HWLOC_OBJ_PCI_DEVICE HWLOC_NAME_CAPS(OBJ_PCI_DEVICE) +#define HWLOC_OBJ_OS_DEVICE HWLOC_NAME_CAPS(OBJ_OS_DEVICE) +#define HWLOC_OBJ_TYPE_MAX HWLOC_NAME_CAPS(OBJ_TYPE_MAX) +#define hwloc_obj_type_t HWLOC_NAME(obj_type_t) + +#define hwloc_obj_cache_type_e HWLOC_NAME(obj_cache_type_e) +#define hwloc_obj_cache_type_t HWLOC_NAME(obj_cache_type_t) +#define HWLOC_OBJ_CACHE_UNIFIED 
HWLOC_NAME_CAPS(OBJ_CACHE_UNIFIED) +#define HWLOC_OBJ_CACHE_DATA HWLOC_NAME_CAPS(OBJ_CACHE_DATA) +#define HWLOC_OBJ_CACHE_INSTRUCTION HWLOC_NAME_CAPS(OBJ_CACHE_INSTRUCTION) + +#define hwloc_obj_bridge_type_e HWLOC_NAME(obj_bridge_type_e) +#define hwloc_obj_bridge_type_t HWLOC_NAME(obj_bridge_type_t) +#define HWLOC_OBJ_BRIDGE_HOST HWLOC_NAME_CAPS(OBJ_BRIDGE_HOST) +#define HWLOC_OBJ_BRIDGE_PCI HWLOC_NAME_CAPS(OBJ_BRIDGE_PCI) + +#define hwloc_obj_osdev_type_e HWLOC_NAME(obj_osdev_type_e) +#define hwloc_obj_osdev_type_t HWLOC_NAME(obj_osdev_type_t) +#define HWLOC_OBJ_OSDEV_BLOCK HWLOC_NAME_CAPS(OBJ_OSDEV_BLOCK) +#define HWLOC_OBJ_OSDEV_GPU HWLOC_NAME_CAPS(OBJ_OSDEV_GPU) +#define HWLOC_OBJ_OSDEV_NETWORK HWLOC_NAME_CAPS(OBJ_OSDEV_NETWORK) +#define HWLOC_OBJ_OSDEV_OPENFABRICS HWLOC_NAME_CAPS(OBJ_OSDEV_OPENFABRICS) +#define HWLOC_OBJ_OSDEV_DMA HWLOC_NAME_CAPS(OBJ_OSDEV_DMA) +#define HWLOC_OBJ_OSDEV_COPROC HWLOC_NAME_CAPS(OBJ_OSDEV_COPROC) + +#define hwloc_compare_types HWLOC_NAME(compare_types) + +#define hwloc_obj HWLOC_NAME(obj) +#define hwloc_obj_t HWLOC_NAME(obj_t) + +#define hwloc_info_s HWLOC_NAME(info_s) + +#define hwloc_obj_attr_u HWLOC_NAME(obj_attr_u) +#define hwloc_numanode_attr_s HWLOC_NAME(numanode_attr_s) +#define hwloc_memory_page_type_s HWLOC_NAME(memory_page_type_s) +#define hwloc_cache_attr_s HWLOC_NAME(cache_attr_s) +#define hwloc_group_attr_s HWLOC_NAME(group_attr_s) +#define hwloc_pcidev_attr_s HWLOC_NAME(pcidev_attr_s) +#define hwloc_bridge_attr_s HWLOC_NAME(bridge_attr_s) +#define hwloc_osdev_attr_s HWLOC_NAME(osdev_attr_s) + +#define hwloc_topology_init HWLOC_NAME(topology_init) +#define hwloc_topology_load HWLOC_NAME(topology_load) +#define hwloc_topology_destroy HWLOC_NAME(topology_destroy) +#define hwloc_topology_dup HWLOC_NAME(topology_dup) +#define hwloc_topology_abi_check HWLOC_NAME(topology_abi_check) +#define hwloc_topology_check HWLOC_NAME(topology_check) + +#define hwloc_topology_flags_e HWLOC_NAME(topology_flags_e) + +#define HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED HWLOC_NAME_CAPS(TOPOLOGY_FLAG_WITH_DISALLOWED) +#define HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM HWLOC_NAME_CAPS(TOPOLOGY_FLAG_IS_THISSYSTEM) +#define HWLOC_TOPOLOGY_FLAG_THISSYSTEM_ALLOWED_RESOURCES HWLOC_NAME_CAPS(TOPOLOGY_FLAG_THISSYSTEM_ALLOWED_RESOURCES) +#define HWLOC_TOPOLOGY_FLAG_IMPORT_SUPPORT HWLOC_NAME_CAPS(TOPOLOGY_FLAG_IMPORT_SUPPORT) + +#define hwloc_topology_set_pid HWLOC_NAME(topology_set_pid) +#define hwloc_topology_set_synthetic HWLOC_NAME(topology_set_synthetic) +#define hwloc_topology_set_xml HWLOC_NAME(topology_set_xml) +#define hwloc_topology_set_xmlbuffer HWLOC_NAME(topology_set_xmlbuffer) +#define hwloc_topology_components_flag_e HWLOC_NAME(topology_components_flag_e) +#define HWLOC_TOPOLOGY_COMPONENTS_FLAG_BLACKLIST HWLOC_NAME_CAPS(TOPOLOGY_COMPONENTS_FLAG_BLACKLIST) +#define hwloc_topology_set_components HWLOC_NAME(topology_set_components) + +#define hwloc_topology_set_flags HWLOC_NAME(topology_set_flags) +#define hwloc_topology_is_thissystem HWLOC_NAME(topology_is_thissystem) +#define hwloc_topology_get_flags HWLOC_NAME(topology_get_flags) +#define hwloc_topology_discovery_support HWLOC_NAME(topology_discovery_support) +#define hwloc_topology_cpubind_support HWLOC_NAME(topology_cpubind_support) +#define hwloc_topology_membind_support HWLOC_NAME(topology_membind_support) +#define hwloc_topology_misc_support HWLOC_NAME(topology_misc_support) +#define hwloc_topology_support HWLOC_NAME(topology_support) +#define hwloc_topology_get_support HWLOC_NAME(topology_get_support) + +#define
hwloc_type_filter_e HWLOC_NAME(type_filter_e) +#define HWLOC_TYPE_FILTER_KEEP_ALL HWLOC_NAME_CAPS(TYPE_FILTER_KEEP_ALL) +#define HWLOC_TYPE_FILTER_KEEP_NONE HWLOC_NAME_CAPS(TYPE_FILTER_KEEP_NONE) +#define HWLOC_TYPE_FILTER_KEEP_STRUCTURE HWLOC_NAME_CAPS(TYPE_FILTER_KEEP_STRUCTURE) +#define HWLOC_TYPE_FILTER_KEEP_IMPORTANT HWLOC_NAME_CAPS(TYPE_FILTER_KEEP_IMPORTANT) +#define hwloc_topology_set_type_filter HWLOC_NAME(topology_set_type_filter) +#define hwloc_topology_get_type_filter HWLOC_NAME(topology_get_type_filter) +#define hwloc_topology_set_all_types_filter HWLOC_NAME(topology_set_all_types_filter) +#define hwloc_topology_set_cache_types_filter HWLOC_NAME(topology_set_cache_types_filter) +#define hwloc_topology_set_icache_types_filter HWLOC_NAME(topology_set_icache_types_filter) +#define hwloc_topology_set_io_types_filter HWLOC_NAME(topology_set_io_types_filter) + +#define hwloc_topology_set_userdata HWLOC_NAME(topology_set_userdata) +#define hwloc_topology_get_userdata HWLOC_NAME(topology_get_userdata) + +#define hwloc_restrict_flags_e HWLOC_NAME(restrict_flags_e) +#define HWLOC_RESTRICT_FLAG_REMOVE_CPULESS HWLOC_NAME_CAPS(RESTRICT_FLAG_REMOVE_CPULESS) +#define HWLOC_RESTRICT_FLAG_BYNODESET HWLOC_NAME_CAPS(RESTRICT_FLAG_BYNODESET) +#define HWLOC_RESTRICT_FLAG_REMOVE_MEMLESS HWLOC_NAME_CAPS(RESTRICT_FLAG_REMOVE_MEMLESS) +#define HWLOC_RESTRICT_FLAG_ADAPT_MISC HWLOC_NAME_CAPS(RESTRICT_FLAG_ADAPT_MISC) +#define HWLOC_RESTRICT_FLAG_ADAPT_IO HWLOC_NAME_CAPS(RESTRICT_FLAG_ADAPT_IO) +#define hwloc_topology_restrict HWLOC_NAME(topology_restrict) + +#define hwloc_allow_flags_e HWLOC_NAME(allow_flags_e) +#define HWLOC_ALLOW_FLAG_ALL HWLOC_NAME_CAPS(ALLOW_FLAG_ALL) +#define HWLOC_ALLOW_FLAG_LOCAL_RESTRICTIONS HWLOC_NAME_CAPS(ALLOW_FLAG_LOCAL_RESTRICTIONS) +#define HWLOC_ALLOW_FLAG_CUSTOM HWLOC_NAME_CAPS(ALLOW_FLAG_CUSTOM) +#define hwloc_topology_allow HWLOC_NAME(topology_allow) + +#define hwloc_topology_insert_misc_object HWLOC_NAME(topology_insert_misc_object) +#define hwloc_topology_alloc_group_object HWLOC_NAME(topology_alloc_group_object) +#define hwloc_topology_insert_group_object HWLOC_NAME(topology_insert_group_object) +#define hwloc_obj_add_other_obj_sets HWLOC_NAME(obj_add_other_obj_sets) +#define hwloc_topology_refresh HWLOC_NAME(topology_refresh) + +#define hwloc_topology_get_depth HWLOC_NAME(topology_get_depth) +#define hwloc_get_type_depth HWLOC_NAME(get_type_depth) +#define hwloc_get_memory_parents_depth HWLOC_NAME(get_memory_parents_depth) + +#define hwloc_get_type_depth_e HWLOC_NAME(get_type_depth_e) +#define HWLOC_TYPE_DEPTH_UNKNOWN HWLOC_NAME_CAPS(TYPE_DEPTH_UNKNOWN) +#define HWLOC_TYPE_DEPTH_MULTIPLE HWLOC_NAME_CAPS(TYPE_DEPTH_MULTIPLE) +#define HWLOC_TYPE_DEPTH_BRIDGE HWLOC_NAME_CAPS(TYPE_DEPTH_BRIDGE) +#define HWLOC_TYPE_DEPTH_PCI_DEVICE HWLOC_NAME_CAPS(TYPE_DEPTH_PCI_DEVICE) +#define HWLOC_TYPE_DEPTH_OS_DEVICE HWLOC_NAME_CAPS(TYPE_DEPTH_OS_DEVICE) +#define HWLOC_TYPE_DEPTH_MISC HWLOC_NAME_CAPS(TYPE_DEPTH_MISC) +#define HWLOC_TYPE_DEPTH_NUMANODE HWLOC_NAME_CAPS(TYPE_DEPTH_NUMANODE) +#define HWLOC_TYPE_DEPTH_MEMCACHE HWLOC_NAME_CAPS(TYPE_DEPTH_MEMCACHE) + +#define hwloc_get_depth_type HWLOC_NAME(get_depth_type) +#define hwloc_get_nbobjs_by_depth HWLOC_NAME(get_nbobjs_by_depth) +#define hwloc_get_nbobjs_by_type HWLOC_NAME(get_nbobjs_by_type) + +#define hwloc_get_obj_by_depth HWLOC_NAME(get_obj_by_depth ) +#define hwloc_get_obj_by_type HWLOC_NAME(get_obj_by_type ) + +#define hwloc_obj_type_string HWLOC_NAME(obj_type_string ) +#define hwloc_obj_type_snprintf 
HWLOC_NAME(obj_type_snprintf ) +#define hwloc_obj_attr_snprintf HWLOC_NAME(obj_attr_snprintf ) +#define hwloc_type_sscanf HWLOC_NAME(type_sscanf) +#define hwloc_type_sscanf_as_depth HWLOC_NAME(type_sscanf_as_depth) + +#define hwloc_obj_get_info_by_name HWLOC_NAME(obj_get_info_by_name) +#define hwloc_obj_add_info HWLOC_NAME(obj_add_info) + +#define HWLOC_CPUBIND_PROCESS HWLOC_NAME_CAPS(CPUBIND_PROCESS) +#define HWLOC_CPUBIND_THREAD HWLOC_NAME_CAPS(CPUBIND_THREAD) +#define HWLOC_CPUBIND_STRICT HWLOC_NAME_CAPS(CPUBIND_STRICT) +#define HWLOC_CPUBIND_NOMEMBIND HWLOC_NAME_CAPS(CPUBIND_NOMEMBIND) + +#define hwloc_cpubind_flags_t HWLOC_NAME(cpubind_flags_t) + +#define hwloc_set_cpubind HWLOC_NAME(set_cpubind) +#define hwloc_get_cpubind HWLOC_NAME(get_cpubind) +#define hwloc_set_proc_cpubind HWLOC_NAME(set_proc_cpubind) +#define hwloc_get_proc_cpubind HWLOC_NAME(get_proc_cpubind) +#define hwloc_set_thread_cpubind HWLOC_NAME(set_thread_cpubind) +#define hwloc_get_thread_cpubind HWLOC_NAME(get_thread_cpubind) + +#define hwloc_get_last_cpu_location HWLOC_NAME(get_last_cpu_location) +#define hwloc_get_proc_last_cpu_location HWLOC_NAME(get_proc_last_cpu_location) + +#define HWLOC_MEMBIND_DEFAULT HWLOC_NAME_CAPS(MEMBIND_DEFAULT) +#define HWLOC_MEMBIND_FIRSTTOUCH HWLOC_NAME_CAPS(MEMBIND_FIRSTTOUCH) +#define HWLOC_MEMBIND_BIND HWLOC_NAME_CAPS(MEMBIND_BIND) +#define HWLOC_MEMBIND_INTERLEAVE HWLOC_NAME_CAPS(MEMBIND_INTERLEAVE) +#define HWLOC_MEMBIND_NEXTTOUCH HWLOC_NAME_CAPS(MEMBIND_NEXTTOUCH) +#define HWLOC_MEMBIND_MIXED HWLOC_NAME_CAPS(MEMBIND_MIXED) + +#define hwloc_membind_policy_t HWLOC_NAME(membind_policy_t) + +#define HWLOC_MEMBIND_PROCESS HWLOC_NAME_CAPS(MEMBIND_PROCESS) +#define HWLOC_MEMBIND_THREAD HWLOC_NAME_CAPS(MEMBIND_THREAD) +#define HWLOC_MEMBIND_STRICT HWLOC_NAME_CAPS(MEMBIND_STRICT) +#define HWLOC_MEMBIND_MIGRATE HWLOC_NAME_CAPS(MEMBIND_MIGRATE) +#define HWLOC_MEMBIND_NOCPUBIND HWLOC_NAME_CAPS(MEMBIND_NOCPUBIND) +#define HWLOC_MEMBIND_BYNODESET HWLOC_NAME_CAPS(MEMBIND_BYNODESET) + +#define hwloc_membind_flags_t HWLOC_NAME(membind_flags_t) + +#define hwloc_set_membind HWLOC_NAME(set_membind) +#define hwloc_get_membind HWLOC_NAME(get_membind) +#define hwloc_set_proc_membind HWLOC_NAME(set_proc_membind) +#define hwloc_get_proc_membind HWLOC_NAME(get_proc_membind) +#define hwloc_set_area_membind HWLOC_NAME(set_area_membind) +#define hwloc_get_area_membind HWLOC_NAME(get_area_membind) +#define hwloc_get_area_memlocation HWLOC_NAME(get_area_memlocation) +#define hwloc_alloc_membind HWLOC_NAME(alloc_membind) +#define hwloc_alloc HWLOC_NAME(alloc) +#define hwloc_free HWLOC_NAME(free) + +#define hwloc_get_non_io_ancestor_obj HWLOC_NAME(get_non_io_ancestor_obj) +#define hwloc_get_next_pcidev HWLOC_NAME(get_next_pcidev) +#define hwloc_get_pcidev_by_busid HWLOC_NAME(get_pcidev_by_busid) +#define hwloc_get_pcidev_by_busidstring HWLOC_NAME(get_pcidev_by_busidstring) +#define hwloc_get_next_osdev HWLOC_NAME(get_next_osdev) +#define hwloc_get_next_bridge HWLOC_NAME(get_next_bridge) +#define hwloc_bridge_covers_pcibus HWLOC_NAME(bridge_covers_pcibus) + +/* hwloc/bitmap.h */ + +#define hwloc_bitmap_s HWLOC_NAME(bitmap_s) +#define hwloc_bitmap_t HWLOC_NAME(bitmap_t) +#define hwloc_const_bitmap_t HWLOC_NAME(const_bitmap_t) + +#define hwloc_bitmap_alloc HWLOC_NAME(bitmap_alloc) +#define hwloc_bitmap_alloc_full HWLOC_NAME(bitmap_alloc_full) +#define hwloc_bitmap_free HWLOC_NAME(bitmap_free) +#define hwloc_bitmap_dup HWLOC_NAME(bitmap_dup) +#define hwloc_bitmap_copy HWLOC_NAME(bitmap_copy) +#define 
hwloc_bitmap_snprintf HWLOC_NAME(bitmap_snprintf) +#define hwloc_bitmap_asprintf HWLOC_NAME(bitmap_asprintf) +#define hwloc_bitmap_sscanf HWLOC_NAME(bitmap_sscanf) +#define hwloc_bitmap_list_snprintf HWLOC_NAME(bitmap_list_snprintf) +#define hwloc_bitmap_list_asprintf HWLOC_NAME(bitmap_list_asprintf) +#define hwloc_bitmap_list_sscanf HWLOC_NAME(bitmap_list_sscanf) +#define hwloc_bitmap_taskset_snprintf HWLOC_NAME(bitmap_taskset_snprintf) +#define hwloc_bitmap_taskset_asprintf HWLOC_NAME(bitmap_taskset_asprintf) +#define hwloc_bitmap_taskset_sscanf HWLOC_NAME(bitmap_taskset_sscanf) +#define hwloc_bitmap_zero HWLOC_NAME(bitmap_zero) +#define hwloc_bitmap_fill HWLOC_NAME(bitmap_fill) +#define hwloc_bitmap_from_ulong HWLOC_NAME(bitmap_from_ulong) +#define hwloc_bitmap_from_ulongs HWLOC_NAME(bitmap_from_ulongs) +#define hwloc_bitmap_from_ith_ulong HWLOC_NAME(bitmap_from_ith_ulong) +#define hwloc_bitmap_to_ulong HWLOC_NAME(bitmap_to_ulong) +#define hwloc_bitmap_to_ith_ulong HWLOC_NAME(bitmap_to_ith_ulong) +#define hwloc_bitmap_to_ulongs HWLOC_NAME(bitmap_to_ulongs) +#define hwloc_bitmap_nr_ulongs HWLOC_NAME(bitmap_nr_ulongs) +#define hwloc_bitmap_only HWLOC_NAME(bitmap_only) +#define hwloc_bitmap_allbut HWLOC_NAME(bitmap_allbut) +#define hwloc_bitmap_set HWLOC_NAME(bitmap_set) +#define hwloc_bitmap_set_range HWLOC_NAME(bitmap_set_range) +#define hwloc_bitmap_set_ith_ulong HWLOC_NAME(bitmap_set_ith_ulong) +#define hwloc_bitmap_clr HWLOC_NAME(bitmap_clr) +#define hwloc_bitmap_clr_range HWLOC_NAME(bitmap_clr_range) +#define hwloc_bitmap_isset HWLOC_NAME(bitmap_isset) +#define hwloc_bitmap_iszero HWLOC_NAME(bitmap_iszero) +#define hwloc_bitmap_isfull HWLOC_NAME(bitmap_isfull) +#define hwloc_bitmap_isequal HWLOC_NAME(bitmap_isequal) +#define hwloc_bitmap_intersects HWLOC_NAME(bitmap_intersects) +#define hwloc_bitmap_isincluded HWLOC_NAME(bitmap_isincluded) +#define hwloc_bitmap_or HWLOC_NAME(bitmap_or) +#define hwloc_bitmap_and HWLOC_NAME(bitmap_and) +#define hwloc_bitmap_andnot HWLOC_NAME(bitmap_andnot) +#define hwloc_bitmap_xor HWLOC_NAME(bitmap_xor) +#define hwloc_bitmap_not HWLOC_NAME(bitmap_not) +#define hwloc_bitmap_first HWLOC_NAME(bitmap_first) +#define hwloc_bitmap_last HWLOC_NAME(bitmap_last) +#define hwloc_bitmap_next HWLOC_NAME(bitmap_next) +#define hwloc_bitmap_first_unset HWLOC_NAME(bitmap_first_unset) +#define hwloc_bitmap_last_unset HWLOC_NAME(bitmap_last_unset) +#define hwloc_bitmap_next_unset HWLOC_NAME(bitmap_next_unset) +#define hwloc_bitmap_singlify HWLOC_NAME(bitmap_singlify) +#define hwloc_bitmap_compare_first HWLOC_NAME(bitmap_compare_first) +#define hwloc_bitmap_compare HWLOC_NAME(bitmap_compare) +#define hwloc_bitmap_weight HWLOC_NAME(bitmap_weight) + +/* hwloc/helper.h */ + +#define hwloc_get_type_or_below_depth HWLOC_NAME(get_type_or_below_depth) +#define hwloc_get_type_or_above_depth HWLOC_NAME(get_type_or_above_depth) +#define hwloc_get_root_obj HWLOC_NAME(get_root_obj) +#define hwloc_get_ancestor_obj_by_depth HWLOC_NAME(get_ancestor_obj_by_depth) +#define hwloc_get_ancestor_obj_by_type HWLOC_NAME(get_ancestor_obj_by_type) +#define hwloc_get_next_obj_by_depth HWLOC_NAME(get_next_obj_by_depth) +#define hwloc_get_next_obj_by_type HWLOC_NAME(get_next_obj_by_type) +#define hwloc_bitmap_singlify_per_core HWLOC_NAME(bitmap_singlify_per_core) +#define hwloc_get_pu_obj_by_os_index HWLOC_NAME(get_pu_obj_by_os_index) +#define hwloc_get_numanode_obj_by_os_index HWLOC_NAME(get_numanode_obj_by_os_index) +#define hwloc_get_next_child HWLOC_NAME(get_next_child) +#define
hwloc_get_common_ancestor_obj HWLOC_NAME(get_common_ancestor_obj) +#define hwloc_obj_is_in_subtree HWLOC_NAME(obj_is_in_subtree) +#define hwloc_get_first_largest_obj_inside_cpuset HWLOC_NAME(get_first_largest_obj_inside_cpuset) +#define hwloc_get_largest_objs_inside_cpuset HWLOC_NAME(get_largest_objs_inside_cpuset) +#define hwloc_get_next_obj_inside_cpuset_by_depth HWLOC_NAME(get_next_obj_inside_cpuset_by_depth) +#define hwloc_get_next_obj_inside_cpuset_by_type HWLOC_NAME(get_next_obj_inside_cpuset_by_type) +#define hwloc_get_obj_inside_cpuset_by_depth HWLOC_NAME(get_obj_inside_cpuset_by_depth) +#define hwloc_get_obj_inside_cpuset_by_type HWLOC_NAME(get_obj_inside_cpuset_by_type) +#define hwloc_get_nbobjs_inside_cpuset_by_depth HWLOC_NAME(get_nbobjs_inside_cpuset_by_depth) +#define hwloc_get_nbobjs_inside_cpuset_by_type HWLOC_NAME(get_nbobjs_inside_cpuset_by_type) +#define hwloc_get_obj_index_inside_cpuset HWLOC_NAME(get_obj_index_inside_cpuset) +#define hwloc_get_child_covering_cpuset HWLOC_NAME(get_child_covering_cpuset) +#define hwloc_get_obj_covering_cpuset HWLOC_NAME(get_obj_covering_cpuset) +#define hwloc_get_next_obj_covering_cpuset_by_depth HWLOC_NAME(get_next_obj_covering_cpuset_by_depth) +#define hwloc_get_next_obj_covering_cpuset_by_type HWLOC_NAME(get_next_obj_covering_cpuset_by_type) +#define hwloc_obj_type_is_normal HWLOC_NAME(obj_type_is_normal) +#define hwloc_obj_type_is_memory HWLOC_NAME(obj_type_is_memory) +#define hwloc_obj_type_is_io HWLOC_NAME(obj_type_is_io) +#define hwloc_obj_type_is_cache HWLOC_NAME(obj_type_is_cache) +#define hwloc_obj_type_is_dcache HWLOC_NAME(obj_type_is_dcache) +#define hwloc_obj_type_is_icache HWLOC_NAME(obj_type_is_icache) +#define hwloc_get_cache_type_depth HWLOC_NAME(get_cache_type_depth) +#define hwloc_get_cache_covering_cpuset HWLOC_NAME(get_cache_covering_cpuset) +#define hwloc_get_shared_cache_covering_obj HWLOC_NAME(get_shared_cache_covering_obj) +#define hwloc_get_closest_objs HWLOC_NAME(get_closest_objs) +#define hwloc_get_obj_below_by_type HWLOC_NAME(get_obj_below_by_type) +#define hwloc_get_obj_below_array_by_type HWLOC_NAME(get_obj_below_array_by_type) +#define hwloc_get_obj_with_same_locality HWLOC_NAME(get_obj_with_same_locality) +#define hwloc_distrib_flags_e HWLOC_NAME(distrib_flags_e) +#define HWLOC_DISTRIB_FLAG_REVERSE HWLOC_NAME_CAPS(DISTRIB_FLAG_REVERSE) +#define hwloc_distrib HWLOC_NAME(distrib) +#define hwloc_alloc_membind_policy HWLOC_NAME(alloc_membind_policy) +#define hwloc_alloc_membind_policy_nodeset HWLOC_NAME(alloc_membind_policy_nodeset) +#define hwloc_topology_get_complete_cpuset HWLOC_NAME(topology_get_complete_cpuset) +#define hwloc_topology_get_topology_cpuset HWLOC_NAME(topology_get_topology_cpuset) +#define hwloc_topology_get_allowed_cpuset HWLOC_NAME(topology_get_allowed_cpuset) +#define hwloc_topology_get_complete_nodeset HWLOC_NAME(topology_get_complete_nodeset) +#define hwloc_topology_get_topology_nodeset HWLOC_NAME(topology_get_topology_nodeset) +#define hwloc_topology_get_allowed_nodeset HWLOC_NAME(topology_get_allowed_nodeset) +#define hwloc_cpuset_to_nodeset HWLOC_NAME(cpuset_to_nodeset) +#define hwloc_cpuset_from_nodeset HWLOC_NAME(cpuset_from_nodeset) + +/* memattrs.h */ + +#define hwloc_memattr_id_e HWLOC_NAME(memattr_id_e) +#define HWLOC_MEMATTR_ID_CAPACITY HWLOC_NAME_CAPS(MEMATTR_ID_CAPACITY) +#define HWLOC_MEMATTR_ID_LOCALITY HWLOC_NAME_CAPS(MEMATTR_ID_LOCALITY) +#define HWLOC_MEMATTR_ID_BANDWIDTH HWLOC_NAME_CAPS(MEMATTR_ID_BANDWIDTH) +#define HWLOC_MEMATTR_ID_LATENCY 
HWLOC_NAME_CAPS(MEMATTR_ID_LATENCY) + +#define hwloc_memattr_id_t HWLOC_NAME(memattr_id_t) +#define hwloc_memattr_get_by_name HWLOC_NAME(memattr_get_by_name) + +#define hwloc_location HWLOC_NAME(location) +#define hwloc_location_type_e HWLOC_NAME(location_type_e) +#define HWLOC_LOCATION_TYPE_OBJECT HWLOC_NAME_CAPS(LOCATION_TYPE_OBJECT) +#define HWLOC_LOCATION_TYPE_CPUSET HWLOC_NAME_CAPS(LOCATION_TYPE_CPUSET) +#define hwloc_location_u HWLOC_NAME(location_u) + +#define hwloc_memattr_get_value HWLOC_NAME(memattr_get_value) +#define hwloc_memattr_get_best_target HWLOC_NAME(memattr_get_best_target) +#define hwloc_memattr_get_best_initiator HWLOC_NAME(memattr_get_best_initiator) + +#define hwloc_local_numanode_flag_e HWLOC_NAME(local_numanode_flag_e) +#define HWLOC_LOCAL_NUMANODE_FLAG_LARGER_LOCALITY HWLOC_NAME_CAPS(LOCAL_NUMANODE_FLAG_LARGER_LOCALITY) +#define HWLOC_LOCAL_NUMANODE_FLAG_SMALLER_LOCALITY HWLOC_NAME_CAPS(LOCAL_NUMANODE_FLAG_SMALLER_LOCALITY) +#define HWLOC_LOCAL_NUMANODE_FLAG_ALL HWLOC_NAME_CAPS(LOCAL_NUMANODE_FLAG_ALL) +#define hwloc_get_local_numanode_objs HWLOC_NAME(get_local_numanode_objs) + +#define hwloc_memattr_get_name HWLOC_NAME(memattr_get_name) +#define hwloc_memattr_get_flags HWLOC_NAME(memattr_get_flags) +#define hwloc_memattr_flag_e HWLOC_NAME(memattr_flag_e) +#define HWLOC_MEMATTR_FLAG_HIGHER_FIRST HWLOC_NAME_CAPS(MEMATTR_FLAG_HIGHER_FIRST) +#define HWLOC_MEMATTR_FLAG_LOWER_FIRST HWLOC_NAME_CAPS(MEMATTR_FLAG_LOWER_FIRST) +#define HWLOC_MEMATTR_FLAG_NEED_INITIATOR HWLOC_NAME_CAPS(MEMATTR_FLAG_NEED_INITIATOR) +#define hwloc_memattr_register HWLOC_NAME(memattr_register) +#define hwloc_memattr_set_value HWLOC_NAME(memattr_set_value) +#define hwloc_memattr_get_targets HWLOC_NAME(memattr_get_targets) +#define hwloc_memattr_get_initiators HWLOC_NAME(memattr_get_initiators) + +/* cpukinds.h */ + +#define hwloc_cpukinds_get_nr HWLOC_NAME(cpukinds_get_nr) +#define hwloc_cpukinds_get_by_cpuset HWLOC_NAME(cpukinds_get_by_cpuset) +#define hwloc_cpukinds_get_info HWLOC_NAME(cpukinds_get_info) +#define hwloc_cpukinds_register HWLOC_NAME(cpukinds_register) + +/* export.h */ + +#define hwloc_topology_export_xml_flags_e HWLOC_NAME(topology_export_xml_flags_e) +#define HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1 HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_XML_FLAG_V1) +#define hwloc_topology_export_xml HWLOC_NAME(topology_export_xml) +#define hwloc_topology_export_xmlbuffer HWLOC_NAME(topology_export_xmlbuffer) +#define hwloc_free_xmlbuffer HWLOC_NAME(free_xmlbuffer) +#define hwloc_topology_set_userdata_export_callback HWLOC_NAME(topology_set_userdata_export_callback) +#define hwloc_export_obj_userdata HWLOC_NAME(export_obj_userdata) +#define hwloc_export_obj_userdata_base64 HWLOC_NAME(export_obj_userdata_base64) +#define hwloc_topology_set_userdata_import_callback HWLOC_NAME(topology_set_userdata_import_callback) + +#define hwloc_topology_export_synthetic_flags_e HWLOC_NAME(topology_export_synthetic_flags_e) +#define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES) +#define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS) +#define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_V1 HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_V1) +#define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_IGNORE_MEMORY HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_IGNORE_MEMORY) +#define hwloc_topology_export_synthetic HWLOC_NAME(topology_export_synthetic) + +/* distances.h */ + +#define hwloc_distances_s 
HWLOC_NAME(distances_s) + +#define hwloc_distances_kind_e HWLOC_NAME(distances_kind_e) +#define HWLOC_DISTANCES_KIND_FROM_OS HWLOC_NAME_CAPS(DISTANCES_KIND_FROM_OS) +#define HWLOC_DISTANCES_KIND_FROM_USER HWLOC_NAME_CAPS(DISTANCES_KIND_FROM_USER) +#define HWLOC_DISTANCES_KIND_MEANS_LATENCY HWLOC_NAME_CAPS(DISTANCES_KIND_MEANS_LATENCY) +#define HWLOC_DISTANCES_KIND_MEANS_BANDWIDTH HWLOC_NAME_CAPS(DISTANCES_KIND_MEANS_BANDWIDTH) +#define HWLOC_DISTANCES_KIND_HETEROGENEOUS_TYPES HWLOC_NAME_CAPS(DISTANCES_KIND_HETEROGENEOUS_TYPES) + +#define hwloc_distances_get HWLOC_NAME(distances_get) +#define hwloc_distances_get_by_depth HWLOC_NAME(distances_get_by_depth) +#define hwloc_distances_get_by_type HWLOC_NAME(distances_get_by_type) +#define hwloc_distances_get_by_name HWLOC_NAME(distances_get_by_name) +#define hwloc_distances_get_name HWLOC_NAME(distances_get_name) +#define hwloc_distances_release HWLOC_NAME(distances_release) +#define hwloc_distances_obj_index HWLOC_NAME(distances_obj_index) +#define hwloc_distances_obj_pair_values HWLOC_NAME(distances_pair_values) + +#define hwloc_distances_transform_e HWLOC_NAME(distances_transform_e) +#define HWLOC_DISTANCES_TRANSFORM_REMOVE_NULL HWLOC_NAME_CAPS(DISTANCES_TRANSFORM_REMOVE_NULL) +#define HWLOC_DISTANCES_TRANSFORM_LINKS HWLOC_NAME_CAPS(DISTANCES_TRANSFORM_LINKS) +#define hwloc_distances_transform HWLOC_NAME(distances_transform) + +#define hwloc_distances_add_flag_e HWLOC_NAME(distances_add_flag_e) +#define HWLOC_DISTANCES_ADD_FLAG_GROUP HWLOC_NAME_CAPS(DISTANCES_ADD_FLAG_GROUP) +#define HWLOC_DISTANCES_ADD_FLAG_GROUP_INACCURATE HWLOC_NAME_CAPS(DISTANCES_ADD_FLAG_GROUP_INACCURATE) + +#define hwloc_distances_add_handle_t HWLOC_NAME(distances_add_handle_t) +#define hwloc_distances_add_create HWLOC_NAME(distances_add_create) +#define hwloc_distances_add_values HWLOC_NAME(distances_add_values) +#define hwloc_distances_add_commit HWLOC_NAME(distances_add_commit) + +#define hwloc_distances_remove HWLOC_NAME(distances_remove) +#define hwloc_distances_remove_by_depth HWLOC_NAME(distances_remove_by_depth) +#define hwloc_distances_remove_by_type HWLOC_NAME(distances_remove_by_type) +#define hwloc_distances_release_remove HWLOC_NAME(distances_release_remove) + +/* diff.h */ + +#define hwloc_topology_diff_obj_attr_type_e HWLOC_NAME(topology_diff_obj_attr_type_e) +#define hwloc_topology_diff_obj_attr_type_t HWLOC_NAME(topology_diff_obj_attr_type_t) +#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR_SIZE) +#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR_NAME) +#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR_INFO) +#define hwloc_topology_diff_obj_attr_u HWLOC_NAME(topology_diff_obj_attr_u) +#define hwloc_topology_diff_obj_attr_generic_s HWLOC_NAME(topology_diff_obj_attr_generic_s) +#define hwloc_topology_diff_obj_attr_uint64_s HWLOC_NAME(topology_diff_obj_attr_uint64_s) +#define hwloc_topology_diff_obj_attr_string_s HWLOC_NAME(topology_diff_obj_attr_string_s) +#define hwloc_topology_diff_type_e HWLOC_NAME(topology_diff_type_e) +#define hwloc_topology_diff_type_t HWLOC_NAME(topology_diff_type_t) +#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR) +#define HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX HWLOC_NAME_CAPS(TOPOLOGY_DIFF_TOO_COMPLEX) +#define hwloc_topology_diff_u HWLOC_NAME(topology_diff_u) +#define hwloc_topology_diff_t HWLOC_NAME(topology_diff_t) +#define hwloc_topology_diff_generic_s HWLOC_NAME(topology_diff_generic_s) +#define 
hwloc_topology_diff_obj_attr_s HWLOC_NAME(topology_diff_obj_attr_s) +#define hwloc_topology_diff_too_complex_s HWLOC_NAME(topology_diff_too_complex_s) +#define hwloc_topology_diff_build HWLOC_NAME(topology_diff_build) +#define hwloc_topology_diff_apply_flags_e HWLOC_NAME(topology_diff_apply_flags_e) +#define HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE HWLOC_NAME_CAPS(TOPOLOGY_DIFF_APPLY_REVERSE) +#define hwloc_topology_diff_apply HWLOC_NAME(topology_diff_apply) +#define hwloc_topology_diff_destroy HWLOC_NAME(topology_diff_destroy) +#define hwloc_topology_diff_load_xml HWLOC_NAME(topology_diff_load_xml) +#define hwloc_topology_diff_export_xml HWLOC_NAME(topology_diff_export_xml) +#define hwloc_topology_diff_load_xmlbuffer HWLOC_NAME(topology_diff_load_xmlbuffer) +#define hwloc_topology_diff_export_xmlbuffer HWLOC_NAME(topology_diff_export_xmlbuffer) + +/* shmem.h */ + +#define hwloc_shmem_topology_get_length HWLOC_NAME(shmem_topology_get_length) +#define hwloc_shmem_topology_write HWLOC_NAME(shmem_topology_write) +#define hwloc_shmem_topology_adopt HWLOC_NAME(shmem_topology_adopt) + +/* glibc-sched.h */ + +#define hwloc_cpuset_to_glibc_sched_affinity HWLOC_NAME(cpuset_to_glibc_sched_affinity) +#define hwloc_cpuset_from_glibc_sched_affinity HWLOC_NAME(cpuset_from_glibc_sched_affinity) + +/* linux-libnuma.h */ + +#define hwloc_cpuset_to_linux_libnuma_ulongs HWLOC_NAME(cpuset_to_linux_libnuma_ulongs) +#define hwloc_nodeset_to_linux_libnuma_ulongs HWLOC_NAME(nodeset_to_linux_libnuma_ulongs) +#define hwloc_cpuset_from_linux_libnuma_ulongs HWLOC_NAME(cpuset_from_linux_libnuma_ulongs) +#define hwloc_nodeset_from_linux_libnuma_ulongs HWLOC_NAME(nodeset_from_linux_libnuma_ulongs) +#define hwloc_cpuset_to_linux_libnuma_bitmask HWLOC_NAME(cpuset_to_linux_libnuma_bitmask) +#define hwloc_nodeset_to_linux_libnuma_bitmask HWLOC_NAME(nodeset_to_linux_libnuma_bitmask) +#define hwloc_cpuset_from_linux_libnuma_bitmask HWLOC_NAME(cpuset_from_linux_libnuma_bitmask) +#define hwloc_nodeset_from_linux_libnuma_bitmask HWLOC_NAME(nodeset_from_linux_libnuma_bitmask) + +/* linux.h */ + +#define hwloc_linux_set_tid_cpubind HWLOC_NAME(linux_set_tid_cpubind) +#define hwloc_linux_get_tid_cpubind HWLOC_NAME(linux_get_tid_cpubind) +#define hwloc_linux_get_tid_last_cpu_location HWLOC_NAME(linux_get_tid_last_cpu_location) +#define hwloc_linux_read_path_as_cpumask HWLOC_NAME(linux_read_file_cpumask) + +/* windows.h */ + +#define hwloc_windows_get_nr_processor_groups HWLOC_NAME(windows_get_nr_processor_groups) +#define hwloc_windows_get_processor_group_cpuset HWLOC_NAME(windows_get_processor_group_cpuset) + +/* openfabrics-verbs.h */ + +#define hwloc_ibv_get_device_cpuset HWLOC_NAME(ibv_get_device_cpuset) +#define hwloc_ibv_get_device_osdev HWLOC_NAME(ibv_get_device_osdev) +#define hwloc_ibv_get_device_osdev_by_name HWLOC_NAME(ibv_get_device_osdev_by_name) + +/* opencl.h */ + +#define hwloc_cl_device_topology_amd HWLOC_NAME(cl_device_topology_amd) +#define hwloc_opencl_get_device_pci_busid HWLOC_NAME(opencl_get_device_pci_ids) +#define hwloc_opencl_get_device_cpuset HWLOC_NAME(opencl_get_device_cpuset) +#define hwloc_opencl_get_device_osdev HWLOC_NAME(opencl_get_device_osdev) +#define hwloc_opencl_get_device_osdev_by_index HWLOC_NAME(opencl_get_device_osdev_by_index) + +/* cuda.h */ + +#define hwloc_cuda_get_device_pci_ids HWLOC_NAME(cuda_get_device_pci_ids) +#define hwloc_cuda_get_device_cpuset HWLOC_NAME(cuda_get_device_cpuset) +#define hwloc_cuda_get_device_pcidev HWLOC_NAME(cuda_get_device_pcidev) +#define hwloc_cuda_get_device_osdev 
HWLOC_NAME(cuda_get_device_osdev) +#define hwloc_cuda_get_device_osdev_by_index HWLOC_NAME(cuda_get_device_osdev_by_index) + +/* cudart.h */ + +#define hwloc_cudart_get_device_pci_ids HWLOC_NAME(cudart_get_device_pci_ids) +#define hwloc_cudart_get_device_cpuset HWLOC_NAME(cudart_get_device_cpuset) +#define hwloc_cudart_get_device_pcidev HWLOC_NAME(cudart_get_device_pcidev) +#define hwloc_cudart_get_device_osdev_by_index HWLOC_NAME(cudart_get_device_osdev_by_index) + +/* nvml.h */ + +#define hwloc_nvml_get_device_cpuset HWLOC_NAME(nvml_get_device_cpuset) +#define hwloc_nvml_get_device_osdev HWLOC_NAME(nvml_get_device_osdev) +#define hwloc_nvml_get_device_osdev_by_index HWLOC_NAME(nvml_get_device_osdev_by_index) + +/* rsmi.h */ + +#define hwloc_rsmi_get_device_cpuset HWLOC_NAME(rsmi_get_device_cpuset) +#define hwloc_rsmi_get_device_osdev HWLOC_NAME(rsmi_get_device_osdev) +#define hwloc_rsmi_get_device_osdev_by_index HWLOC_NAME(rsmi_get_device_osdev_by_index) + +/* levelzero.h */ + +#define hwloc_levelzero_get_device_cpuset HWLOC_NAME(levelzero_get_device_cpuset) +#define hwloc_levelzero_get_device_osdev HWLOC_NAME(levelzero_get_device_osdev) + +/* gl.h */ + +#define hwloc_gl_get_display_osdev_by_port_device HWLOC_NAME(gl_get_display_osdev_by_port_device) +#define hwloc_gl_get_display_osdev_by_name HWLOC_NAME(gl_get_display_osdev_by_name) +#define hwloc_gl_get_display_by_osdev HWLOC_NAME(gl_get_display_by_osdev) + +/* hwloc/plugins.h */ + +#define hwloc_disc_phase_e HWLOC_NAME(disc_phase_e) +#define HWLOC_DISC_PHASE_GLOBAL HWLOC_NAME_CAPS(DISC_PHASE_GLOBAL) +#define HWLOC_DISC_PHASE_CPU HWLOC_NAME_CAPS(DISC_PHASE_CPU) +#define HWLOC_DISC_PHASE_MEMORY HWLOC_NAME_CAPS(DISC_PHASE_MEMORY) +#define HWLOC_DISC_PHASE_PCI HWLOC_NAME_CAPS(DISC_PHASE_PCI) +#define HWLOC_DISC_PHASE_IO HWLOC_NAME_CAPS(DISC_PHASE_IO) +#define HWLOC_DISC_PHASE_MISC HWLOC_NAME_CAPS(DISC_PHASE_MISC) +#define HWLOC_DISC_PHASE_ANNOTATE HWLOC_NAME_CAPS(DISC_PHASE_ANNOTATE) +#define HWLOC_DISC_PHASE_TWEAK HWLOC_NAME_CAPS(DISC_PHASE_TWEAK) +#define hwloc_disc_phase_t HWLOC_NAME(disc_phase_t) +#define hwloc_disc_component HWLOC_NAME(disc_component) + +#define hwloc_disc_status_flag_e HWLOC_NAME(disc_status_flag_e) +#define HWLOC_DISC_STATUS_FLAG_GOT_ALLOWED_RESOURCES HWLOC_NAME_CAPS(DISC_STATUS_FLAG_GOT_ALLOWED_RESOURCES) +#define hwloc_disc_status HWLOC_NAME(disc_status) + +#define hwloc_backend HWLOC_NAME(backend) + +#define hwloc_backend_alloc HWLOC_NAME(backend_alloc) +#define hwloc_backend_enable HWLOC_NAME(backend_enable) + +#define hwloc_component_type_e HWLOC_NAME(component_type_e) +#define HWLOC_COMPONENT_TYPE_DISC HWLOC_NAME_CAPS(COMPONENT_TYPE_DISC) +#define HWLOC_COMPONENT_TYPE_XML HWLOC_NAME_CAPS(COMPONENT_TYPE_XML) +#define hwloc_component_type_t HWLOC_NAME(component_type_t) +#define hwloc_component HWLOC_NAME(component) + +#define hwloc_plugin_check_namespace HWLOC_NAME(plugin_check_namespace) + +#define hwloc_hide_errors HWLOC_NAME(hide_errors) +#define hwloc__insert_object_by_cpuset HWLOC_NAME(_insert_object_by_cpuset) +#define hwloc_insert_object_by_parent HWLOC_NAME(insert_object_by_parent) +#define hwloc_alloc_setup_object HWLOC_NAME(alloc_setup_object) +#define hwloc_obj_add_children_sets HWLOC_NAME(add_children_sets) +#define hwloc_topology_reconnect HWLOC_NAME(topology_reconnect) + +#define hwloc_filter_check_pcidev_subtype_important HWLOC_NAME(filter_check_pcidev_subtype_important) +#define hwloc_filter_check_osdev_subtype_important HWLOC_NAME(filter_check_osdev_subtype_important) +#define 
hwloc_filter_check_keep_object_type HWLOC_NAME(filter_check_keep_object_type) +#define hwloc_filter_check_keep_object HWLOC_NAME(filter_check_keep_object) + +#define hwloc_pcidisc_find_cap HWLOC_NAME(pcidisc_find_cap) +#define hwloc_pcidisc_find_linkspeed HWLOC_NAME(pcidisc_find_linkspeed) +#define hwloc_pcidisc_check_bridge_type HWLOC_NAME(pcidisc_check_bridge_type) +#define hwloc_pcidisc_find_bridge_buses HWLOC_NAME(pcidisc_find_bridge_buses) +#define hwloc_pcidisc_tree_insert_by_busid HWLOC_NAME(pcidisc_tree_insert_by_busid) +#define hwloc_pcidisc_tree_attach HWLOC_NAME(pcidisc_tree_attach) + +#define hwloc_pci_find_by_busid HWLOC_NAME(pcidisc_find_by_busid) +#define hwloc_pci_find_parent_by_busid HWLOC_NAME(pcidisc_find_busid_parent) + +#define hwloc_backend_distances_add_handle_t HWLOC_NAME(backend_distances_add_handle_t) +#define hwloc_backend_distances_add_create HWLOC_NAME(backend_distances_add_create) +#define hwloc_backend_distances_add_values HWLOC_NAME(backend_distances_add_values) +#define hwloc_backend_distances_add_commit HWLOC_NAME(backend_distances_add_commit) + +/* hwloc/deprecated.h */ + +#define hwloc_distances_add HWLOC_NAME(distances_add) + +#define hwloc_topology_insert_misc_object_by_parent HWLOC_NAME(topology_insert_misc_object_by_parent) +#define hwloc_obj_cpuset_snprintf HWLOC_NAME(obj_cpuset_snprintf) +#define hwloc_obj_type_sscanf HWLOC_NAME(obj_type_sscanf) + +#define hwloc_set_membind_nodeset HWLOC_NAME(set_membind_nodeset) +#define hwloc_get_membind_nodeset HWLOC_NAME(get_membind_nodeset) +#define hwloc_set_proc_membind_nodeset HWLOC_NAME(set_proc_membind_nodeset) +#define hwloc_get_proc_membind_nodeset HWLOC_NAME(get_proc_membind_nodeset) +#define hwloc_set_area_membind_nodeset HWLOC_NAME(set_area_membind_nodeset) +#define hwloc_get_area_membind_nodeset HWLOC_NAME(get_area_membind_nodeset) +#define hwloc_alloc_membind_nodeset HWLOC_NAME(alloc_membind_nodeset) + +#define hwloc_cpuset_to_nodeset_strict HWLOC_NAME(cpuset_to_nodeset_strict) +#define hwloc_cpuset_from_nodeset_strict HWLOC_NAME(cpuset_from_nodeset_strict) + +/* private/debug.h */ + +#define hwloc_debug_enabled HWLOC_NAME(debug_enabled) +#define hwloc_debug HWLOC_NAME(debug) + +/* private/misc.h */ + +#ifndef HWLOC_HAVE_CORRECT_SNPRINTF +#define hwloc_snprintf HWLOC_NAME(snprintf) +#endif +#define hwloc_ffsl_manual HWLOC_NAME(ffsl_manual) +#define hwloc_ffs32 HWLOC_NAME(ffs32) +#define hwloc_ffsl_from_ffs32 HWLOC_NAME(ffsl_from_ffs32) +#define hwloc_flsl_manual HWLOC_NAME(flsl_manual) +#define hwloc_fls32 HWLOC_NAME(fls32) +#define hwloc_flsl_from_fls32 HWLOC_NAME(flsl_from_fls32) +#define hwloc_weight_long HWLOC_NAME(weight_long) +#define hwloc_strncasecmp HWLOC_NAME(strncasecmp) + +#define hwloc_bitmap_compare_inclusion HWLOC_NAME(bitmap_compare_inclusion) + +#define hwloc_pci_class_string HWLOC_NAME(pci_class_string) +#define hwloc_linux_pci_link_speed_from_string HWLOC_NAME(linux_pci_link_speed_from_string) + +#define hwloc_cache_type_by_depth_type HWLOC_NAME(cache_type_by_depth_type) +#define hwloc__obj_type_is_normal HWLOC_NAME(_obj_type_is_normal) +#define hwloc__obj_type_is_memory HWLOC_NAME(_obj_type_is_memory) +#define hwloc__obj_type_is_io HWLOC_NAME(_obj_type_is_io) +#define hwloc__obj_type_is_special HWLOC_NAME(_obj_type_is_special) + +#define hwloc__obj_type_is_cache HWLOC_NAME(_obj_type_is_cache) +#define hwloc__obj_type_is_dcache HWLOC_NAME(_obj_type_is_dcache) +#define hwloc__obj_type_is_icache HWLOC_NAME(_obj_type_is_icache) + +/* private/cpuid-x86.h */ + +#define 
hwloc_have_x86_cpuid HWLOC_NAME(have_x86_cpuid) +#define hwloc_x86_cpuid HWLOC_NAME(x86_cpuid) + +/* private/xml.h */ + +#define hwloc__xml_verbose HWLOC_NAME(_xml_verbose) + +#define hwloc__xml_import_state_s HWLOC_NAME(_xml_import_state_s) +#define hwloc__xml_import_state_t HWLOC_NAME(_xml_import_state_t) +#define hwloc__xml_import_diff HWLOC_NAME(_xml_import_diff) +#define hwloc_xml_backend_data_s HWLOC_NAME(xml_backend_data_s) +#define hwloc__xml_export_state_s HWLOC_NAME(_xml_export_state_s) +#define hwloc__xml_export_state_t HWLOC_NAME(_xml_export_state_t) +#define hwloc__xml_export_data_s HWLOC_NAME(_xml_export_data_s) +#define hwloc__xml_export_topology HWLOC_NAME(_xml_export_topology) +#define hwloc__xml_export_diff HWLOC_NAME(_xml_export_diff) + +#define hwloc_xml_callbacks HWLOC_NAME(xml_callbacks) +#define hwloc_xml_component HWLOC_NAME(xml_component) +#define hwloc_xml_callbacks_register HWLOC_NAME(xml_callbacks_register) +#define hwloc_xml_callbacks_reset HWLOC_NAME(xml_callbacks_reset) + +#define hwloc__xml_imported_v1distances_s HWLOC_NAME(_xml_imported_v1distances_s) + +/* private/components.h */ + +#define hwloc_disc_component_force_enable HWLOC_NAME(disc_component_force_enable) +#define hwloc_disc_components_enable_others HWLOC_NAME(disc_components_instantiate_others) + +#define hwloc_backends_is_thissystem HWLOC_NAME(backends_is_thissystem) +#define hwloc_backends_find_callbacks HWLOC_NAME(backends_find_callbacks) + +#define hwloc_topology_components_init HWLOC_NAME(topology_components_init) +#define hwloc_backends_disable_all HWLOC_NAME(backends_disable_all) +#define hwloc_topology_components_fini HWLOC_NAME(topology_components_fini) + +#define hwloc_components_init HWLOC_NAME(components_init) +#define hwloc_components_fini HWLOC_NAME(components_fini) + +/* private/internal-private.h */ + +#define hwloc_xml_component HWLOC_NAME(xml_component) +#define hwloc_synthetic_component HWLOC_NAME(synthetic_component) + +#define hwloc_aix_component HWLOC_NAME(aix_component) +#define hwloc_bgq_component HWLOC_NAME(bgq_component) +#define hwloc_darwin_component HWLOC_NAME(darwin_component) +#define hwloc_freebsd_component HWLOC_NAME(freebsd_component) +#define hwloc_hpux_component HWLOC_NAME(hpux_component) +#define hwloc_linux_component HWLOC_NAME(linux_component) +#define hwloc_netbsd_component HWLOC_NAME(netbsd_component) +#define hwloc_noos_component HWLOC_NAME(noos_component) +#define hwloc_solaris_component HWLOC_NAME(solaris_component) +#define hwloc_windows_component HWLOC_NAME(windows_component) +#define hwloc_x86_component HWLOC_NAME(x86_component) + +#define hwloc_cuda_component HWLOC_NAME(cuda_component) +#define hwloc_gl_component HWLOC_NAME(gl_component) +#define hwloc_levelzero_component HWLOC_NAME(levelzero_component) +#define hwloc_nvml_component HWLOC_NAME(nvml_component) +#define hwloc_rsmi_component HWLOC_NAME(rsmi_component) +#define hwloc_opencl_component HWLOC_NAME(opencl_component) +#define hwloc_pci_component HWLOC_NAME(pci_component) + +#define hwloc_xml_libxml_component HWLOC_NAME(xml_libxml_component) +#define hwloc_xml_nolibxml_component HWLOC_NAME(xml_nolibxml_component) + +/* private/private.h */ + +#define hwloc_internal_location_s HWLOC_NAME(internal_location_s) + +#define hwloc_special_level_s HWLOC_NAME(special_level_s) + +#define hwloc_pci_forced_locality_s HWLOC_NAME(pci_forced_locality_s) +#define hwloc_pci_locality_s HWLOC_NAME(pci_locality_s) + +#define hwloc_topology_forced_component_s HWLOC_NAME(topology_forced_component) + +#define 
hwloc_alloc_root_sets HWLOC_NAME(alloc_root_sets) +#define hwloc_setup_pu_level HWLOC_NAME(setup_pu_level) +#define hwloc_get_sysctlbyname HWLOC_NAME(get_sysctlbyname) +#define hwloc_get_sysctl HWLOC_NAME(get_sysctl) +#define hwloc_fallback_nbprocessors HWLOC_NAME(fallback_nbprocessors) +#define hwloc_fallback_memsize HWLOC_NAME(fallback_memsize) + +#define hwloc__object_cpusets_compare_first HWLOC_NAME(_object_cpusets_compare_first) +#define hwloc__reorder_children HWLOC_NAME(_reorder_children) + +#define hwloc_topology_setup_defaults HWLOC_NAME(topology_setup_defaults) +#define hwloc_topology_clear HWLOC_NAME(topology_clear) + +#define hwloc__attach_memory_object HWLOC_NAME(insert_memory_object) + +#define hwloc_get_obj_by_type_and_gp_index HWLOC_NAME(get_obj_by_type_and_gp_index) + +#define hwloc_pci_discovery_init HWLOC_NAME(pci_discovery_init) +#define hwloc_pci_discovery_prepare HWLOC_NAME(pci_discovery_prepare) +#define hwloc_pci_discovery_exit HWLOC_NAME(pci_discovery_exit) +#define hwloc_find_insert_io_parent_by_complete_cpuset HWLOC_NAME(hwloc_find_insert_io_parent_by_complete_cpuset) + +#define hwloc__add_info HWLOC_NAME(_add_info) +#define hwloc__add_info_nodup HWLOC_NAME(_add_info_nodup) +#define hwloc__move_infos HWLOC_NAME(_move_infos) +#define hwloc__free_infos HWLOC_NAME(_free_infos) +#define hwloc__tma_dup_infos HWLOC_NAME(_tma_dup_infos) + +#define hwloc_binding_hooks HWLOC_NAME(binding_hooks) +#define hwloc_set_native_binding_hooks HWLOC_NAME(set_native_binding_hooks) +#define hwloc_set_binding_hooks HWLOC_NAME(set_binding_hooks) + +#define hwloc_set_linuxfs_hooks HWLOC_NAME(set_linuxfs_hooks) +#define hwloc_set_bgq_hooks HWLOC_NAME(set_bgq_hooks) +#define hwloc_set_solaris_hooks HWLOC_NAME(set_solaris_hooks) +#define hwloc_set_aix_hooks HWLOC_NAME(set_aix_hooks) +#define hwloc_set_windows_hooks HWLOC_NAME(set_windows_hooks) +#define hwloc_set_darwin_hooks HWLOC_NAME(set_darwin_hooks) +#define hwloc_set_freebsd_hooks HWLOC_NAME(set_freebsd_hooks) +#define hwloc_set_netbsd_hooks HWLOC_NAME(set_netbsd_hooks) +#define hwloc_set_hpux_hooks HWLOC_NAME(set_hpux_hooks) + +#define hwloc_look_hardwired_fujitsu_k HWLOC_NAME(look_hardwired_fujitsu_k) +#define hwloc_look_hardwired_fujitsu_fx10 HWLOC_NAME(look_hardwired_fujitsu_fx10) +#define hwloc_look_hardwired_fujitsu_fx100 HWLOC_NAME(look_hardwired_fujitsu_fx100) + +#define hwloc_add_uname_info HWLOC_NAME(add_uname_info) +#define hwloc_free_unlinked_object HWLOC_NAME(free_unlinked_object) +#define hwloc_free_object_and_children HWLOC_NAME(free_object_and_children) +#define hwloc_free_object_siblings_and_children HWLOC_NAME(free_object_siblings_and_children) + +#define hwloc_alloc_heap HWLOC_NAME(alloc_heap) +#define hwloc_alloc_mmap HWLOC_NAME(alloc_mmap) +#define hwloc_free_heap HWLOC_NAME(free_heap) +#define hwloc_free_mmap HWLOC_NAME(free_mmap) +#define hwloc_alloc_or_fail HWLOC_NAME(alloc_or_fail) + +#define hwloc_internal_distances_s HWLOC_NAME(internal_distances_s) +#define hwloc_internal_distances_init HWLOC_NAME(internal_distances_init) +#define hwloc_internal_distances_prepare HWLOC_NAME(internal_distances_prepare) +#define hwloc_internal_distances_dup HWLOC_NAME(internal_distances_dup) +#define hwloc_internal_distances_refresh HWLOC_NAME(internal_distances_refresh) +#define hwloc_internal_distances_destroy HWLOC_NAME(internal_distances_destroy) +#define hwloc_internal_distances_add HWLOC_NAME(internal_distances_add) +#define hwloc_internal_distances_add_by_index HWLOC_NAME(internal_distances_add_by_index) +#define 
hwloc_internal_distances_invalidate_cached_objs HWLOC_NAME(hwloc_internal_distances_invalidate_cached_objs) + +#define hwloc_internal_memattr_s HWLOC_NAME(internal_memattr_s) +#define hwloc_internal_memattr_target_s HWLOC_NAME(internal_memattr_target_s) +#define hwloc_internal_memattr_initiator_s HWLOC_NAME(internal_memattr_initiator_s) +#define hwloc_internal_memattrs_init HWLOC_NAME(internal_memattrs_init) +#define hwloc_internal_memattrs_prepare HWLOC_NAME(internal_memattrs_prepare) +#define hwloc_internal_memattrs_dup HWLOC_NAME(internal_memattrs_dup) +#define hwloc_internal_memattrs_destroy HWLOC_NAME(internal_memattrs_destroy) +#define hwloc_internal_memattrs_need_refresh HWLOC_NAME(internal_memattrs_need_refresh) +#define hwloc_internal_memattrs_refresh HWLOC_NAME(internal_memattrs_refresh) + +#define hwloc_internal_cpukind_s HWLOC_NAME(internal_cpukind_s) +#define hwloc_internal_cpukinds_init HWLOC_NAME(internal_cpukinds_init) +#define hwloc_internal_cpukinds_destroy HWLOC_NAME(internal_cpukinds_destroy) +#define hwloc_internal_cpukinds_dup HWLOC_NAME(internal_cpukinds_dup) +#define hwloc_internal_cpukinds_register HWLOC_NAME(internal_cpukinds_register) +#define hwloc_internal_cpukinds_rank HWLOC_NAME(internal_cpukinds_rank) +#define hwloc_internal_cpukinds_restrict HWLOC_NAME(internal_cpukinds_restrict) + +#define hwloc_encode_to_base64 HWLOC_NAME(encode_to_base64) +#define hwloc_decode_from_base64 HWLOC_NAME(decode_from_base64) + +#define hwloc_progname HWLOC_NAME(progname) + +#define hwloc__topology_disadopt HWLOC_NAME(_topology_disadopt) +#define hwloc__topology_dup HWLOC_NAME(_topology_dup) + +#define hwloc_tma HWLOC_NAME(tma) +#define hwloc_tma_malloc HWLOC_NAME(tma_malloc) +#define hwloc_tma_calloc HWLOC_NAME(tma_calloc) +#define hwloc_tma_strdup HWLOC_NAME(tma_strdup) +#define hwloc_bitmap_tma_dup HWLOC_NAME(bitmap_tma_dup) + +/* private/solaris-chiptype.h */ + +#define hwloc_solaris_chip_info_s HWLOC_NAME(solaris_chip_info_s) +#define hwloc_solaris_get_chip_info HWLOC_NAME(solaris_get_chip_info) + +#endif /* HWLOC_SYM_TRANSFORM */ + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + + +#endif /* HWLOC_RENAME_H */ diff --git a/deps/hwloc/lib/libhwloc.a b/deps/hwloc/lib/libhwloc.a new file mode 100644 index 000000000..540a7d065 Binary files /dev/null and b/deps/hwloc/lib/libhwloc.a differ diff --git a/deps/mpi/bin/hydra_bstrap_proxy b/deps/mpi/bin/hydra_bstrap_proxy new file mode 100755 index 000000000..218ff80c3 Binary files /dev/null and b/deps/mpi/bin/hydra_bstrap_proxy differ diff --git a/mpi/bin/hydra_nameserver b/deps/mpi/bin/hydra_nameserver similarity index 66% rename from mpi/bin/hydra_nameserver rename to deps/mpi/bin/hydra_nameserver index 6bff3150f..028fa08df 100755 Binary files a/mpi/bin/hydra_nameserver and b/deps/mpi/bin/hydra_nameserver differ diff --git a/deps/mpi/bin/hydra_pmi_proxy b/deps/mpi/bin/hydra_pmi_proxy new file mode 100755 index 000000000..14efe5656 Binary files /dev/null and b/deps/mpi/bin/hydra_pmi_proxy differ diff --git a/mpi/bin/mpicc b/deps/mpi/bin/mpicc similarity index 100% rename from mpi/bin/mpicc rename to deps/mpi/bin/mpicc diff --git a/mpi/bin/mpicxx b/deps/mpi/bin/mpicxx similarity index 100% rename from mpi/bin/mpicxx rename to deps/mpi/bin/mpicxx diff --git a/deps/mpi/bin/mpiexec b/deps/mpi/bin/mpiexec new file mode 100755 index 000000000..8826a76d3 Binary files /dev/null and b/deps/mpi/bin/mpiexec differ diff --git a/deps/mpi/bin/mpiexec.hydra b/deps/mpi/bin/mpiexec.hydra new file mode 100755 index 000000000..8826a76d3 Binary 
files /dev/null and b/deps/mpi/bin/mpiexec.hydra differ diff --git a/mpi/bin/mpigcc b/deps/mpi/bin/mpigcc similarity index 99% rename from mpi/bin/mpigcc rename to deps/mpi/bin/mpigcc index 4e0233819..9a306a10a 100755 --- a/mpi/bin/mpigcc +++ b/deps/mpi/bin/mpigcc @@ -104,7 +104,7 @@ CFLAGS="" CPPFLAGS="" LDFLAGS=" -Wl,-z,now -Wl,-z,relro -Wl,-z,noexecstack -Xlinker --enable-new-dtags -ldl " LIBS="-lm -lpthread -lfabric -lrt " -MPIVERSION="2021.2" +MPIVERSION="2021.3" MPILIBNAME="mpi" diff --git a/mpi/bin/mpigxx b/deps/mpi/bin/mpigxx similarity index 99% rename from mpi/bin/mpigxx rename to deps/mpi/bin/mpigxx index 3841ece4f..ca11d0e20 100755 --- a/mpi/bin/mpigxx +++ b/deps/mpi/bin/mpigxx @@ -101,7 +101,7 @@ MPICH_VERSION="3.3" CXXFLAGS="" LDFLAGS=" -Wl,-z,now -Wl,-z,relro -Wl,-z,noexecstack -Xlinker --enable-new-dtags -ldl " LIBS="-lm -lpthread -lfabric -lrt " -MPIVERSION="2021.2" +MPIVERSION="2021.3" MPILIBNAME="mpi" MPICXXLIBNAME="mpicxx" diff --git a/mpi/bin/mpiicc b/deps/mpi/bin/mpiicc similarity index 99% rename from mpi/bin/mpiicc rename to deps/mpi/bin/mpiicc index 3922dad5b..c623722ee 100755 --- a/mpi/bin/mpiicc +++ b/deps/mpi/bin/mpiicc @@ -122,7 +122,7 @@ MPILIBNAME="mpi" PMPILIBNAME="pmpi" # MPIVERSION is the version of the MPICH2 library that mpicc is intended for -MPIVERSION="2021.2" +MPIVERSION="2021.3" # # Internal variables # Show is set to echo to cause the compilation command to be echoed instead diff --git a/mpi/bin/mpiicpc b/deps/mpi/bin/mpiicpc similarity index 99% rename from mpi/bin/mpiicpc rename to deps/mpi/bin/mpiicpc index 62667fb05..13695ab64 100755 --- a/mpi/bin/mpiicpc +++ b/deps/mpi/bin/mpiicpc @@ -121,7 +121,7 @@ PMPILIBNAME="pmpi" MPICXXLIBNAME="mpicxx" # MPIVERSION is the version of the Intel(R) MPI Library that mpiicpc is intended for -MPIVERSION="2021.2" +MPIVERSION="2021.3" # # Internal variables # Show is set to echo to cause the compilation command to be echoed instead diff --git a/mpi/bin/mpirun b/deps/mpi/bin/mpirun similarity index 100% rename from mpi/bin/mpirun rename to deps/mpi/bin/mpirun diff --git a/mpi/etc/tuning_clx-ap_ofi.dat b/deps/mpi/etc/tuning_clx-ap_ofi.dat similarity index 100% rename from mpi/etc/tuning_clx-ap_ofi.dat rename to deps/mpi/etc/tuning_clx-ap_ofi.dat diff --git a/mpi/etc/tuning_clx-ap_shm-ofi.dat b/deps/mpi/etc/tuning_clx-ap_shm-ofi.dat similarity index 56% rename from mpi/etc/tuning_clx-ap_shm-ofi.dat rename to deps/mpi/etc/tuning_clx-ap_shm-ofi.dat index a3154cb4a..a6988c57a 100755 Binary files a/mpi/etc/tuning_clx-ap_shm-ofi.dat and b/deps/mpi/etc/tuning_clx-ap_shm-ofi.dat differ diff --git a/deps/mpi/etc/tuning_clx-ap_shm.dat b/deps/mpi/etc/tuning_clx-ap_shm.dat new file mode 100755 index 000000000..95cac35d0 Binary files /dev/null and b/deps/mpi/etc/tuning_clx-ap_shm.dat differ diff --git a/mpi/etc/tuning_generic_ofi.dat b/deps/mpi/etc/tuning_generic_ofi.dat similarity index 100% rename from mpi/etc/tuning_generic_ofi.dat rename to deps/mpi/etc/tuning_generic_ofi.dat diff --git a/mpi/etc/tuning_generic_shm-ofi.dat b/deps/mpi/etc/tuning_generic_shm-ofi.dat similarity index 100% rename from mpi/etc/tuning_generic_shm-ofi.dat rename to deps/mpi/etc/tuning_generic_shm-ofi.dat diff --git a/mpi/etc/tuning_generic_shm.dat b/deps/mpi/etc/tuning_generic_shm.dat similarity index 100% rename from mpi/etc/tuning_generic_shm.dat rename to deps/mpi/etc/tuning_generic_shm.dat diff --git a/mpi/etc/tuning_knl_ofi.dat b/deps/mpi/etc/tuning_knl_ofi.dat similarity index 100% rename from mpi/etc/tuning_knl_ofi.dat rename to 
deps/mpi/etc/tuning_knl_ofi.dat diff --git a/mpi/etc/tuning_knl_shm-ofi.dat b/deps/mpi/etc/tuning_knl_shm-ofi.dat similarity index 100% rename from mpi/etc/tuning_knl_shm-ofi.dat rename to deps/mpi/etc/tuning_knl_shm-ofi.dat diff --git a/mpi/etc/tuning_knl_shm.dat b/deps/mpi/etc/tuning_knl_shm.dat similarity index 100% rename from mpi/etc/tuning_knl_shm.dat rename to deps/mpi/etc/tuning_knl_shm.dat diff --git a/mpi/etc/tuning_skx_ofi.dat b/deps/mpi/etc/tuning_skx_ofi.dat similarity index 100% rename from mpi/etc/tuning_skx_ofi.dat rename to deps/mpi/etc/tuning_skx_ofi.dat diff --git a/mpi/etc/tuning_skx_shm-ofi.dat b/deps/mpi/etc/tuning_skx_shm-ofi.dat similarity index 79% rename from mpi/etc/tuning_skx_shm-ofi.dat rename to deps/mpi/etc/tuning_skx_shm-ofi.dat index 74bfe7ba1..f9e770897 100755 Binary files a/mpi/etc/tuning_skx_shm-ofi.dat and b/deps/mpi/etc/tuning_skx_shm-ofi.dat differ diff --git a/mpi/etc/tuning_skx_shm.dat b/deps/mpi/etc/tuning_skx_shm.dat similarity index 79% rename from mpi/etc/tuning_skx_shm.dat rename to deps/mpi/etc/tuning_skx_shm.dat index 74bfe7ba1..f9e770897 100755 Binary files a/mpi/etc/tuning_skx_shm.dat and b/deps/mpi/etc/tuning_skx_shm.dat differ diff --git a/mpi/include/mpi.h b/deps/mpi/include/mpi.h old mode 100755 new mode 100644 similarity index 99% rename from mpi/include/mpi.h rename to deps/mpi/include/mpi.h index 39095b742..658e5a3a5 --- a/mpi/include/mpi.h +++ b/deps/mpi/include/mpi.h @@ -1,5 +1,5 @@ /* - Copyright 2003-2021 Intel Corporation. + Copyright Intel Corporation. This software and the related documents are Intel copyrighted materials, and your use of them is governed by the express license under which they were @@ -580,8 +580,8 @@ typedef int (MPI_Delete_function) ( MPI_Comm, int, void *, void * ); * digits for REV, 1 digit for EXT and 2 digits for EXT_NUMBER. So, * 2019.0.0b0 will have the numeric version 20190000100. */ -#define I_MPI_VERSION "2021.2.0" -#define I_MPI_NUMVERSION 20210200300 +#define I_MPI_VERSION "2021.3.0" +#define I_MPI_NUMVERSION 20210300300 /* for the datatype decoders */ enum MPIR_Combiner_enum { diff --git a/mpi/include/mpicxx.h b/deps/mpi/include/mpicxx.h old mode 100755 new mode 100644 similarity index 99% rename from mpi/include/mpicxx.h rename to deps/mpi/include/mpicxx.h index 3d27a661b..07c4ebce3 --- a/mpi/include/mpicxx.h +++ b/deps/mpi/include/mpicxx.h @@ -1,5 +1,5 @@ /* - Copyright 2003-2021 Intel Corporation. + Copyright Intel Corporation. This software and the related documents are Intel copyrighted materials, and your use of them is governed by the express license under which they were diff --git a/mpi/include/mpio.h b/deps/mpi/include/mpio.h old mode 100755 new mode 100644 similarity index 99% rename from mpi/include/mpio.h rename to deps/mpi/include/mpio.h index 74ce84e70..2e35d8913 --- a/mpi/include/mpio.h +++ b/deps/mpi/include/mpio.h @@ -1,5 +1,5 @@ /* - Copyright 2003-2021 Intel Corporation. + Copyright Intel Corporation. 
This software and the related documents are Intel copyrighted materials, and your use of them is governed by the express license under which they were diff --git a/deps/mpi/lib/libmpi.so b/deps/mpi/lib/libmpi.so new file mode 100755 index 000000000..d7243ada7 Binary files /dev/null and b/deps/mpi/lib/libmpi.so differ diff --git a/deps/mpi/lib/libmpi.so.12 b/deps/mpi/lib/libmpi.so.12 new file mode 100755 index 000000000..d7243ada7 Binary files /dev/null and b/deps/mpi/lib/libmpi.so.12 differ diff --git a/deps/mpi/lib/libmpi.so.12.0 b/deps/mpi/lib/libmpi.so.12.0 new file mode 100755 index 000000000..d7243ada7 Binary files /dev/null and b/deps/mpi/lib/libmpi.so.12.0 differ diff --git a/mpi/lib/libmpi.so.12.0.0 b/deps/mpi/lib/libmpi.so.12.0.0 similarity index 62% rename from mpi/lib/libmpi.so.12.0.0 rename to deps/mpi/lib/libmpi.so.12.0.0 index d391200c7..d7243ada7 100755 Binary files a/mpi/lib/libmpi.so.12.0.0 and b/deps/mpi/lib/libmpi.so.12.0.0 differ diff --git a/deps/mpi/lib/libmpicxx.so b/deps/mpi/lib/libmpicxx.so new file mode 100755 index 000000000..aeeba2cb7 Binary files /dev/null and b/deps/mpi/lib/libmpicxx.so differ diff --git a/deps/mpi/lib/libmpicxx.so.12 b/deps/mpi/lib/libmpicxx.so.12 new file mode 100755 index 000000000..aeeba2cb7 Binary files /dev/null and b/deps/mpi/lib/libmpicxx.so.12 differ diff --git a/deps/mpi/lib/libmpicxx.so.12.0 b/deps/mpi/lib/libmpicxx.so.12.0 new file mode 100755 index 000000000..aeeba2cb7 Binary files /dev/null and b/deps/mpi/lib/libmpicxx.so.12.0 differ diff --git a/mpi/lib/libmpicxx.so.12.0.0 b/deps/mpi/lib/libmpicxx.so.12.0.0 similarity index 99% rename from mpi/lib/libmpicxx.so.12.0.0 rename to deps/mpi/lib/libmpicxx.so.12.0.0 index ee69659ef..aeeba2cb7 100755 Binary files a/mpi/lib/libmpicxx.so.12.0.0 and b/deps/mpi/lib/libmpicxx.so.12.0.0 differ diff --git a/deps/mpi/lib/libmpifort.so b/deps/mpi/lib/libmpifort.so new file mode 100755 index 000000000..f67aaad45 Binary files /dev/null and b/deps/mpi/lib/libmpifort.so differ diff --git a/deps/mpi/lib/libmpifort.so.12 b/deps/mpi/lib/libmpifort.so.12 new file mode 100755 index 000000000..f67aaad45 Binary files /dev/null and b/deps/mpi/lib/libmpifort.so.12 differ diff --git a/deps/mpi/lib/libmpifort.so.12.0 b/deps/mpi/lib/libmpifort.so.12.0 new file mode 100755 index 000000000..f67aaad45 Binary files /dev/null and b/deps/mpi/lib/libmpifort.so.12.0 differ diff --git a/mpi/lib/libmpifort.so.12.0.0 b/deps/mpi/lib/libmpifort.so.12.0.0 similarity index 61% rename from mpi/lib/libmpifort.so.12.0.0 rename to deps/mpi/lib/libmpifort.so.12.0.0 index 6cc0e68cb..f67aaad45 100755 Binary files a/mpi/lib/libmpifort.so.12.0.0 and b/deps/mpi/lib/libmpifort.so.12.0.0 differ diff --git a/mpi/licensing/license.txt b/deps/mpi/licensing/license.txt old mode 100755 new mode 100644 similarity index 100% rename from mpi/licensing/license.txt rename to deps/mpi/licensing/license.txt diff --git a/mpi/licensing/third-party-programs.txt b/deps/mpi/licensing/third-party-programs.txt old mode 100755 new mode 100644 similarity index 99% rename from mpi/licensing/third-party-programs.txt rename to deps/mpi/licensing/third-party-programs.txt index bd2a5e95b..307780de4 --- a/mpi/licensing/third-party-programs.txt +++ b/deps/mpi/licensing/third-party-programs.txt @@ -1,4 +1,4 @@ -Intel(R) MPI Library 2021.2 Third Party Programs File +Intel(R) MPI Library 2021.3 Third Party Programs File This file is the "third-party-programs.txt" file specified in the associated Intel end user license agreement for the Intel software you are 
licensing. diff --git a/deps/ofi/bin/fi_info b/deps/ofi/bin/fi_info new file mode 100755 index 000000000..347648e8a Binary files /dev/null and b/deps/ofi/bin/fi_info differ diff --git a/ofi/include/rdma/fabric.h b/deps/ofi/include/rdma/fabric.h similarity index 95% rename from ofi/include/rdma/fabric.h rename to deps/ofi/include/rdma/fabric.h index 79e80e164..71628035e 100644 --- a/ofi/include/rdma/fabric.h +++ b/deps/ofi/include/rdma/fabric.h @@ -79,8 +79,8 @@ extern "C" { #endif #define FI_MAJOR_VERSION 1 -#define FI_MINOR_VERSION 11 -#define FI_REVISION_VERSION 0 +#define FI_MINOR_VERSION 12 +#define FI_REVISION_VERSION 1 enum { FI_PATH_MAX = 256, @@ -208,6 +208,7 @@ enum { FI_ADDR_PSMX2, /* uint64_t[2] */ FI_ADDR_IB_UD, /* uint64_t[4] */ FI_ADDR_EFA, + FI_ADDR_PSMX3, /* uint64_t[2] */ }; #define FI_ADDR_UNSPEC ((uint64_t) -1) @@ -319,7 +320,8 @@ enum { FI_PROTO_MRAIL, FI_PROTO_RSTREAM, FI_PROTO_RDMA_CM_IB_XRC, - FI_PROTO_EFA + FI_PROTO_EFA, + FI_PROTO_PSMX3 }; enum { @@ -598,6 +600,11 @@ struct fi_alias { uint64_t flags; }; +struct fi_fid_var { + int name; + void *val; +}; + struct fi_mr_raw_attr { uint64_t flags; uint64_t *base_addr; @@ -632,6 +639,8 @@ enum { FI_REFRESH, /* mr: fi_mr_modify */ FI_DUP, /* struct fid ** */ FI_GETWAITOBJ, /*enum fi_wait_obj * */ + FI_GET_VAL, /* struct fi_fid_var */ + FI_SET_VAL, /* struct fi_fid_var */ }; static inline int fi_control(struct fid *fid, int command, void *arg) @@ -647,6 +656,28 @@ static inline int fi_alias(struct fid *fid, struct fid **alias_fid, uint64_t fla return fi_control(fid, FI_ALIAS, &alias); } +/* fid value names */ +/* + * Currently no common name is defined. Provider specific names should + * have the FI_PROV_SPECIFIC bit set. + */ + +static inline int fi_get_val(struct fid *fid, int name, void *val) +{ + struct fi_fid_var var; + var.name = name; + var.val = val; + return fi_control(fid, FI_GET_VAL, &var); +} + +static inline int fi_set_val(struct fid *fid, int name, void *val) +{ + struct fi_fid_var var; + var.name = name; + var.val = val; + return fi_control(fid, FI_SET_VAL, &var); +} + static inline int fi_open_ops(struct fid *fid, const char *name, uint64_t flags, void **ops, void *context) @@ -692,6 +723,8 @@ enum fi_type { }; char *fi_tostr(const void *data, enum fi_type datatype); +char *fi_tostr_r(char *buf, size_t len, const void *data, + enum fi_type datatype); enum fi_param_type { FI_PARAM_STRING, diff --git a/ofi/include/rdma/fi_cm.h b/deps/ofi/include/rdma/fi_cm.h similarity index 100% rename from ofi/include/rdma/fi_cm.h rename to deps/ofi/include/rdma/fi_cm.h diff --git a/ofi/include/rdma/fi_domain.h b/deps/ofi/include/rdma/fi_domain.h similarity index 100% rename from ofi/include/rdma/fi_domain.h rename to deps/ofi/include/rdma/fi_domain.h diff --git a/ofi/include/rdma/fi_endpoint.h b/deps/ofi/include/rdma/fi_endpoint.h similarity index 100% rename from ofi/include/rdma/fi_endpoint.h rename to deps/ofi/include/rdma/fi_endpoint.h diff --git a/ofi/include/rdma/fi_eq.h b/deps/ofi/include/rdma/fi_eq.h similarity index 100% rename from ofi/include/rdma/fi_eq.h rename to deps/ofi/include/rdma/fi_eq.h diff --git a/ofi/include/rdma/fi_errno.h b/deps/ofi/include/rdma/fi_errno.h similarity index 100% rename from ofi/include/rdma/fi_errno.h rename to deps/ofi/include/rdma/fi_errno.h diff --git a/ofi/include/rdma/fi_rma.h b/deps/ofi/include/rdma/fi_rma.h similarity index 100% rename from ofi/include/rdma/fi_rma.h rename to deps/ofi/include/rdma/fi_rma.h diff --git a/ofi/include/rdma/fi_tagged.h 
b/deps/ofi/include/rdma/fi_tagged.h similarity index 100% rename from ofi/include/rdma/fi_tagged.h rename to deps/ofi/include/rdma/fi_tagged.h diff --git a/deps/ofi/lib/libfabric.so b/deps/ofi/lib/libfabric.so new file mode 100755 index 000000000..35c21dfc3 Binary files /dev/null and b/deps/ofi/lib/libfabric.so differ diff --git a/deps/ofi/lib/libfabric.so.1 b/deps/ofi/lib/libfabric.so.1 new file mode 100755 index 000000000..35c21dfc3 Binary files /dev/null and b/deps/ofi/lib/libfabric.so.1 differ diff --git a/deps/ofi/lib/prov/libpsm3-fi.so b/deps/ofi/lib/prov/libpsm3-fi.so new file mode 100755 index 000000000..47830166d Binary files /dev/null and b/deps/ofi/lib/prov/libpsm3-fi.so differ diff --git a/deps/ofi/lib/prov/libpsmx2-fi.so b/deps/ofi/lib/prov/libpsmx2-fi.so new file mode 100755 index 000000000..375463c58 Binary files /dev/null and b/deps/ofi/lib/prov/libpsmx2-fi.so differ diff --git a/deps/ofi/lib/prov/librxm-fi.so b/deps/ofi/lib/prov/librxm-fi.so new file mode 100755 index 000000000..83af28e2e Binary files /dev/null and b/deps/ofi/lib/prov/librxm-fi.so differ diff --git a/deps/ofi/lib/prov/libshm-fi.so b/deps/ofi/lib/prov/libshm-fi.so new file mode 100755 index 000000000..dfce33131 Binary files /dev/null and b/deps/ofi/lib/prov/libshm-fi.so differ diff --git a/deps/ofi/lib/prov/libsockets-fi.so b/deps/ofi/lib/prov/libsockets-fi.so new file mode 100755 index 000000000..b164233e5 Binary files /dev/null and b/deps/ofi/lib/prov/libsockets-fi.so differ diff --git a/deps/ofi/lib/prov/libtcp-fi.so b/deps/ofi/lib/prov/libtcp-fi.so new file mode 100755 index 000000000..10f430bc1 Binary files /dev/null and b/deps/ofi/lib/prov/libtcp-fi.so differ diff --git a/deps/ofi/lib/prov/libverbs-fi.so b/deps/ofi/lib/prov/libverbs-fi.so new file mode 100755 index 000000000..2a895fbd5 Binary files /dev/null and b/deps/ofi/lib/prov/libverbs-fi.so differ diff --git a/doc/requirements.txt b/doc/requirements.txt index e82b1e325..5c84e63d8 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -41,7 +41,8 @@ requests==2.22.0 six==1.12.0 snowballstemmer==1.9.0 soupsieve==1.9.2 -Sphinx==2.1.2 +Sphinx==3.5.4 +sphinx-book-theme==0.1.0 sphinx-rtd-theme==0.4.3 sphinxcontrib-applehelp==1.0.1 sphinxcontrib-devhelp==1.0.1 diff --git a/doc/rst/Readme.txt b/doc/rst/Readme.txt old mode 100755 new mode 100644 diff --git a/doc/rst/source/_static/favicons.png b/doc/rst/source/_static/favicons.png new file mode 100644 index 000000000..f450376b1 Binary files /dev/null and b/doc/rst/source/_static/favicons.png differ diff --git a/doc/rst/source/_static/oneAPI-rgb-rev-100.png b/doc/rst/source/_static/oneAPI-rgb-rev-100.png new file mode 100644 index 000000000..58d2d5c54 Binary files /dev/null and b/doc/rst/source/_static/oneAPI-rgb-rev-100.png differ diff --git a/doc/rst/source/_static/style.css b/doc/rst/source/_static/style.css old mode 100755 new mode 100644 diff --git a/doc/rst/source/_templates/layout.html b/doc/rst/source/_templates/layout.html old mode 100755 new mode 100644 diff --git a/doc/rst/source/api/operations/collective-operations/alltoallv.rst b/doc/rst/source/api/operations/collective-operations/alltoallv.rst index 9161a6cd9..0aae683d5 100644 --- a/doc/rst/source/api/operations/collective-operations/alltoallv.rst +++ b/doc/rst/source/api/operations/collective-operations/alltoallv.rst @@ -1,7 +1,7 @@ Alltoallv ********* -.. doxygengroup:: alltoall +.. 
doxygengroup:: alltoallv :project: oneccl :content-only: - :no-link: \ No newline at end of file + :no-link: diff --git a/doc/rst/source/conf.py b/doc/rst/source/conf.py index dae20b4ad..4c0fbd89f 100755 --- a/doc/rst/source/conf.py +++ b/doc/rst/source/conf.py @@ -17,12 +17,12 @@ # -- Project information ----------------------------------------------------- -project = 'oneCCL Documentation' -copyright = '2019–2020' +project = 'oneCCL' +copyright = '2019–2021' author = 'Intel' # The full version, including alpha/beta/rc tags -release = '2021' +# release = '2021' rst_prolog = """ .. |product_full| replace:: Intel\ |reg|\ oneAPI Collective Communications Library @@ -105,19 +105,15 @@ # Tell sphinx what the pygments highlight language should be. highlight_language = 'cpp' -import sphinx_rtd_theme -html_theme = 'sphinx_rtd_theme' -html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] -if on_rtd: - using_rtd_theme = True +html_theme = 'sphinx_book_theme' +html_logo = '_static/oneAPI-rgb-rev-100.png' +html_favicon = '_static/favicons.png' # Theme options html_theme_options = { - # 'typekit_id': 'hiw1hhg', - # 'analytics_id': '', - # 'sticky_navigation': True # Set to False to disable the sticky nav while scrolling. - 'logo_only': True, # if we have a html_logo below, this shows /only/ the logo with no title text - 'collapse_navigation': False, # Collapse navigation (False makes it tree-like) - # 'display_version': True, # Display the docs version - # 'navigation_depth': 4, # Depth of the headers shown in the navigation bar + 'repository_url': 'https://github.com/oneapi-src/oneCCL', + 'path_to_docs': 'doc/source', + 'use_issues_button': True, + 'use_edit_page_button': True, + 'repository_branch': 'master' } diff --git a/doc/rst/source/env-variables.rst b/doc/rst/source/env-variables.rst index 8f83d9f44..988554c9f 100644 --- a/doc/rst/source/env-variables.rst +++ b/doc/rst/source/env-variables.rst @@ -556,3 +556,60 @@ CCL_MAX_SHORT_SIZE **Description** Set this environment variable to specify the threshold of the number of bytes for a collective operation to be split. + + +CCL_MNIC +######## +**Syntax** + +:: + + CCL_MNIC=<value> + +**Arguments** + +.. list-table:: + :widths: 25 50 + :header-rows: 1 + :align: left + + * - <value> + - Description + * - ``global`` + - Select all NICs available on the node. + * - ``local`` + - Select all NICs local for the NUMA node that corresponds to process pinning. + * - ``none`` + - Disable special NIC selection, use a single default NIC (**default**). + +**Description** + +Set this environment variable to control multi-NIC selection policy. +|product_short| workers will be pinned on selected NICs in a round-robin way. + + +CCL_MNIC_COUNT +############## +**Syntax** + +:: + + CCL_MNIC_COUNT=<value> + +**Arguments** + +.. list-table:: + :widths: 25 50 + :header-rows: 1 + :align: left + + * - <value> + - Description + * - ``N`` + - The maximum number of NICs that should be selected for |product_short| workers. + If not specified then equal to the number of |product_short| workers. + +**Description** + +Set this environment variable to specify the maximum number of NICs to be selected. +The actual number of NICs selected may be smaller due to limitations on transport level or system configuration. 
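A minimal usage sketch for the two multi-NIC variables documented in the hunk above. The launch line is hypothetical: the benchmark binary, rank count, and NIC count are placeholders, and CCL_WORKER_COUNT (the existing variable that sets the number of oneCCL worker threads) is assumed to be the knob whose workers get pinned across the selected NICs:

::

    # Select NICs local to each process' NUMA node, cap the selection
    # at two NICs, and pin two workers across them round-robin.
    CCL_MNIC=local CCL_MNIC_COUNT=2 CCL_WORKER_COUNT=2 \
        mpiexec -n 2 ./benchmark --coll allreduce

If the transport or system exposes fewer local NICs than requested, fewer are used, as noted in the CCL_MNIC_COUNT description.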
diff --git a/doc/rst/source/index.rst b/doc/rst/source/index.rst old mode 100755 new mode 100644 diff --git a/doc/rst/source/legal.rst b/doc/rst/source/legal.rst old mode 100755 new mode 100644 diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt old mode 100755 new mode 100644 index f5922c224..499d34283 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -17,6 +17,9 @@ cmake_minimum_required (VERSION 2.8) if (DEFINED ENV{CCL_CONFIGURATION}) set(CCL_CONFIGURATION "$ENV{CCL_CONFIGURATION}") + if(${CCL_CONFIGURATION} STREQUAL "cpu_gpu_dpcpp") + set(COMPUTE_BACKEND_TARGET_NAME "sycl") + endif() endif() if (DEFINED ENV{CCL_ROOT}) @@ -25,6 +28,11 @@ else() message(FATAL_ERROR "Please define CCL_ROOT environment variable") endif() +if (DEFINED ENV{I_MPI_ROOT}) + set(I_MPI_ROOT "$ENV{I_MPI_ROOT}") + set(CMAKE_INSTALL_RPATH "${I_MPI_ROOT}/lib/release_mt/") +endif() + message(STATUS "CCL_ROOT: ${CCL_ROOT}") message(STATUS "CCL_CONFIGURATION: ${CCL_CONFIGURATION}") @@ -58,11 +66,13 @@ set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} ${CXX_COMP set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) -if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang") +if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" OR(${CMAKE_CXX_COMPILER_ID} STREQUAL "IntelLLVM")) set(CMAKE_CLANG_FLAGS "-fsycl") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -lsycl") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_CLANG_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CLANG_FLAGS}") + # Use c++17 to be aligned with the compiler + set(CMAKE_CXX_STANDARD 17) endif() set(GCC_BF16_MIN_SUPPORTED "4.9.0") @@ -85,10 +95,6 @@ if (CCL_BF16_COMPILER) endif() endif() - -include_directories(${CCL_ROOT}/include/${CCL_CONFIGURATION}) -link_directories(${CCL_ROOT}/lib/${CCL_CONFIGURATION}) - include_directories(include) add_subdirectory(cpu) diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt index 24ce78b4d..41879eb9e 100644 --- a/examples/benchmark/CMakeLists.txt +++ b/examples/benchmark/CMakeLists.txt @@ -23,16 +23,24 @@ endif() include_directories(include) include_directories(src) +list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") +find_package(NUMA) + foreach(src ${sources}) get_filename_component(executable ${src} NAME_WE) add_executable(${executable} ${src}) + if (NUMA_FOUND) + target_include_directories(${executable} PRIVATE ${NUMA_INCLUDE_DIR}) + target_link_libraries(${executable} PRIVATE numa) + target_compile_definitions(${executable} PRIVATE CCL_ENABLE_NUMA) + endif() target_include_directories(${executable} PRIVATE ${EXAMPLES_INC_DIRS}) target_link_libraries(${executable} PRIVATE ccl) target_link_libraries(${executable} PUBLIC pthread) target_link_libraries(${executable} PUBLIC rt) target_link_libraries(${executable} PUBLIC m) target_link_libraries(${executable} PUBLIC dl) - target_link_libraries(${executable} PRIVATE m) + target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release_mt/) target_link_libraries(${executable} PUBLIC mpi) install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/benchmark OPTIONAL) endforeach() diff --git a/examples/benchmark/include/benchmark.hpp b/examples/benchmark/include/benchmark.hpp index 2ab6646ce..d7c624d5c 100644 --- a/examples/benchmark/include/benchmark.hpp +++ b/examples/benchmark/include/benchmark.hpp @@ -19,18 +19,19 @@ #include <chrono> #include <cstring> #include <getopt.h> 
+#include <fstream> #include <functional> #include <iostream> #include <iterator> +#include <iomanip> #include <numeric> #include <map> -#include <math.h> +#include <cmath> #include <numeric> #include <stdexcept> -#include <stdio.h> +#include <cstdio> #include <sys/time.h> #include <vector> -#include <fstream> #ifdef CCL_ENABLE_SYCL #include <CL/sycl.hpp> @@ -44,6 +45,7 @@ using namespace cl::sycl::access; #include "coll.hpp" #include "sparse_allreduce/sparse_detail.hpp" +/* free letters: f g v z */ void print_help_usage(const char* app) { PRINT("\nUSAGE:\n" "\t%s [OPTIONS]\n\n" @@ -52,30 +54,36 @@ void print_help_usage(const char* app) { "\t[-e,--loop <execution loop>]: %s\n" "\t[-i,--iters <iteration count>]: %d\n" "\t[-w,--warmup_iters <warm up iteration count>]: %d\n" + "\t[-j,--iter_policy <iteration policy>]: %s\n" "\t[-n,--buf_count <number of parallel operations within single collective>]: %d\n" "\t[-f,--min_elem_count <minimum number of elements for single collective>]: %d\n" "\t[-t,--max_elem_count <maximum number of elements for single collective>]: %d\n" "\t[-y,--elem_counts <list of element counts for single collective>]: [%d-%d]\n" "\t[-c,--check <check result correctness>]: %d\n" "\t[-p,--cache <use persistent operations>]: %d\n" + "\t[-q,--inplace <use same buffer as send and recv buffer>]: %d\n" + "\t[-k,--ranks_per_proc <number of ranks per process>]: %d\n" +#ifdef CCL_ENABLE_NUMA + "\t[-s,--numa_node <numa node for allocation of send and recv buffers>]: %s\n" +#endif /* CCL_ENABLE_NUMA */ #ifdef CCL_ENABLE_SYCL "\t[-a,--sycl_dev_type <sycl device type>]: %s\n" "\t[-m,--sycl_mem_type <sycl memory type>]: %s\n" "\t[-u,--sycl_usm_type <sycl usm type>]: %s\n" -#endif - "\t[-k,--ranks_per_proc <number of ranks per process>]: %d\n" +#endif /* CCL_ENABLE_SYCL */ "\t[-l,--coll <collectives list/all>]: %s\n" "\t[-d,--dtype <datatypes list/all>]: %s\n" "\t[-r,--reduction <reductions list/all>]: %s\n" "\t[-o,--csv_filepath <file to store CSV-formatted data into>]: %s\n" + "\t[-x,--ext <show additional information>]\n" "\t[-h,--help]\n\n" - "example:\n\t--coll allgatherv,allreduce --backend host --loop regular\n" - "example:\n\t--coll bcast,reduce --backend sycl --loop unordered \n", + "example:\n\t--coll allgatherv,allreduce --backend host --elem_counts 64,1024\n", app, backend_names[DEFAULT_BACKEND].c_str(), loop_names[DEFAULT_LOOP].c_str(), DEFAULT_ITERS, DEFAULT_WARMUP_ITERS, + iter_policy_names[DEFAULT_ITER_POLICY].c_str(), DEFAULT_BUF_COUNT, DEFAULT_MIN_ELEM_COUNT, DEFAULT_MAX_ELEM_COUNT, @@ -83,12 +91,16 @@ void print_help_usage(const char* app) { DEFAULT_MAX_ELEM_COUNT, DEFAULT_CHECK_VALUES, DEFAULT_CACHE_OPS, + DEFAULT_INPLACE, + DEFAULT_RANKS_PER_PROC, +#ifdef CCL_ENABLE_NUMA + DEFAULT_NUMA_NODE_STR, +#endif /* CCL_ENABLE_NUMA */ #ifdef CCL_ENABLE_SYCL sycl_dev_names[DEFAULT_SYCL_DEV_TYPE].c_str(), sycl_mem_names[DEFAULT_SYCL_MEM_TYPE].c_str(), sycl_usm_names[DEFAULT_SYCL_USM_TYPE].c_str(), -#endif - DEFAULT_RANKS_PER_PROC, +#endif /* CCL_ENABLE_SYCL */ DEFAULT_COLL_LIST, DEFAULT_DTYPES_LIST, DEFAULT_REDUCTIONS_LIST, @@ -166,6 +178,20 @@ int set_loop(const std::string& option_value, loop_type_t& loop) { return 0; } +int set_iter_policy(const std::string& option_value, iter_policy_t& policy) { + std::string option_name = "iter_policy"; + std::set<std::string> supported_option_values{ iter_policy_names[ITER_POLICY_OFF], + iter_policy_names[ITER_POLICY_AUTO] }; + + if (check_supported_options(option_name, option_value, supported_option_values)) + return -1; + + policy = + 
(option_value == iter_policy_names[ITER_POLICY_OFF]) ? ITER_POLICY_OFF : ITER_POLICY_AUTO; + + return 0; +} + #ifdef CCL_ENABLE_SYCL int set_sycl_dev_type(const std::string& option_value, sycl_dev_type_t& dev) { std::string option_name = "sycl_dev_type"; @@ -241,11 +267,8 @@ int set_datatypes(std::string option_value, int check_values, std::list<std::str if ((dt == dtype_names[ccl::datatype::float16] || dt == dtype_names[ccl::datatype::bfloat16]) && check_values) { - PRINT( - "correctness checking is not implemented for '%s', try to disable checking with '-c 0' option", - dt.c_str()); + PRINT("WARN: correctness checking is not implemented for '%s'", dt.c_str()); } - return -1; } } } @@ -277,23 +300,27 @@ int set_reductions(std::string option_value, int check_values, std::list<std::st for (auto r : reductions) { if (check_supported_options(option_name, r, supported_option_values)) { if ((r != reduction_names[ccl::reduction::sum]) && check_values) { - PRINT( - "correctness checking is not implemented for '%s', try to disable checking with '-c 0' option", - r.c_str()); + PRINT("WARN: correctness checking is not implemented for '%s'", r.c_str()); } - return -1; } } } return 0; } -size_t get_iter_count(size_t bytes, size_t max_iter_count) { +size_t get_iter_count(size_t bytes, size_t max_iter_count, iter_policy_t policy) { size_t n, res = max_iter_count; - n = bytes >> 18; - while (n) { - res >>= 1; - n >>= 1; + + switch (policy) { + case ITER_POLICY_OFF: break; + case ITER_POLICY_AUTO: + n = bytes >> 18; + while (n) { + res >>= 1; + n >>= 1; + } + break; + default: ASSERT(0, "unknown iter_policy %d", policy); break; } if (!res && max_iter_count) @@ -302,92 +329,136 @@ size_t get_iter_count(size_t bytes, size_t max_iter_count) { return res; } +void store_to_csv(const user_options_t& options, + size_t nranks, + size_t elem_count, + size_t iter_count, + ccl::datatype dtype, + ccl::reduction op, + double min_time, + double max_time, + double avg_time, + double stddev, + double wait_avg_time) { + std::ofstream csvf; + csvf.open(options.csv_filepath, std::ofstream::out | std::ofstream::app); + + if (csvf.is_open()) { + const size_t buf_count = options.buf_count; + + for (const auto& cop : options.coll_names) { + auto get_op_name = [&]() { + if (cop == "allreduce" || cop == "reduce_scatter" || cop == "reduce") { + return reduction_names.at(op); + } + return std::string{}; + }; + + csvf << nranks << "," << cop << "," << get_op_name() << "," << dtype_names.at(dtype) + << "," << ccl::get_datatype_size(dtype) << "," << elem_count << "," << buf_count + << "," << iter_count << "," << min_time << "," << max_time << "," << avg_time + << "," << stddev << "," << wait_avg_time << std::endl; + } + csvf.close(); + } +} + /* timer array contains one number per collective, one collective corresponds to ranks_per_proc */ -void print_timings(ccl::communicator& comm, - const std::vector<double>& local_timers, +void print_timings(const ccl::communicator& comm, + const std::vector<double>& local_total_timers, + const std::vector<double>& local_wait_timers, const user_options_t& options, - const size_t elem_count, - const size_t iter_count, + size_t elem_count, + size_t iter_count, ccl::datatype dtype, ccl::reduction op) { const size_t buf_count = options.buf_count; const size_t ncolls = options.coll_names.size(); - std::vector<double> all_timers(ncolls * comm.size()); - std::vector<size_t> recv_counts(comm.size()); + const size_t nranks = comm.size(); + + // get timers from other ranks + std::vector<double> 
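// With ITER_POLICY_AUTO, get_iter_count() above halves the iteration budget
// once per doubling of the message size beyond 256 KiB (the bytes >> 18 shift)
// and floors the result at one iteration. A stand-alone copy of that arithmetic
// with a few checked examples (auto_iter_count is a local name for the sketch):
#include <cassert>
#include <cstddef>

size_t auto_iter_count(size_t bytes, size_t max_iter_count) {
    size_t res = max_iter_count;
    for (size_t n = bytes >> 18; n; n >>= 1) // one halving per power of two above 256 KiB
        res >>= 1;
    return (!res && max_iter_count) ? 1 : res;
}

int main() {
    assert(auto_iter_count(1024, 16) == 16); // small message: full budget
    assert(auto_iter_count(1 << 18, 16) == 8); // 256 KiB: halved once
    assert(auto_iter_count(1 << 20, 16) == 2); // 1 MiB: halved three times
    assert(auto_iter_count(1 << 24, 16) == 1); // very large: floored at one iteration
    return 0;
}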
all_ranks_total_timers(ncolls * nranks); + std::vector<double> all_ranks_wait_timers(ncolls * nranks); + std::vector<size_t> recv_counts(nranks, ncolls); - int idx; - for (idx = 0; idx < comm.size(); idx++) - recv_counts[idx] = ncolls; + std::vector<ccl::event> events; + events.push_back(ccl::allgatherv( + local_total_timers.data(), ncolls, all_ranks_total_timers.data(), recv_counts, comm)); + events.push_back(ccl::allgatherv( + local_wait_timers.data(), ncolls, all_ranks_wait_timers.data(), recv_counts, comm)); - ccl::allgatherv(local_timers.data(), ncolls, all_timers.data(), recv_counts, comm).wait(); + for (ccl::event& ev : events) { + ev.wait(); + } if (comm.rank() == 0) { - std::vector<double> timers(comm.size(), 0); - for (int r = 0; r < comm.size(); ++r) { - for (size_t c = 0; c < ncolls; ++c) { - timers[r] += all_timers[r * ncolls + c]; + std::vector<double> total_timers(nranks, 0); + std::vector<double> wait_timers(nranks, 0); + std::vector<double> min_timers(ncolls, 0); + std::vector<double> max_timers(ncolls, 0); + + // parse timers from all ranks + for (size_t rank_idx = 0; rank_idx < nranks; ++rank_idx) { + for (size_t coll_idx = 0; coll_idx < ncolls; ++coll_idx) { + double total_time = all_ranks_total_timers.at(rank_idx * ncolls + coll_idx); + double wait_time = all_ranks_wait_timers.at(rank_idx * ncolls + coll_idx); + total_timers.at(rank_idx) += total_time; + wait_timers.at(rank_idx) += wait_time; + + double& min = min_timers.at(coll_idx); + min = (min != 0) ? std::min(min, total_time) : total_time; + + double& max = max_timers.at(coll_idx); + max = std::max(max, total_time); } } - double avg_timer(0); - double avg_timer_per_buf(0); - for (idx = 0; idx < comm.size(); idx++) { - avg_timer += timers[idx]; - } - avg_timer /= (iter_count * comm.size()); - avg_timer_per_buf = avg_timer / buf_count; + double total_avg_time = std::accumulate(total_timers.begin(), total_timers.end(), 0.0); + total_avg_time /= iter_count * nranks; - double stddev_timer = 0; - double sum = 0; - for (idx = 0; idx < comm.size(); idx++) { - double val = timers[idx] / iter_count; - sum += (val - avg_timer) * (val - avg_timer); - } + double wait_avg_time = std::accumulate(wait_timers.begin(), wait_timers.end(), 0.0); + wait_avg_time /= iter_count * nranks; - stddev_timer = sqrt(sum / comm.size()) / avg_timer * 100; - if (buf_count == 1) { - printf("%10zu %12.1lf %11.1lf\n", - elem_count * ccl::get_datatype_size(dtype) * buf_count, - avg_timer, - stddev_timer); - } - else { - printf("%10zu %13.1lf %18.1lf %11.1lf\n", - elem_count * ccl::get_datatype_size(dtype) * buf_count, - avg_timer, - avg_timer_per_buf, - stddev_timer); + double sum = 0; + for (const double& timer : total_timers) { + double latency = (double)timer / iter_count; + sum += (latency - total_avg_time) * (latency - total_avg_time); } + double stddev = std::sqrt((double)sum / nranks) / total_avg_time * 100; - // in case csv export is requested - // we write one line per collop, dtype and reduction - // hence average is per collop, not the aggregate over all - if (!options.csv_filepath.empty()) { - std::ofstream csvf; - csvf.open(options.csv_filepath, std::ios::app); + double min_time = std::accumulate(min_timers.begin(), min_timers.end(), 0.0); + min_time /= iter_count; - if (csvf.is_open()) { - std::vector<double> avg_timer(ncolls, 0); + double max_time = std::accumulate(max_timers.begin(), max_timers.end(), 0.0); + max_time /= iter_count; - for (int r = 0; r < comm.size(); ++r) { - for (size_t c = 0; c < ncolls; ++c) { - avg_timer[c] +=
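// The statistics above reduce per-rank totals into average, min/max and a
// relative stddev in percent. A tiny self-check of the same formulas; note the
// 0.0 initial value for std::accumulate (an int literal would make the sum
// accumulate in int and truncate the doubles, which is the fix applied above):
#include <cassert>
#include <cmath>
#include <numeric>
#include <vector>

int main() {
    std::vector<double> totals{ 10.0, 14.0 }; // per-rank totals over all iterations, usec
    size_t iter_count = 2, nranks = totals.size();
    double avg = std::accumulate(totals.begin(), totals.end(), 0.0) / (iter_count * nranks);
    double sum = 0;
    for (double t : totals) {
        double latency = t / iter_count;
        sum += (latency - avg) * (latency - avg);
    }
    double stddev_pct = std::sqrt(sum / nranks) / avg * 100;
    assert(avg == 6.0); // (10 + 14) / (2 iters * 2 ranks)
    assert(stddev_pct > 16.6 && stddev_pct < 16.7); // sqrt(1) / 6 * 100
    return 0;
}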
all_timers[r * ncolls + c]; - } - } + size_t bytes = elem_count * ccl::get_datatype_size(dtype) * buf_count; + std::stringstream ss; + ss << std::right << std::fixed << std::setw(COL_WIDTH) << bytes << std::setw(COL_WIDTH) + << iter_count << std::setw(COL_WIDTH) << std::setprecision(COL_PRECISION) << min_time + << std::setw(COL_WIDTH) << std::setprecision(COL_PRECISION) << max_time + << std::setw(COL_WIDTH) << std::setprecision(COL_PRECISION) << total_avg_time + << std::setw(COL_WIDTH - 3) << std::setprecision(COL_PRECISION) << stddev + << std::setw(COL_WIDTH + 3); - for (size_t c = 0; c < ncolls; ++c) { - avg_timer[c] /= (iter_count * comm.size()); - } + if (options.show_additional_info) { + ss << std::right << std::fixed << std::setprecision(COL_PRECISION) << wait_avg_time; + } + ss << std::endl; + printf("%s", ss.str().c_str()); - int i = 0; - for (auto cop = options.coll_names.begin(); cop != options.coll_names.end(); - ++cop, ++i) { - csvf << comm.size() << "," << (*cop) << "," << reduction_names[op] << "," - << dtype_names[dtype] << "," << ccl::get_datatype_size(dtype) << "," - << elem_count << "," << buf_count << "," << avg_timer[i] << std::endl; - } - csvf.close(); - } + if (!options.csv_filepath.empty()) { + store_to_csv(options, + nranks, + elem_count, + iter_count, + dtype, + op, + min_time, + max_time, + total_avg_time, + stddev, + wait_avg_time); } } @@ -454,39 +525,53 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) { bool should_parse_datatypes = false; bool should_parse_reductions = false; + char short_options[1024] = { 0 }; + + const char* base_options = "b:e:i:w:j:n:f:t:c:p:q:o:k:s:l:d:r:y:xh"; + memcpy(short_options, base_options, strlen(base_options)); + +#ifdef CCL_ENABLE_NUMA + const char* numa_options = "s:"; + memcpy(short_options + strlen(short_options), numa_options, strlen(numa_options)); +#endif /* CCL_ENABLE_NUMA */ + #ifdef CCL_ENABLE_SYCL - const char* const short_options = "b:e:i:w:n:f:t:c:p:o:a:m:u:k:l:d:r:y:h"; -#else - const char* const short_options = "b:e:i:w:n:f:t:c:p:o:k:l:d:r:y:h"; -#endif + const char* sycl_options = "a:m:u:"; + memcpy(short_options + strlen(short_options), sycl_options, strlen(sycl_options)); +#endif /* CCL_ENABLE_SYCL */ struct option getopt_options[] = { - { "backend", required_argument, 0, 'b' }, - { "loop", required_argument, 0, 'e' }, - { "iters", required_argument, 0, 'i' }, - { "warmup_iters", required_argument, 0, 'w' }, - { "buf_count", required_argument, 0, 'n' }, - { "min_elem_count", required_argument, 0, 'f' }, - { "max_elem_count", required_argument, 0, 't' }, - { "elem_counts", required_argument, 0, 'y' }, - { "check", required_argument, 0, 'c' }, - { "cache", required_argument, 0, 'p' }, - /*{ "v2i_ratio", required_argument, 0, 'v' },*/ + { "backend", required_argument, nullptr, 'b' }, + { "loop", required_argument, nullptr, 'e' }, + { "iters", required_argument, nullptr, 'i' }, + { "warmup_iters", required_argument, nullptr, 'w' }, + { "iter_policy", required_argument, nullptr, 'j' }, + { "buf_count", required_argument, nullptr, 'n' }, + { "min_elem_count", required_argument, nullptr, 'f' }, + { "max_elem_count", required_argument, nullptr, 't' }, + { "elem_counts", required_argument, nullptr, 'y' }, + { "check", required_argument, nullptr, 'c' }, + { "cache", required_argument, nullptr, 'p' }, + { "inplace", required_argument, nullptr, 'q' }, + { "ranks_per_proc", required_argument, nullptr, 'k' }, +#ifdef CCL_ENABLE_NUMA + { "numa_node", required_argument, nullptr, 's' }, +#endif /* 
CCL_ENABLE_NUMA */ #ifdef CCL_ENABLE_SYCL - { "sycl_dev_type", required_argument, 0, 'a' }, - { "sycl_mem_type", required_argument, 0, 'm' }, - { "sycl_usm_type", required_argument, 0, 'u' }, -#endif - { "ranks", required_argument, 0, 'k' }, - { "coll", required_argument, 0, 'l' }, - { "dtype", required_argument, 0, 'd' }, - { "reduction", required_argument, 0, 'r' }, - { "csv_filepath", required_argument, 0, 'o' }, - { "help", no_argument, 0, 'h' }, - { 0, 0, 0, 0 } // required at end of array. + { "sycl_dev_type", required_argument, nullptr, 'a' }, + { "sycl_mem_type", required_argument, nullptr, 'm' }, + { "sycl_usm_type", required_argument, nullptr, 'u' }, +#endif /* CCL_ENABLE_SYCL */ + { "coll", required_argument, nullptr, 'l' }, + { "dtype", required_argument, nullptr, 'd' }, + { "reduction", required_argument, nullptr, 'r' }, + { "csv_filepath", required_argument, nullptr, 'o' }, + { "ext", no_argument, nullptr, 'x' }, + { "help", no_argument, nullptr, 'h' }, + { nullptr, 0, nullptr, 0 } // required at end of array. }; - while ((ch = getopt_long(argc, argv, short_options, getopt_options, NULL)) != -1) { + while ((ch = getopt_long(argc, argv, short_options, getopt_options, nullptr)) != -1) { switch (ch) { case 'b': if (set_backend(optarg, options.backend)) { @@ -514,6 +599,12 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) { else errors++; break; + case 'j': + if (set_iter_policy(optarg, options.iter_policy)) { + PRINT("failed to parse 'iter_policy' option"); + errors++; + } + break; case 'n': if (is_valid_integer_option(optarg)) { options.buf_count = atoll(optarg); @@ -549,10 +640,22 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) { errors++; break; case 'c': options.check_values = atoi(optarg); break; - case 'p': - options.cache_ops = atoi(optarg); + case 'p': options.cache_ops = atoi(optarg); break; + case 'q': options.inplace = atoi(optarg); break; + case 'k': + if (is_valid_integer_option(optarg)) { + options.ranks_per_proc = atoll(optarg); + } + else + errors++; + break; + case 's': + if (is_valid_integer_option(optarg)) { + options.numa_node = atoll(optarg); + } + else + errors++; break; - /*case 'v': options.v2i_ratio = atoll(optarg); break;*/ #ifdef CCL_ENABLE_SYCL case 'a': if (set_sycl_dev_type(optarg, options.sycl_dev_type)) { @@ -572,14 +675,7 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) { errors++; } break; -#endif - case 'k': - if (is_valid_integer_option(optarg)) { - options.ranks_per_proc = atoll(optarg); - } - else - errors++; - break; +#endif /* CCL_ENABLE_SYCL */ case 'l': if (strcmp("all", optarg) == 0) { options.coll_names = tokenize<std::string>(ALL_COLLS_LIST, ','); @@ -598,6 +694,7 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) { should_parse_reductions = true; break; case 'o': options.csv_filepath = std::string(optarg); break; + case 'x': options.show_additional_info = true; break; case 'h': return -1; default: PRINT("failed to parse unknown option"); @@ -623,6 +720,21 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) { errors++; } + if (options.inplace) { + for (auto name : options.coll_names) { + if (name != "allreduce") { + PRINT("inplace is not supported for %s yet", name.c_str()); + errors++; + break; + } + } + } + + if (options.coll_names.empty()) { + PRINT("empty coll list"); + errors++; + } + if (errors > 0) { PRINT("found %d errors while parsing user options", errors); for (int idx = 0; idx < argc; 
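// parse_user_options() now assembles the short-option string at runtime so the
// NUMA and SYCL letters exist only when those features are compiled in. A
// reduced, runnable model of the same getopt_long() skeleton; the option names
// and the SKETCH_ENABLE_EXTRA guard are invented for the sketch, not from the
// patch.
#include <cstdio>
#include <cstring>
#include <getopt.h>

int main(int argc, char** argv) {
    char short_opts[64] = { 0 };
    std::strcat(short_opts, "i:"); // base options, always available
#ifdef SKETCH_ENABLE_EXTRA // stands in for CCL_ENABLE_NUMA / CCL_ENABLE_SYCL
    std::strcat(short_opts, "s:");
#endif
    const struct option long_opts[] = { { "iters", required_argument, nullptr, 'i' },
                                        { nullptr, 0, nullptr, 0 } };
    int ch;
    while ((ch = getopt_long(argc, argv, short_opts, long_opts, nullptr)) != -1) {
        if (ch == 'i')
            std::printf("iters = %s\n", optarg);
    }
    return 0;
}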
idx++) { @@ -673,6 +785,7 @@ void print_user_options(const user_options_t& options, const ccl::communicator& std::string backend_str = find_str_val(backend_names, options.backend); std::string loop_str = find_str_val(loop_names, options.loop); + std::string iter_policy_str = find_str_val(iter_policy_names, options.iter_policy); #ifdef CCL_ENABLE_SYCL std::string sycl_dev_type_str = find_str_val(sycl_dev_names, options.sycl_dev_type); @@ -687,19 +800,23 @@ void print_user_options(const user_options_t& options, const ccl::communicator& "\n loop: %s" "\n iters: %zu" "\n warmup_iters: %zu" + "\n iter_policy: %s" "\n buf_count: %zu" "\n min_elem_count: %zu" "\n max_elem_count: %zu" "\n elem_counts: %s" "\n check: %d" "\n cache: %d" - /*"\n v2i_ratio: %zu"*/ + "\n inplace: %d" + "\n ranks_per_proc: %zu" +#ifdef CCL_ENABLE_NUMA + "\n numa_node: %s" +#endif /* CCL_ENABLE_NUMA */ #ifdef CCL_ENABLE_SYCL "\n sycl_dev_type: %s" "\n sycl_mem_type: %s" "\n sycl_usm_type: %s" -#endif - "\n ranks_per_proc: %zu" +#endif /* CCL_ENABLE_SYCL */ "\n collectives: %s" "\n datatypes: %s" "\n reductions: %s" @@ -709,19 +826,25 @@ void print_user_options(const user_options_t& options, const ccl::communicator& loop_str.c_str(), options.iters, options.warmup_iters, + iter_policy_str.c_str(), options.buf_count, options.min_elem_count, options.max_elem_count, elem_counts_str.c_str(), options.check_values, options.cache_ops, - /*options.v2i_ratio,*/ + options.inplace, + options.ranks_per_proc, +#ifdef CCL_ENABLE_NUMA + (options.numa_node == DEFAULT_NUMA_NODE) + ? DEFAULT_NUMA_NODE_STR + : std::to_string(options.numa_node).c_str(), +#endif /* CCL_ENABLE_NUMA */ #ifdef CCL_ENABLE_SYCL sycl_dev_type_str.c_str(), sycl_mem_type_str.c_str(), sycl_usm_type_str.c_str(), -#endif - options.ranks_per_proc, +#endif /* CCL_ENABLE_SYCL */ collectives_str.c_str(), datatypes_str.c_str(), reductions_str.c_str(), diff --git a/examples/benchmark/include/coll.hpp b/examples/benchmark/include/coll.hpp index 14da31617..9a8c5d4c8 100644 --- a/examples/benchmark/include/coll.hpp +++ b/examples/benchmark/include/coll.hpp @@ -88,12 +88,13 @@ typedef struct bench_exec_attr { typedef struct bench_init_attr { size_t buf_count; size_t max_elem_count; + int inplace; size_t ranks_per_proc; + int numa_node; #ifdef CCL_ENABLE_SYCL sycl_mem_type_t sycl_mem_type; sycl_usm_type_t sycl_usm_type; #endif - size_t v2i_ratio; } bench_init_attr; /* base polymorph collective wrapper class */ @@ -127,6 +128,10 @@ struct base_coll { } virtual void finalize(size_t elem_count) { + auto dtype = get_dtype(); + if (dtype == ccl::datatype::float16 || dtype == ccl::datatype::bfloat16) + return; + auto& transport = transport_data::instance(); auto& comms = transport.get_comms(); auto streams = transport.get_bench_streams(); @@ -181,6 +186,14 @@ struct base_coll { return init_attr.ranks_per_proc; } + int get_inplace() const noexcept { + return init_attr.inplace; + } + + int get_numa_node() const noexcept { + return init_attr.numa_node; + } + // first dim - per buf_count, second dim - per local rank std::vector<std::vector<void*>> send_bufs; std::vector<std::vector<void*>> recv_bufs; diff --git a/examples/benchmark/include/config.hpp b/examples/benchmark/include/config.hpp index 5c350b7fc..78794fa8a 100644 --- a/examples/benchmark/include/config.hpp +++ b/examples/benchmark/include/config.hpp @@ -15,8 +15,9 @@ */ #pragma once -#define ALIGNMENT (4096) -#define DTYPE float +#define REG_MSG_ALIGNMENT (4096) +#define LARGE_MSG_ALIGNMENT (2 * 1024 * 1024) +#define 
LARGE_MSG_THRESHOLD (1 * 1024 * 1024) #define ALL_COLLS_LIST "allgatherv,allreduce,alltoall,alltoallv,bcast,reduce,reduce_scatter" @@ -26,6 +27,9 @@ #define ALL_REDUCTIONS_LIST "sum,prod,min,max" #define ALL_REDUCTIONS_LIST_WITH_CHECK "sum" +#define COL_WIDTH (14) +#define COL_PRECISION (2) + #ifdef CCL_ENABLE_SYCL #define DEFAULT_BACKEND BACKEND_SYCL #else /* CCL_ENABLE_SYCL */ @@ -34,16 +38,19 @@ #define DEFAULT_LOOP LOOP_REGULAR #define DEFAULT_ITERS (16) #define DEFAULT_WARMUP_ITERS (16) -#define DEFAULT_BUF_COUNT (16) +#define DEFAULT_ITER_POLICY ITER_POLICY_AUTO +#define DEFAULT_BUF_COUNT (1) #define DEFAULT_MIN_ELEM_COUNT (1) #define DEFAULT_MAX_ELEM_COUNT (128) -#define DEFAULT_CHECK_VALUES (1) +#define DEFAULT_CHECK_VALUES (0) #define DEFAULT_CACHE_OPS (1) -#define DEFAULT_V2I_RATIO (128) +#define DEFAULT_INPLACE (0) +#define DEFAULT_RANKS_PER_PROC (1) +#define DEFAULT_NUMA_NODE (-1) +#define DEFAULT_NUMA_NODE_STR "<default>" #define DEFAULT_SYCL_DEV_TYPE SYCL_DEV_GPU #define DEFAULT_SYCL_MEM_TYPE SYCL_MEM_USM #define DEFAULT_SYCL_USM_TYPE SYCL_USM_DEVICE -#define DEFAULT_RANKS_PER_PROC (1) #define DEFAULT_COLL_LIST "allreduce" #define DEFAULT_DTYPES_LIST "float32" diff --git a/examples/benchmark/include/cpu_coll.hpp b/examples/benchmark/include/cpu_coll.hpp index 361898101..176ee958e 100644 --- a/examples/benchmark/include/cpu_coll.hpp +++ b/examples/benchmark/include/cpu_coll.hpp @@ -15,6 +15,10 @@ */ #pragma once +#ifdef CCL_ENABLE_NUMA +#include <numa.h> +#endif /* CCL_ENABLE_NUMA */ + #include "coll.hpp" /* cpu-specific base implementation */ @@ -33,14 +37,15 @@ struct cpu_base_coll : base_coll, protected strategy { for (size_t rank_idx = 0; rank_idx < base_coll::get_ranks_per_proc(); rank_idx++) { for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) { - result = posix_memalign( - (void**)&(send_bufs[idx][rank_idx]), - ALIGNMENT, - base_coll::get_max_elem_count() * sizeof(Dtype) * send_multiplier); - result = posix_memalign( - (void**)&(recv_bufs[idx][rank_idx]), - ALIGNMENT, - base_coll::get_max_elem_count() * sizeof(Dtype) * recv_multiplier); + send_bufs[idx][rank_idx] = + alloc_buffer(base_coll::get_max_elem_count() * sizeof(Dtype) * send_multiplier); + if (base_coll::get_inplace()) { + recv_bufs[idx][rank_idx] = send_bufs[idx][rank_idx]; + } + else { + recv_bufs[idx][rank_idx] = alloc_buffer(base_coll::get_max_elem_count() * + sizeof(Dtype) * recv_multiplier); + } } } @@ -50,10 +55,16 @@ struct cpu_base_coll : base_coll, protected strategy { cpu_base_coll(bench_init_attr init_attr) : cpu_base_coll(init_attr, 1, 1) {} virtual ~cpu_base_coll() { + size_t send_multiplier = coll_strategy::get_send_multiplier(); + size_t recv_multiplier = coll_strategy::get_recv_multiplier(); for (size_t rank_idx = 0; rank_idx < base_coll::get_ranks_per_proc(); rank_idx++) { for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) { - free(send_bufs[idx][rank_idx]); - free(recv_bufs[idx][rank_idx]); + free_buffer(send_bufs[idx][rank_idx], + base_coll::get_max_elem_count() * sizeof(Dtype) * send_multiplier); + if (!base_coll::get_inplace()) { + free_buffer(recv_bufs[idx][rank_idx], + base_coll::get_max_elem_count() * sizeof(Dtype) * recv_multiplier); + } } } } @@ -98,8 +109,60 @@ struct cpu_base_coll : base_coll, protected strategy { for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) { memcpy(send_bufs[b_idx][rank_idx], fill_vector.data(), send_bytes); + if (!base_coll::get_inplace()) { + memset(recv_bufs[b_idx][rank_idx], 0, recv_bytes); + } + } + } - 
memset(recv_bufs[b_idx][rank_idx], 0, recv_bytes); + void* alloc_buffer(size_t bytes) { + void* ptr = nullptr; +#ifdef CCL_ENABLE_NUMA + int numa_node = base_coll::get_numa_node(); + if (numa_node != DEFAULT_NUMA_NODE) { + ASSERT(numa_available() >= 0, "libnuma is not available"); + ASSERT(numa_node <= numa_max_node(), + "requested NUMA node %d is larger than max NUMA node %d", + numa_node, + numa_max_node()); + + long long free_bytes = 0; + numa_node_size64(numa_node, &free_bytes); + ASSERT(bytes <= (size_t)free_bytes, + "not enough free memory on NUMA node %d, requested %zu, free %lld", + numa_node, + bytes, + free_bytes); + + ptr = numa_alloc_onnode(bytes, numa_node); + ASSERT( + ptr, "failed to allocate buffer with size %zu on NUMA node %d", bytes, numa_node); + } + else +#endif /* CCL_ENABLE_NUMA */ + { + size_t alignment = REG_MSG_ALIGNMENT; + if (bytes >= LARGE_MSG_THRESHOLD) + alignment = LARGE_MSG_ALIGNMENT; + + int result = posix_memalign(&ptr, alignment, bytes); + ASSERT((result == 0) && ptr, "failed to allocate buffer with size %zu", bytes); + } + + return ptr; + } + + void free_buffer(void* ptr, size_t bytes) { +#ifdef CCL_ENABLE_NUMA + int numa_node = base_coll::get_numa_node(); + if (numa_node != DEFAULT_NUMA_NODE) { + ASSERT(numa_available() >= 0, "libnuma is not available"); + numa_free(ptr, bytes); + } + else +#endif /* CCL_ENABLE_NUMA */ + { + free(ptr); } } diff --git a/examples/benchmark/include/sycl_coll.hpp b/examples/benchmark/include/sycl_coll.hpp index 8cb330edb..064a333eb 100644 --- a/examples/benchmark/include/sycl_coll.hpp +++ b/examples/benchmark/include/sycl_coll.hpp @@ -63,8 +63,14 @@ struct sycl_base_coll : base_coll, private strategy { for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) { send_bufs[idx][rank_idx] = allocator.allocate( base_coll::get_max_elem_count() * send_multiplier, usm_alloc_type); - recv_bufs[idx][rank_idx] = allocator.allocate( - base_coll::get_max_elem_count() * recv_multiplier, usm_alloc_type); + + if (base_coll::get_inplace()) { + recv_bufs[idx][rank_idx] = send_bufs[idx][rank_idx]; + } + else { + recv_bufs[idx][rank_idx] = allocator.allocate( + base_coll::get_max_elem_count() * recv_multiplier, usm_alloc_type); + } } } else { @@ -88,7 +94,9 @@ struct sycl_base_coll : base_coll, private strategy { if (base_coll::get_sycl_mem_type() == SYCL_MEM_BUF) { for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) { delete static_cast<sycl_buffer_t<Dtype>*>(send_bufs[idx][rank_idx]); - delete static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[idx][rank_idx]); + if (!base_coll::get_inplace()) { + delete static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[idx][rank_idx]); + } } } } @@ -159,7 +167,9 @@ struct sycl_base_coll : base_coll, private strategy { .memcpy(send_bufs[b_idx][rank_idx], host_send_buf.data(), send_bytes) .wait(); - stream.get_native().memset(recv_bufs[b_idx][rank_idx], 0, recv_bytes).wait(); + if (!base_coll::get_inplace()) { + stream.get_native().memset(recv_bufs[b_idx][rank_idx], 0, recv_bytes).wait(); + } } else { stream.get_native() diff --git a/examples/benchmark/include/types.hpp b/examples/benchmark/include/types.hpp index b8723e3c1..9466d84af 100644 --- a/examples/benchmark/include/types.hpp +++ b/examples/benchmark/include/types.hpp @@ -24,16 +24,7 @@ if (comm.rank() == 0) { \ printf(fmt "\n", ##__VA_ARGS__); \ } -#endif //PRINT_BY_ROOT - -#define ASSERT(cond, fmt, ...)
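// alloc_buffer()/free_buffer() above pair numa_alloc_onnode() with numa_free()
// and otherwise fall back to posix_memalign() with a 4 KiB or 2 MiB alignment
// chosen by message size. A minimal stand-alone version of the NUMA path
// (link with -lnuma; error handling trimmed to the essentials):
#include <numa.h>
#include <cstddef>
#include <cstdio>

int main() {
    if (numa_available() < 0) {
        std::fprintf(stderr, "libnuma is not available\n");
        return 1;
    }
    const size_t bytes = 1 << 20;
    const int node = 0; // any value up to numa_max_node() is valid
    void* ptr = numa_alloc_onnode(bytes, node);
    if (!ptr)
        return 1;
    // ... fill and use the buffer ...
    numa_free(ptr, bytes); // unlike free(), numa_free() needs the size back
    return 0;
}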
\ - do { \ - if (!(cond)) { \ - printf("FAILED\n"); \ - fprintf(stderr, "ASSERT '%s' FAILED " fmt "\n", #cond, ##__VA_ARGS__); \ - throw std::runtime_error("ASSERT FAILED"); \ - } \ - } while (0) +#endif /* PRINT_BY_ROOT */ constexpr std::initializer_list<ccl::datatype> all_dtypes = { ccl::datatype::int8, ccl::datatype::int32, ccl::datatype::int64, ccl::datatype::uint64, @@ -42,6 +33,7 @@ constexpr std::initializer_list<ccl::datatype> all_dtypes = { typedef enum { BACKEND_HOST, BACKEND_SYCL } backend_type_t; typedef enum { LOOP_REGULAR, LOOP_UNORDERED } loop_type_t; +typedef enum { ITER_POLICY_OFF, ITER_POLICY_AUTO } iter_policy_t; typedef enum { SYCL_DEV_HOST, SYCL_DEV_CPU, SYCL_DEV_GPU } sycl_dev_type_t; typedef enum { SYCL_MEM_USM, SYCL_MEM_BUF } sycl_mem_type_t; @@ -53,6 +45,10 @@ std::map<backend_type_t, std::string> backend_names = { std::make_pair(BACKEND_H std::map<loop_type_t, std::string> loop_names = { std::make_pair(LOOP_REGULAR, "regular"), std::make_pair(LOOP_UNORDERED, "unordered") }; +std::map<iter_policy_t, std::string> iter_policy_names = { std::make_pair(ITER_POLICY_OFF, "off"), + std::make_pair(ITER_POLICY_AUTO, + "auto") }; + #ifdef CCL_ENABLE_SYCL std::map<sycl_dev_type_t, std::string> sycl_dev_names = { std::make_pair(SYCL_DEV_HOST, "host"), std::make_pair(SYCL_DEV_CPU, "cpu"), @@ -114,19 +110,21 @@ typedef struct user_options_t { loop_type_t loop; size_t iters; size_t warmup_iters; + iter_policy_t iter_policy; size_t buf_count; size_t min_elem_count; size_t max_elem_count; std::list<size_t> elem_counts; int check_values; int cache_ops; - size_t v2i_ratio; + int inplace; + size_t ranks_per_proc; + int numa_node; #ifdef CCL_ENABLE_SYCL sycl_dev_type_t sycl_dev_type; sycl_mem_type_t sycl_mem_type; sycl_usm_type_t sycl_usm_type; #endif - size_t ranks_per_proc; std::list<std::string> coll_names; std::list<std::string> dtypes; std::list<std::string> reductions; @@ -135,25 +133,28 @@ typedef struct user_options_t { bool min_elem_count_set; bool max_elem_count_set; bool elem_counts_set; + bool show_additional_info; user_options_t() { backend = DEFAULT_BACKEND; loop = DEFAULT_LOOP; iters = DEFAULT_ITERS; warmup_iters = DEFAULT_WARMUP_ITERS; + iter_policy = DEFAULT_ITER_POLICY; buf_count = DEFAULT_BUF_COUNT; min_elem_count = DEFAULT_MIN_ELEM_COUNT; max_elem_count = DEFAULT_MAX_ELEM_COUNT; generate_counts(elem_counts, min_elem_count, max_elem_count); check_values = DEFAULT_CHECK_VALUES; cache_ops = DEFAULT_CACHE_OPS; - v2i_ratio = DEFAULT_V2I_RATIO; + inplace = DEFAULT_INPLACE; + ranks_per_proc = DEFAULT_RANKS_PER_PROC; + numa_node = DEFAULT_NUMA_NODE; #ifdef CCL_ENABLE_SYCL sycl_dev_type = DEFAULT_SYCL_DEV_TYPE; sycl_mem_type = DEFAULT_SYCL_MEM_TYPE; sycl_usm_type = DEFAULT_SYCL_USM_TYPE; #endif - ranks_per_proc = DEFAULT_RANKS_PER_PROC; coll_names = tokenize<std::string>(DEFAULT_COLL_LIST, ','); dtypes = tokenize<std::string>(DEFAULT_DTYPES_LIST, ','); reductions = tokenize<std::string>(DEFAULT_REDUCTIONS_LIST, ','); @@ -162,6 +163,7 @@ typedef struct user_options_t { min_elem_count_set = false; max_elem_count_set = false; elem_counts_set = false; + show_additional_info = false; } } user_options_t; diff --git a/examples/benchmark/src/allreduce/cpu_allreduce_coll.hpp b/examples/benchmark/src/allreduce/cpu_allreduce_coll.hpp index 4c89a1c9c..a0d289aef 100644 --- a/examples/benchmark/src/allreduce/cpu_allreduce_coll.hpp +++ b/examples/benchmark/src/allreduce/cpu_allreduce_coll.hpp @@ -37,7 +37,7 @@ struct cpu_allreduce_coll : cpu_base_coll<Dtype, allreduce_strategy_impl> { for 
(size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) { for (size_t e_idx = 0; e_idx < elem_count; e_idx++) { value = ((Dtype*)send_bufs[b_idx][rank_idx])[e_idx]; - if (value != sbuf_expected) { + if (!base_coll::get_inplace() && (value != sbuf_expected)) { std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx " << rank_idx << ", elem_idx " << e_idx << ", expected " << sbuf_expected << ", got " << value << std::endl; diff --git a/examples/benchmark/src/allreduce/sycl_allreduce_coll.hpp b/examples/benchmark/src/allreduce/sycl_allreduce_coll.hpp index 52b5aaf98..400b3e53c 100644 --- a/examples/benchmark/src/allreduce/sycl_allreduce_coll.hpp +++ b/examples/benchmark/src/allreduce/sycl_allreduce_coll.hpp @@ -69,7 +69,7 @@ struct sycl_allreduce_coll : sycl_base_coll<Dtype, allreduce_strategy_impl> { for (size_t e_idx = 0; e_idx < elem_count; e_idx++) { value = host_send_buf[e_idx]; - if (value != sbuf_expected) { + if (!base_coll::get_inplace() && (value != sbuf_expected)) { std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx " << rank_idx << ", elem_idx " << e_idx << ", expected " << sbuf_expected << ", got " << value << std::endl; diff --git a/examples/benchmark/src/benchmark.cpp b/examples/benchmark/src/benchmark.cpp index 611e65598..558c87092 100644 --- a/examples/benchmark/src/benchmark.cpp +++ b/examples/benchmark/src/benchmark.cpp @@ -78,79 +78,92 @@ void do_regular(ccl::communicator& service_comm, PRINT_BY_ROOT(service_comm, "#------------------------------------------------------------\n" "# Benchmarking: %s\n" - "# processes: %d\n" + "# #processes: %d\n" "#------------------------------------------------------------\n", scolls.str().c_str(), service_comm.size()); - if (options.buf_count == 1) { - PRINT_BY_ROOT(service_comm, "%10s %12s %11s", "#bytes", "avg[usec]", "stddev[%]"); - } - else { - PRINT_BY_ROOT(service_comm, - "%10s %13s %18s %11s", - "#bytes", - "avg[usec]", - "avg_per_buf[usec]", - "stddev[%]"); + if (service_comm.rank() == 0) { + std::stringstream ss; + ss << std::right << std::setw(COL_WIDTH) << "#bytes" << std::setw(COL_WIDTH) + << "#repetitions" << std::setw(COL_WIDTH) << "t_min[usec]" + << std::setw(COL_WIDTH) << "t_max[usec]" << std::setw(COL_WIDTH) << "t_avg[usec]" + << std::setw(COL_WIDTH - 3) << "stddev[%]"; + + if (options.show_additional_info) { + ss << std::right << std::setw(COL_WIDTH + 3) << "wait_t_avg[usec]"; + } + ss << std::endl; + printf("%s", ss.str().c_str()); } for (auto& count : options.elem_counts) { - size_t iter_count = - get_iter_count(count * ccl::get_datatype_size(dtype), options.iters); + size_t iter_count = get_iter_count( + count * ccl::get_datatype_size(dtype), options.iters, options.iter_policy); - size_t warmup_iter_count = - get_iter_count(count * ccl::get_datatype_size(dtype), options.warmup_iters); + size_t warmup_iter_count = get_iter_count(count * ccl::get_datatype_size(dtype), + options.warmup_iters, + options.iter_policy); try { // we store times for each collective separately, // but aggregate over buffers and iterations - std::vector<double> coll_timers(colls.size(), 0); + std::vector<double> total_timers(colls.size(), 0); + std::vector<double> wait_timers(colls.size(), 0); for (size_t coll_idx = 0; coll_idx < colls.size(); coll_idx++) { auto& coll = colls[coll_idx]; - - double t1 = 0, t2 = 0, t = 0; - - if (options.check_values) { - coll->prepare(count); - } + double coll_time = 0, wait_time = 0; ccl::barrier(service_comm); for (size_t iter_idx = 0; iter_idx < (iter_count 
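// The iteration loop entered here times each step in two phases: around the
// non-blocking start() calls and around the wait() on the returned requests,
// so the report can show how much of the latency is spent blocked. A schematic
// of the split using std::chrono; start_async/wait_all are hypothetical
// placeholders for the collective launch and completion steps.
#include <chrono>

template <class Launch, class Wait>
void timed_step(Launch start_async, Wait wait_all, double& coll_usec, double& wait_usec) {
    using clock = std::chrono::high_resolution_clock;
    auto t0 = clock::now();
    start_async(); // enqueue only; returns before completion
    auto t1 = clock::now();
    wait_all(); // block until every request finishes
    auto t2 = clock::now();
    coll_usec += std::chrono::duration<double, std::micro>(t1 - t0).count();
    wait_usec += std::chrono::duration<double, std::micro>(t2 - t1).count();
}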
+ warmup_iter_count); iter_idx++) { - t1 = when(); + if (options.check_values) { + coll->prepare(count); + ccl::barrier(service_comm); + } + double coll_start_time = when(); for (size_t buf_idx = 0; buf_idx < options.buf_count; buf_idx++) { match_id_stream << "coll_" << coll->name() << "_" << coll_idx - << "_count_" << count << "_buf_" << buf_idx; + << "_count_" << count << "_buf_" << buf_idx + << "_dt_" << dtype_name << "_rt_" << reduction; bench_attr.set<ccl::operation_attr_id::match_id>( ccl::string_class(match_id_stream.str())); match_id_stream.str(""); coll->start(count, buf_idx, bench_attr, reqs); } + double coll_end_time = when(); + double wait_start_time = when(); for (auto& req : reqs) { req.wait(); } + double wait_end_time = when(); reqs.clear(); - t2 = when(); - if (iter_idx >= warmup_iter_count) { - t += (t2 - t1); + coll_time += coll_end_time - coll_start_time; + wait_time += wait_end_time - wait_start_time; } - } - if (options.check_values) { - coll->finalize(count); + if (options.check_values) { + coll->finalize(count); + } } - coll_timers[coll_idx] += t; + total_timers[coll_idx] += coll_time + wait_time; + wait_timers[coll_idx] += wait_time; } - print_timings( - service_comm, coll_timers, options, count, iter_count, dtype, reduction_op); + print_timings(service_comm, + total_timers, + wait_timers, + options, + count, + iter_count, + dtype, + reduction_op); } catch (const std::exception& ex) { ASSERT(0, "error on count %zu, reason: %s", count, ex.what()); @@ -158,8 +171,11 @@ void do_regular(ccl::communicator& service_comm, } } } + + PRINT_BY_ROOT(service_comm, "\n# All done\n"); } +/* TODO: merge with do_regular */ void do_unordered(ccl::communicator& service_comm, bench_exec_attr& bench_attr, coll_list_t& all_colls, @@ -487,14 +503,17 @@ int main(int argc, char* argv[]) { ccl::communicator& service_comm = transport.get_service_comm(); + print_user_options(options, service_comm); + init_attr.buf_count = options.buf_count; init_attr.max_elem_count = options.max_elem_count; init_attr.ranks_per_proc = options.ranks_per_proc; + init_attr.inplace = options.inplace; + init_attr.numa_node = options.numa_node; #ifdef CCL_ENABLE_SYCL init_attr.sycl_mem_type = options.sycl_mem_type; init_attr.sycl_usm_type = options.sycl_usm_type; -#endif - init_attr.v2i_ratio = options.v2i_ratio; +#endif /* CCL_ENABLE_SYCL */ try { create_all_colls(init_attr, options, colls); @@ -510,14 +529,6 @@ int main(int argc, char* argv[]) { bench_exec_attr bench_attr{}; bench_attr.init_all(); - print_user_options(options, service_comm); - - if (options.coll_names.empty()) { - PRINT_BY_ROOT(service_comm, "empty coll list"); - print_help_usage(argv[0]); - return -1; - } - ccl::barrier(service_comm); switch (options.loop) { @@ -525,15 +536,26 @@ int main(int argc, char* argv[]) { // open and truncate CSV file if csv-output is requested if (service_comm.rank() == 0 && !options.csv_filepath.empty()) { std::ofstream csvf; - csvf.open(options.csv_filepath, std::ios::trunc); + csvf.open(options.csv_filepath, std::ofstream::out | std::ofstream::trunc); if (!csvf.is_open()) { std::cerr << "Cannot open CSV file for writing: " << options.csv_filepath << std::endl; - return -1; + abort(); } // write header (column names) - csvf << "#ranks,collective,reduction,type,typesize,#elements/buffer,#buffers,time" - << std::endl; + csvf << "#ranks," + << "collective," + << "reduction," + << "dtype," + << "dtype_size," + << "#elements/buffer," + << "#buffers," + << "#repetitions," + << "t_min[usec]," + << "t_max[usec]," + << 
"t_avg[usec]," + << "stddev[%]," + << "wait_t_avg[usec]" << std::endl; csvf.close(); } ccl::barrier(service_comm); @@ -549,6 +571,7 @@ int main(int argc, char* argv[]) { default: ASSERT(0, "unknown loop %d", options.loop); break; } + colls.clear(); transport.reset_comms(); return 0; diff --git a/examples/benchmark/src/declarations.hpp b/examples/benchmark/src/declarations.hpp old mode 100755 new mode 100644 diff --git a/examples/common/CMakeLists.txt b/examples/common/CMakeLists.txt index ee14b6fc3..c95d44bc6 100644 --- a/examples/common/CMakeLists.txt +++ b/examples/common/CMakeLists.txt @@ -25,6 +25,7 @@ foreach(src ${sources}) target_link_libraries(${executable} PUBLIC rt) target_link_libraries(${executable} PUBLIC m) target_link_libraries(${executable} PUBLIC dl) + target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release_mt/) target_link_libraries(${executable} PUBLIC mpi) install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/common OPTIONAL) endforeach() diff --git a/examples/cpu/CMakeLists.txt b/examples/cpu/CMakeLists.txt index 58099643c..403a409d4 100644 --- a/examples/cpu/CMakeLists.txt +++ b/examples/cpu/CMakeLists.txt @@ -25,7 +25,7 @@ foreach(src ${sources}) target_link_libraries(${executable} PUBLIC dl) target_link_libraries(${executable} PUBLIC pthread) target_link_libraries(${executable} PUBLIC stdc++) - target_link_libraries(${executable} PRIVATE m) + target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release_mt/) target_link_libraries(${executable} PUBLIC mpi) install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/cpu OPTIONAL) endforeach() diff --git a/examples/cpu/allgatherv.cpp b/examples/cpu/allgatherv.cpp index 4c1d0fa44..d34604e26 100644 --- a/examples/cpu/allgatherv.cpp +++ b/examples/cpu/allgatherv.cpp @@ -132,8 +132,12 @@ int main() { run_collective_vector( "warmup_allgatherv_vector", send_buf, recv_bufs, recv_counts, comm, attr); + ccl::string_class regular_match_id = std::to_string(msg_count); + ccl::string_class vector_match_id = regular_match_id + std::string("_vector"); + attr.set<ccl::operation_attr_id::match_id>(regular_match_id); attr.set<ccl::operation_attr_id::to_cache>(true); run_collective("persistent_allgatherv", send_buf, recv_buf, recv_counts, comm, attr); + attr.set<ccl::operation_attr_id::match_id>(vector_match_id); run_collective_vector( "persistent_allgatherv_vector", send_buf, recv_bufs, recv_counts, comm, attr); diff --git a/examples/cpu/custom_allreduce.cpp b/examples/cpu/custom_allreduce.cpp index 970730dc6..54a24ed4a 100644 --- a/examples/cpu/custom_allreduce.cpp +++ b/examples/cpu/custom_allreduce.cpp @@ -31,8 +31,8 @@ typedef void (*fill_fn_t)(void*, size_t, size_t); typedef int (*check_fn_t)(void*, size_t, expected_fn_t); #define RUN_COLLECTIVE(start_cmd, fill_fn, check_fn, expected_fn, name) \ - t = 0; \ do { \ + double t1 = 0, t2 = 0, t = 0; \ for (int iter_idx = 0; iter_idx < ITERS; iter_idx++) { \ global_match_id = match_id; \ fill_fn(send_buf, MSG_SIZE_COUNT, rank + 1); \ diff --git a/examples/external_launcher/run.sh b/examples/external_launcher/run.sh index aae636f69..797cb4ecd 100755 --- a/examples/external_launcher/run.sh +++ b/examples/external_launcher/run.sh @@ -16,7 +16,8 @@ # BASENAME=`basename $0 .sh` -TIMEOUT=600 + +cmd_timeout=600 echo_log() { @@ -142,6 +143,20 @@ parse_arguments() echo_log "-----------------------------------------------------------" } +run_cmd() +{ + host="$1" + cmd="$2" + timeout_prefix="$3" + + if [[ "${host}" == "localhost" ]] + then + 
eval ${timeout_prefix} $cmd& + else + ${timeout_prefix} ssh ${host} $cmd& + fi +} + cleanup_hosts() { hostlist=$1 @@ -151,7 +166,7 @@ cleanup_hosts() do echo "host ${host}" cmd="killall -9 external_launcher run_binary.sh" - ssh ${host} $cmd + run_cmd ${host} "${cmd}" done } @@ -180,7 +195,8 @@ run_binary() fi elif [ "$kvs_mode" == "ip_port" ] then - kvs_param=`ssh ${hostlist[0]} hostname -I | awk '{print $1}'` + cmd="hostname -I | sed -e 's/\s.*$//'" + kvs_param=`run_cmd ${hostlist[0]} "${cmd}"` fi host_idx=0 @@ -203,7 +219,8 @@ run_binary() cmd="${cmd} -mv ${I_MPI_ROOT}/env/vars.sh" fi - timeout -k $((TIMEOUT))s $((TIMEOUT))s ssh ${host} $cmd& + timeout_prefix="timeout -k $((cmd_timeout))s $((cmd_timeout))s" + run_cmd ${host} "${cmd}" "${timeout_prefix}" done host_idx=$((host_idx + 1)) done @@ -253,9 +270,9 @@ run() run_binary $mode exec_time="$((`date +%s`-$exec_time))" - if [ "$exec_time" -ge "$TIMEOUT" ]; + if [ "$exec_time" -ge "$cmd_timeout" ]; then - echo -e "${RED}FAILED: Timeout ($exec_time > $TIMEOUT)${NC}" + echo -e "${RED}FAILED: Timeout ($exec_time > $cmd_timeout)${NC}" exit 1 fi done diff --git a/examples/include/base.hpp b/examples/include/base.hpp index c507fcbc2..2421fb309 100644 --- a/examples/include/base.hpp +++ b/examples/include/base.hpp @@ -26,8 +26,10 @@ #include <mpi.h> #include <stdexcept> #include <stdio.h> +#include <sys/syscall.h> #include <sys/time.h> #include <vector> +#include <unistd.h> #ifdef CCL_ENABLE_SYCL #include <CL/sycl.hpp> @@ -35,6 +37,8 @@ using namespace cl::sycl; using namespace cl::sycl::access; #endif /* CCL_ENABLE_SYCL */ +#define GETTID() syscall(SYS_gettid) + #define ITERS (16) #define COLL_ROOT (0) #define MSG_SIZE_COUNT (6) @@ -51,7 +55,15 @@ using namespace cl::sycl::access; do { \ if (!(cond)) { \ printf("FAILED\n"); \ - fprintf(stderr, "ASSERT '%s' FAILED " fmt "\n", #cond, ##__VA_ARGS__); \ + fprintf(stderr, \ + "(%ld): %s:%s:%d: ASSERT '%s' FAILED: " fmt "\n", \ + GETTID(), \ + __FILE__, \ + __FUNCTION__, \ + __LINE__, \ + #cond, \ + ##__VA_ARGS__); \ + fflush(stderr); \ throw std::runtime_error("ASSERT FAILED"); \ } \ } while (0) @@ -93,27 +105,13 @@ using namespace cl::sycl::access; PRINT_BY_ROOT(comm, "PASSED"); \ } while (0) -double t1, t2, t; - -double when(void) { - struct timeval tv; - static struct timeval tv_base; - static int is_first = 1; - - if (gettimeofday(&tv, NULL)) { - perror("gettimeofday"); - return 0; - } - - if (is_first) { - tv_base = tv; - is_first = 0; - } - - return (double)(tv.tv_sec - tv_base.tv_sec) * 1.0e6 + (double)(tv.tv_usec - tv_base.tv_usec); +inline double when(void) { + auto time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration<double, std::micro>(time.time_since_epoch()); + return duration.count(); } -void mpi_finalize() { +inline void mpi_finalize() { int is_finalized = 0; MPI_Finalized(&is_finalized); diff --git a/examples/include/base_utils.hpp b/examples/include/base_utils.hpp index b62566f53..5dd68a1dc 100644 --- a/examples/include/base_utils.hpp +++ b/examples/include/base_utils.hpp @@ -112,7 +112,7 @@ void ccl_tuple_for_each_indexed(functor f, const FunctionArgs&... 
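// The gettimeofday()-based when() above is replaced by a std::chrono version
// with no static state, which is also why it can now safely live in a header
// as inline. The same function, extracted with a trivial usage:
#include <chrono>
#include <cstdio>

inline double when() {
    auto now = std::chrono::high_resolution_clock::now();
    return std::chrono::duration<double, std::micro>(now.time_since_epoch()).count();
}

int main() {
    double t1 = when();
    double t2 = when();
    std::printf("elapsed: %.3f usec\n", t2 - t1); // only differences are meaningful
    return 0;
}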
args) { namespace utils { template <typename T> -void str_to_array(const char* input, std::vector<T>& output, char delimiter) { +inline void str_to_array(const char* input, std::vector<T>& output, char delimiter) { if (!input) { return; } @@ -126,7 +126,7 @@ void str_to_array(const char* input, std::vector<T>& output, char delimiter) { } } template <> -void str_to_array(const char* input, std::vector<std::string>& output, char delimiter) { +inline void str_to_array(const char* input, std::vector<std::string>& output, char delimiter) { std::string processes_input(input); processes_input.erase(std::remove_if(processes_input.begin(), diff --git a/examples/include/bf16.hpp b/examples/include/bf16.hpp index 778b491b1..72ab78955 100644 --- a/examples/include/bf16.hpp +++ b/examples/include/bf16.hpp @@ -87,7 +87,7 @@ void convert_fp32_to_bf16(const void* src, void* dst) __attribute__((target("avx void convert_fp32_to_bf16(const void* src, void* dst) { #ifdef CCL_BF16_AVX512BF_COMPILER if (is_avx512bf_enabled()) { - _mm256_storeu_si256((__m256i*)(dst), _mm512_cvtneps_pbh(_mm512_loadu_ps(src))); + _mm256_storeu_si256((__m256i*)(dst), (__m256i)_mm512_cvtneps_pbh(_mm512_loadu_ps(src))); } else #endif diff --git a/examples/include/sycl_base.hpp b/examples/include/sycl_base.hpp index 5fac895d9..c7861d2a7 100644 --- a/examples/include/sycl_base.hpp +++ b/examples/include/sycl_base.hpp @@ -73,7 +73,7 @@ inline bool check_sycl_usm(queue& q, usm::alloc alloc_type) { return ret; } -std::string get_preferred_gpu_platform_name() { +inline std::string get_preferred_gpu_platform_name() { std::string filter; std::string result; @@ -130,7 +130,7 @@ std::string get_preferred_gpu_platform_name() { return result; } -std::vector<sycl::device> create_sycl_gpu_devices() { +inline std::vector<sycl::device> create_sycl_gpu_devices() { constexpr char dev_prefix[] = "-- "; constexpr char sub_dev_prefix[] = "---- "; @@ -138,7 +138,8 @@ std::vector<sycl::device> create_sycl_gpu_devices() { auto plaform_list = sycl::platform::get_platforms(); auto preferred_platform_name = get_preferred_gpu_platform_name(); - cout << "preferred platform: [" << preferred_platform_name << "]\n"; + std::stringstream ss; + ss << "preferred platform: [" << preferred_platform_name << "]\n"; for (const auto& platform : plaform_list) { auto platform_name = platform.get_info<sycl::info::platform::name>(); @@ -146,7 +147,7 @@ std::vector<sycl::device> create_sycl_gpu_devices() { if (platform_name.compare(preferred_platform_name) != 0) continue; - cout << "platform: [" << platform_name << "]\n"; + ss << "platform: [" << platform_name << "]\n"; auto device_list = platform.get_devices(); @@ -154,7 +155,7 @@ std::vector<sycl::device> create_sycl_gpu_devices() { auto device_name = device.get_info<cl::sycl::info::device::name>(); if (!device.is_gpu()) { - cout << dev_prefix << "device [" << device_name << "] is not GPU, skipping\n"; + ss << dev_prefix << "device [" << device_name << "] is not GPU, skipping\n"; continue; } @@ -164,9 +165,9 @@ std::vector<sycl::device> create_sycl_gpu_devices() { part_props.end(), info::partition_property::partition_by_affinity_domain) == part_props.end()) { - cout << dev_prefix << "device [" << device_name - << "] does not support partition by affinity domain" - << ", use root device\n"; + ss << dev_prefix << "device [" << device_name + << "] does not support partition by affinity domain" + << ", use root device\n"; result.push_back(device); continue; } @@ -178,16 +179,16 @@ std::vector<sycl::device> create_sycl_gpu_devices() 
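// str_to_array() gains `inline` above because an explicit full specialization
// of a function template behaves like an ordinary function for linkage:
// defined in a header included by several translation units, it breaks the
// one-definition rule unless declared inline. A reduced model (push_token is a
// name invented for the sketch):
#include <string>
#include <vector>

template <typename T>
inline void push_token(std::vector<T>& out, const std::string& tok) {
    out.push_back(static_cast<T>(std::stod(tok)));
}

// without `inline` here, two .cpp files including this header would fail to link
template <>
inline void push_token<std::string>(std::vector<std::string>& out, const std::string& tok) {
    out.push_back(tok);
}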
{ part_affinity_domains.end(), info::partition_affinity_domain::next_partitionable) == part_affinity_domains.end()) { - cout << dev_prefix << "device [" << device_name - << "] does not support next_partitionable affinity domain" - << ", use root device\n"; + ss << dev_prefix << "device [" << device_name + << "] does not support next_partitionable affinity domain" + << ", use root device\n"; result.push_back(device); continue; } - cout << dev_prefix << "device [" << device_name << "] should provide " - << device.template get_info<info::device::partition_max_sub_devices>() - << " sub-devices\n"; + ss << dev_prefix << "device [" << device_name << "] should provide " + << device.template get_info<info::device::partition_max_sub_devices>() + << " sub-devices\n"; auto sub_devices = device.create_sub_devices<info::partition_property::partition_by_affinity_domain>( @@ -195,19 +196,19 @@ std::vector<sycl::device> create_sycl_gpu_devices() { if (sub_devices.empty()) { /* TODO: remove when SYCL/L0 sub-devices will be supported */ - cout << dev_prefix << "device [" << device_name << "] does not provide sub-devices" - << ", use root device\n"; + ss << dev_prefix << "device [" << device_name << "] does not provide sub-devices" + << ", use root device\n"; result.push_back(device); continue; } - cout << dev_prefix << "device [" << device_name << "] provides " << sub_devices.size() - << " sub-devices\n"; + ss << dev_prefix << "device [" << device_name << "] provides " << sub_devices.size() + << " sub-devices\n"; result.insert(result.end(), sub_devices.begin(), sub_devices.end()); for (auto idx = 0; idx < sub_devices.size(); idx++) { - cout << sub_dev_prefix << "sub-device " << idx << ": [" - << sub_devices[idx].get_info<cl::sycl::info::device::name>() << "]\n"; + ss << sub_dev_prefix << "sub-device " << idx << ": [" + << sub_devices[idx].get_info<cl::sycl::info::device::name>() << "]\n"; } } } @@ -216,13 +217,14 @@ std::vector<sycl::device> create_sycl_gpu_devices() { throw std::runtime_error("no GPU devices found"); } - cout << "found: " << result.size() << " GPU device(s)\n"; + ss << "found: " << result.size() << " GPU device(s)\n"; + printf("%s", ss.str().c_str()); return result; } -std::vector<sycl::queue> create_sycl_queues(const std::string& device_type, - const std::vector<int>& ranks) { +inline std::vector<sycl::queue> create_sycl_queues(const std::string& device_type, + const std::vector<int>& ranks) { std::vector<sycl::device> devices; try { @@ -338,7 +340,7 @@ inline bool create_sycl_queue(int argc, char* argv[], int rank, queue& q) { } } -bool handle_exception(queue& q) { +inline bool handle_exception(queue& q) { try { q.wait_and_throw(); } @@ -349,7 +351,7 @@ bool handle_exception(queue& q) { return true; } -usm::alloc usm_alloc_type_from_string(const string& str) { +inline usm::alloc usm_alloc_type_from_string(const string& str) { const map<string, usm::alloc> names{ { { "host", usm::alloc::host }, { "device", usm::alloc::device }, @@ -368,7 +370,7 @@ usm::alloc usm_alloc_type_from_string(const string& str) { return it->second; } -std::pair<usm::alloc, std::string> take_usm_type(const int argc, char* str_type) { +inline std::pair<usm::alloc, std::string> take_usm_type(const int argc, char* str_type) { std::map<usm::alloc, std::string> map_usm_type; auto usm_alloc_type = usm::alloc::shared; auto str_usm_alloc_type = "shared"; @@ -404,7 +406,11 @@ struct buf_allocator { else if (alloc_type == usm::alloc::shared) ptr = aligned_alloc_shared<T>(alignment, count, q); else - throw 
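// create_sycl_gpu_devices() above switches from many cout << statements to one
// stringstream flushed by a single printf at the end, so output from several
// processes is far less likely to interleave mid-line. The shape of that
// pattern (report_devices is illustrative only):
#include <cstdio>
#include <sstream>

void report_devices(int ndevices) {
    std::stringstream ss;
    for (int i = 0; i < ndevices; ++i)
        ss << "-- device " << i << "\n";
    ss << "found: " << ndevices << " GPU device(s)\n";
    std::printf("%s", ss.str().c_str()); // one call instead of many small writes
}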
std::runtime_error(string(__PRETTY_FUNCTION__) + "unexpected alloc_type"); + throw std::runtime_error(string(__PRETTY_FUNCTION__) + " - unexpected alloc_type"); + + if (!ptr) { + throw std::runtime_error(string(__PRETTY_FUNCTION__) + " - failed to allocate buffer"); + } auto it = memory_storage.find(ptr); if (it != memory_storage.end()) { @@ -415,9 +421,10 @@ struct buf_allocator { auto pointer_type = sycl::get_pointer_type(ptr, q.get_context()); if (pointer_type != alloc_type) - throw std::runtime_error( - string(__PRETTY_FUNCTION__) + "pointer_type " + std::to_string((int)pointer_type) + - " doesn't match with requested " + std::to_string((int)alloc_type)); + throw std::runtime_error(string(__PRETTY_FUNCTION__) + " - pointer_type " + + std::to_string((int)pointer_type) + + " doesn't match with requested " + + std::to_string((int)alloc_type)); return ptr; } diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt index e336d8be1..4bff71065 100644 --- a/examples/sycl/CMakeLists.txt +++ b/examples/sycl/CMakeLists.txt @@ -25,7 +25,7 @@ foreach(src ${sources}) target_link_libraries(${executable} PUBLIC rt) target_link_libraries(${executable} PUBLIC m) target_link_libraries(${executable} PRIVATE ccl) - target_link_libraries(${executable} PRIVATE m) + target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release_mt/) target_link_libraries(${executable} PUBLIC mpi) target_link_libraries(${executable} PRIVATE ${COMPUTE_BACKEND_TARGET_NAME}) install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/sycl OPTIONAL) endforeach() diff --git a/examples/sycl/sycl_allgatherv_custom_usm_test.cpp b/examples/sycl/sycl_allgatherv_custom_usm_test.cpp index 9380a064f..935c6f1c0 100644 --- a/examples/sycl/sycl_allgatherv_custom_usm_test.cpp +++ b/examples/sycl/sycl_allgatherv_custom_usm_test.cpp @@ -98,24 +98,14 @@ int main(int argc, char *argv[]) { }); }); - /* create dependency vector */ - vector<ccl::event> events; - // events.push_back(ccl::create_event(e)); - - if (!handle_exception(q)) - return -1; + /* do not wait for kernel completion; pass the kernel event as a dependency of the operation */ + vector<ccl::event> deps; + deps.push_back(ccl::create_event(e)); /* invoke allgatherv */ auto attr = ccl::create_operation_attr<ccl::allgatherv_attr>(); - ccl::allgatherv(send_buf, - send_count, - recv_buf, - recv_counts, - ccl::datatype::int32, - comm, - stream, - attr, - events) + ccl::allgatherv( + send_buf, send_count, recv_buf, recv_counts, ccl::datatype::int32, comm, stream, attr, deps) .wait(); /* open recv_buf and check its correctness on the device side */ diff --git a/examples/sycl/sycl_allgatherv_test.cpp b/examples/sycl/sycl_allgatherv_test.cpp index 7ac3a48b0..3b2c9f764 100644 --- a/examples/sycl/sycl_allgatherv_test.cpp +++ b/examples/sycl/sycl_allgatherv_test.cpp @@ -87,23 +87,18 @@ int main(int argc, char *argv[]) { } /* open send_buf and modify it on the device side */ - auto e = q.submit([&](auto &h) { + q.submit([&](auto &h) { accessor send_buf_acc(send_buf, h, write_only); h.parallel_for(count, [=](auto id) { send_buf_acc[id] += 1; }); }); - /* create dependency vector */ - vector<ccl::event> events; - //events.push_back(ccl::create_event(e)); - if (!handle_exception(q)) return -1; /* invoke allgatherv */ - auto attr = ccl::create_operation_attr<ccl::allgatherv_attr>(); - ccl::allgatherv(send_buf, count, recv_buf, recv_counts, comm, stream, attr, events).wait(); + ccl::allgatherv(send_buf, count, recv_buf, recv_counts, comm, stream).wait(); /* open recv_buf and check its
correctness on the device side */ q.submit([&](auto &h) { diff --git a/examples/sycl/sycl_allgatherv_usm_test.cpp b/examples/sycl/sycl_allgatherv_usm_test.cpp index a6013485a..3e95c3d1c 100644 --- a/examples/sycl/sycl_allgatherv_usm_test.cpp +++ b/examples/sycl/sycl_allgatherv_usm_test.cpp @@ -90,16 +90,13 @@ int main(int argc, char *argv[]) { }); }); - /* create dependency vector */ - vector<ccl::event> events; - // events.push_back(ccl::create_event(e)); - - if (!handle_exception(q)) - return -1; + /* do not wait for kernel completion; pass the kernel event as a dependency of the operation */ + vector<ccl::event> deps; + deps.push_back(ccl::create_event(e)); /* invoke allgatherv */ auto attr = ccl::create_operation_attr<ccl::allgatherv_attr>(); - ccl::allgatherv(send_buf, count, recv_buf, recv_counts, comm, stream, attr, events).wait(); + ccl::allgatherv(send_buf, count, recv_buf, recv_counts, comm, stream, attr, deps).wait(); /* open recv_buf and check its correctness on the device side */ q.submit([&](auto &h) { diff --git a/examples/sycl/sycl_allreduce_inplace_usm_test.cpp b/examples/sycl/sycl_allreduce_inplace_usm_test.cpp index 4c0605ba2..55bfd3fd2 100644 --- a/examples/sycl/sycl_allreduce_inplace_usm_test.cpp +++ b/examples/sycl/sycl_allreduce_inplace_usm_test.cpp @@ -80,11 +80,13 @@ int main(int argc, char *argv[]) { }); }); - if (!handle_exception(q)) - return -1; + /* do not wait for kernel completion; pass the kernel event as a dependency of the operation */ + vector<ccl::event> deps; + deps.push_back(ccl::create_event(e)); /* invoke allreduce */ - ccl::allreduce(buf, buf, count, ccl::reduction::sum, comm, stream).wait(); + auto attr = ccl::create_operation_attr<ccl::allreduce_attr>(); + ccl::allreduce(buf, buf, count, ccl::reduction::sum, comm, stream, attr, deps).wait(); /* open recv_buf and check its correctness on the device side */ buffer<int> check_buf(count); diff --git a/examples/sycl/sycl_allreduce_test.cpp b/examples/sycl/sycl_allreduce_test.cpp index 6200b3c33..1f94bfbdb 100644 --- a/examples/sycl/sycl_allreduce_test.cpp +++ b/examples/sycl/sycl_allreduce_test.cpp @@ -81,8 +81,7 @@ int main(int argc, char *argv[]) { }); }); - if (!handle_exception(q)) - return -1; + /* do not wait for kernel completion; the dependency is resolved through sycl::buffer */ /* invoke allreduce */ ccl::allreduce(send_buf, recv_buf, count, ccl::reduction::sum, comm, stream).wait(); diff --git a/examples/sycl/sycl_allreduce_usm_test.cpp b/examples/sycl/sycl_allreduce_usm_test.cpp index e2fceb44a..5065a3d39 100644 --- a/examples/sycl/sycl_allreduce_usm_test.cpp +++ b/examples/sycl/sycl_allreduce_usm_test.cpp @@ -82,11 +82,13 @@ int main(int argc, char *argv[]) { }); }); - if (!handle_exception(q)) - return -1; + /* do not wait for kernel completion; pass the kernel event as a dependency of the operation */ + vector<ccl::event> deps; + deps.push_back(ccl::create_event(e)); /* invoke allreduce */ - ccl::allreduce(send_buf, recv_buf, count, ccl::reduction::sum, comm, stream).wait(); + auto attr = ccl::create_operation_attr<ccl::allreduce_attr>(); + ccl::allreduce(send_buf, recv_buf, count, ccl::reduction::sum, comm, stream, attr, deps).wait(); /* open recv_buf and check its correctness on the device side */ buffer<int> check_buf(count); diff --git a/examples/sycl/sycl_alltoall_usm_test.cpp b/examples/sycl/sycl_alltoall_usm_test.cpp index 8fa744a97..ecb75538f 100644 --- a/examples/sycl/sycl_alltoall_usm_test.cpp +++ b/examples/sycl/sycl_alltoall_usm_test.cpp @@ -75,18 +75,20 @@ int main(int argc, char *argv[]) { auto recv_buf =
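// The USM tests above stop waiting on the kernel and instead hand its event to
// the collective as a dependency. The general shape, using only calls that
// appear in this patch; the surrounding setup (queue, communicator, stream,
// buffers) is assumed to exist as in the tests, and the umbrella header name
// is an assumption of this sketch:
#include <vector>
#include <CL/sycl.hpp>
#include "oneapi/ccl.hpp"

void allreduce_after_kernel(sycl::queue& q, ccl::communicator& comm, ccl::stream& stream,
                            int* send_buf, int* recv_buf, size_t count) {
    auto e = q.submit([&](sycl::handler& h) {
        h.parallel_for(count, [=](auto id) {
            send_buf[id] = 1;
        });
    });
    std::vector<ccl::event> deps;
    deps.push_back(ccl::create_event(e)); // wrap the SYCL event for oneCCL
    auto attr = ccl::create_operation_attr<ccl::allreduce_attr>();
    ccl::allreduce(send_buf, recv_buf, count, ccl::reduction::sum, comm, stream, attr, deps).wait();
}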
allocator.allocate(count * size, usm_alloc_type); /* open buffers and modify them on the device side */ - q.submit([&](auto &h) { + auto e = q.submit([&](auto &h) { h.parallel_for(count * size, [=](auto id) { send_buf[id] = id / count + 1; recv_buf[id] = -1; }); }); - if (!handle_exception(q)) - return -1; + /* do not wait for kernel completion; pass the kernel event as a dependency of the operation */ + vector<ccl::event> deps; + deps.push_back(ccl::create_event(e)); /* invoke alltoall */ - ccl::alltoall(send_buf, recv_buf, count, comm, stream).wait(); + auto attr = ccl::create_operation_attr<ccl::alltoall_attr>(); + ccl::alltoall(send_buf, recv_buf, count, comm, stream, attr, deps).wait(); /* open recv_buf and check its correctness on the device side */ buffer<int> check_buf(count * size); diff --git a/examples/sycl/sycl_alltoallv_usm_test.cpp b/examples/sycl/sycl_alltoallv_usm_test.cpp index 5f23ad973..36b2b2d1d 100644 --- a/examples/sycl/sycl_alltoallv_usm_test.cpp +++ b/examples/sycl/sycl_alltoallv_usm_test.cpp @@ -78,18 +78,20 @@ int main(int argc, char *argv[]) { vector<size_t> recv_counts(size, count); /* open buffers and modify them on the device side */ - q.submit([&](auto &h) { + auto e = q.submit([&](auto &h) { h.parallel_for(count * size, [=](auto id) { send_buf[id] = id / count + 1; recv_buf[id] = -1; }); }); - if (!handle_exception(q)) - return -1; + /* do not wait for kernel completion; pass the kernel event as a dependency of the operation */ + vector<ccl::event> deps; + deps.push_back(ccl::create_event(e)); /* invoke alltoallv */ - ccl::alltoallv(send_buf, send_counts, recv_buf, recv_counts, comm, stream).wait(); + auto attr = ccl::create_operation_attr<ccl::alltoallv_attr>(); + ccl::alltoallv(send_buf, send_counts, recv_buf, recv_counts, comm, stream, attr, deps).wait(); /* open recv_buf and check its correctness on the device side */ buffer<int> check_buf(count * size); diff --git a/examples/sycl/sycl_broadcast_test.cpp b/examples/sycl/sycl_broadcast_test.cpp index 1976afdd5..d03d6e33d 100644 --- a/examples/sycl/sycl_broadcast_test.cpp +++ b/examples/sycl/sycl_broadcast_test.cpp @@ -63,24 +63,21 @@ int main(int argc, char *argv[]) { /* create buffers */ buffer<int> buf(count); - { + if (rank == root_rank) { /* open buf and initialize it on the host side */ host_accessor send_buf_acc(buf, write_only); for (i = 0; i < count; i++) { - if (rank == root_rank) - send_buf_acc[i] = rank + 10; - else - send_buf_acc[i] = 0; + send_buf_acc[i] = 10; } - } - /* open buf and modify it on the device side */ - q.submit([&](auto &h) { - accessor send_buf_acc(buf, h, write_only); - h.parallel_for(count, [=](auto id) { - send_buf_acc[id] += 1; + /* open buf and modify it on the device side */ + q.submit([&](auto &h) { + accessor send_buf_acc(buf, h, write_only); + h.parallel_for(count, [=](auto id) { + send_buf_acc[id] += 1; + }); }); - }); + } if (!handle_exception(q)) return -1; @@ -92,7 +89,7 @@ int main(int argc, char *argv[]) { q.submit([&](auto &h) { accessor recv_buf_acc(buf, h, write_only); h.parallel_for(count, [=](auto id) { - if (recv_buf_acc[id] != root_rank + 11) { + if (recv_buf_acc[id] != 11) { recv_buf_acc[id] = -1; } }); diff --git a/examples/sycl/sycl_broadcast_usm_test.cpp b/examples/sycl/sycl_broadcast_usm_test.cpp index 78b95af82..1f47abfc8 100644 --- a/examples/sycl/sycl_broadcast_usm_test.cpp +++ b/examples/sycl/sycl_broadcast_usm_test.cpp @@ -74,31 +74,29 @@ int main(int argc, char *argv[]) { /* create buffers */ auto buf = allocator.allocate(count, usm_alloc_type); /* open buffers and
modify them on the device side */ - q.submit([&](auto &h) { - h.parallel_for(count, [=](auto id) { - if (rank == root_rank) { - buf[id] = root_rank + 10; - } - else { - buf[id] = 0; - } - buf[id] += 1; + /* do not wait completion of kernel and provide it as dependency for operation */ + vector<ccl::event> deps; + + if (rank == root_rank) { + /* open buffers and modify them on the device side */ + auto e = q.submit([&](auto &h) { + h.parallel_for(count, [=](auto id) { + buf[id] = 10; + }); }); - }); - - if (!handle_exception(q)) - return -1; + deps.push_back(ccl::create_event(e)); + } /* invoke broadcast */ - ccl::broadcast(buf, count, root_rank, comm, stream).wait(); + auto attr = ccl::create_operation_attr<ccl::broadcast_attr>(); + ccl::broadcast(buf, count, root_rank, comm, stream, attr, deps).wait(); /* open buf and check its correctness on the device side */ buffer<int> check_buf(count * size); q.submit([&](auto &h) { accessor check_buf_acc(check_buf, h, write_only); h.parallel_for(count, [=](auto id) { - if (buf[id] != root_rank + 11) { + if (buf[id] != 10) { check_buf_acc[id] = -1; } }); diff --git a/include/oneapi/ccl/communicator.hpp b/include/oneapi/ccl/communicator.hpp index 194403d2c..b2046f235 100644 --- a/include/oneapi/ccl/communicator.hpp +++ b/include/oneapi/ccl/communicator.hpp @@ -102,7 +102,7 @@ class communicator final : public ccl_api_base_movable<communicator, template <class... attr_val_type> stream create_stream(attr_val_type&&... avs) { // return stream::create_stream_from_attr(get_device(), get_context(), std::forward<attr_val_type>(avs)...); - throw; + throw ccl::unsupported("API", "create_stream"); } communicator split(const comm_split_attr& attr); diff --git a/include/oneapi/ccl/config.h.in b/include/oneapi/ccl/config.h.in index 3980df027..1f2ad4e5f 100644 --- a/include/oneapi/ccl/config.h.in +++ b/include/oneapi/ccl/config.h.in @@ -46,6 +46,3 @@ /* Auto-generated configuration settings for multi GPU support*/ #cmakedefine MULTI_GPU_SUPPORT - -/* Configuration setting for truncate v/s RNE rounding mode */ -#cmakedefine CCL_GPU_BF16_TRUNCATE diff --git a/include/oneapi/ccl/environment.hpp b/include/oneapi/ccl/environment.hpp index b087bffcc..3956dff89 100644 --- a/include/oneapi/ccl/environment.hpp +++ b/include/oneapi/ccl/environment.hpp @@ -179,12 +179,6 @@ class environment { return event::create_from_native(native_event); } - template <class event_handle_type, - class = typename std::enable_if<is_event_supported<event_handle_type>()>::type> - event create_event(event_handle_type& native_event_handle, event::context_t& context) { - return event::create_from_native(native_event_handle, context); - } - /******************** STREAM ********************/ template <class native_stream_type, diff --git a/include/oneapi/ccl/exception.hpp b/include/oneapi/ccl/exception.hpp index a5d03b400..6de627142 100644 --- a/include/oneapi/ccl/exception.hpp +++ b/include/oneapi/ccl/exception.hpp @@ -44,7 +44,7 @@ class exception : public std::exception { msg = std::string("oneCCL: ") + std::string(info); } - const char *what() const noexcept { + const char *what() const noexcept override { return msg.c_str(); } }; diff --git a/include/oneapi/ccl/native_device_api/interop_utils.hpp b/include/oneapi/ccl/native_device_api/interop_utils.hpp index 946babad0..02bd77083 100644 --- a/include/oneapi/ccl/native_device_api/interop_utils.hpp +++ b/include/oneapi/ccl/native_device_api/interop_utils.hpp @@ -36,6 +36,7 @@ using assoc_result = std::tuple<usm_support_mode, const void*, 
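
Two small hardening changes land here: communicator::create_stream now throws a typed ccl::unsupported instead of a bare throw;, and exception::what() is marked override. Assuming ccl::unsupported derives from ccl::exception, as the rest of this header family suggests, a caller can now report the condition cleanly:

    try {
        auto stream = comm.create_stream(); /* unimplemented in this revision */
    }
    catch (const ccl::exception &e) {
        /* the message carries the "oneCCL: " prefix set in the
           exception constructor shown above */
        std::cerr << e.what() << '\n';
    }
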
std::string>; enum assoc_result_index { SUPPORT_MODE = 0, POINTER_VALUE, ERROR_CAUSE }; #if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL) +// TODO: move to src assoc_result check_assoc_device_memory(const void* mem, const ccl::unified_device_type::ccl_native_t& device, const ccl::unified_context_type::ccl_native_t& ctx); diff --git a/include/oneapi/ccl/native_device_api/l0/base.hpp b/include/oneapi/ccl/native_device_api/l0/base.hpp index 74fa79d85..16cb6942d 100644 --- a/include/oneapi/ccl/native_device_api/l0/base.hpp +++ b/include/oneapi/ccl/native_device_api/l0/base.hpp @@ -34,6 +34,7 @@ namespace native { * Base RAII L0 handles wrappper * support serialize/deserialize concept */ + template <class handle_type, class resource_owner, class cl_context> class cl_base { public: diff --git a/include/oneapi/ccl/native_device_api/l0/context.hpp b/include/oneapi/ccl/native_device_api/l0/context.hpp index 24ae091be..4a49bdc96 100644 --- a/include/oneapi/ccl/native_device_api/l0/context.hpp +++ b/include/oneapi/ccl/native_device_api/l0/context.hpp @@ -25,6 +25,8 @@ struct ccl_device_platform; struct ccl_device_driver; struct ccl_subdevice; struct ccl_device; +struct ccl_event_pool_holder; +class ccl_event_pool; // TODO not thread-safe!!! struct ccl_context : public cl_base<ze_context_handle_t, ccl_device_platform, ccl_context>, @@ -42,7 +44,10 @@ struct ccl_context : public cl_base<ze_context_handle_t, ccl_device_platform, cc template <class elem_t> using host_memory_ptr = std::shared_ptr<host_memory<elem_t>>; + using ccl_event_pool_ptr = std::shared_ptr<ccl_event_pool>; + ccl_context(handle_t h, owner_ptr_t&& platform); + ~ccl_context(); static const ze_host_mem_alloc_desc_t& get_default_host_alloc_desc(); @@ -71,12 +76,22 @@ struct ccl_context : public cl_base<ze_context_handle_t, ccl_device_platform, cc host_free_memory(static_cast<void*>(mem_handle)); } + // event pool + ccl_event_pool_ptr create_event_pool(std::initializer_list<ccl_device*> devices, + const ze_event_pool_desc_t& descr); + std::vector<std::shared_ptr<ccl_event_pool>> get_shared_event_pool( + std::initializer_list<ccl_device*> devices = {}); + std::vector<std::shared_ptr<ccl_event_pool>> get_shared_event_pool( + std::initializer_list<ccl_device*> devices = {}) const; + private: void* host_alloc_memory(size_t bytes_count, size_t alignment, const ze_host_mem_alloc_desc_t& host_desc); void host_free_memory(void* mem_handle); + + std::shared_ptr<ccl_event_pool_holder> pool_holder; }; class context_array_t { diff --git a/include/oneapi/ccl/native_device_api/l0/declarations.hpp b/include/oneapi/ccl/native_device_api/l0/declarations.hpp index 07689c7a3..68e9eaa30 100644 --- a/include/oneapi/ccl/native_device_api/l0/declarations.hpp +++ b/include/oneapi/ccl/native_device_api/l0/declarations.hpp @@ -23,6 +23,7 @@ #include "oneapi/ccl/native_device_api/l0/context.hpp" #include "oneapi/ccl/native_device_api/l0/device.hpp" +#include "oneapi/ccl/native_device_api/l0/event_pool.hpp" #include "oneapi/ccl/native_device_api/l0/subdevice.hpp" #include "oneapi/ccl/native_device_api/l0/driver.hpp" #include "oneapi/ccl/native_device_api/l0/platform.hpp" diff --git a/include/oneapi/ccl/native_device_api/l0/device.hpp b/include/oneapi/ccl/native_device_api/l0/device.hpp index 0995d3e4f..d4f7e24cc 100644 --- a/include/oneapi/ccl/native_device_api/l0/device.hpp +++ b/include/oneapi/ccl/native_device_api/l0/device.hpp @@ -50,9 +50,9 @@ struct ccl_device : public cl_base<ze_device_handle_t, ccl_device_driver, ccl_co using const_subdevice_ptr = 
std::shared_ptr<const ccl_subdevice>; using sub_devices_container_type = std::map<ccl::index_type, subdevice_ptr>; - template <class elem_t> + template <class elem_t = uint8_t> using device_memory = memory<elem_t, ccl_device, ccl_context>; - template <class elem_t> + template <class elem_t = uint8_t> using device_memory_ptr = std::shared_ptr<memory<elem_t, ccl_device, ccl_context>>; using device_ipc_memory = ipc_memory<ccl_device, ccl_context>; @@ -65,7 +65,7 @@ struct ccl_device : public cl_base<ze_device_handle_t, ccl_device_driver, ccl_co using device_cmd_list = cmd_list<ccl_device, ccl_context>; using device_module = module<ccl_device, ccl_context>; using device_module_ptr = std::shared_ptr<device_module>; - using device_event = event<ccl_device, ccl_context>; + using device_event = event; using indexed_handles = indexed_storage<handle_t>; ccl_device(handle_t h, owner_ptr_t&& parent, std::weak_ptr<ccl_context_holder>&& ctx); diff --git a/include/oneapi/ccl/native_device_api/l0/event_pool.hpp b/include/oneapi/ccl/native_device_api/l0/event_pool.hpp new file mode 100644 index 000000000..343415301 --- /dev/null +++ b/include/oneapi/ccl/native_device_api/l0/event_pool.hpp @@ -0,0 +1,103 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#pragma once + +#include <atomic> +#include <mutex> //TODO use shared + +#include "oneapi/ccl/native_device_api/l0/base.hpp" +#include "oneapi/ccl/native_device_api/l0/primitives.hpp" +#include "oneapi/ccl/native_device_api/l0/utils.hpp" + +namespace native { +struct ccl_context; +struct ccl_device; +class ccl_event_pool; + +class event_pool_array_t { +public: + using value_type = std::vector<std::shared_ptr<ccl_event_pool>>; + using context_array_accessor = detail::unique_accessor<std::mutex, value_type>; + using const_context_array_accessor = detail::unique_accessor<std::mutex, const value_type>; + + context_array_accessor access(); + const_context_array_accessor access() const; + +private: + mutable std::mutex m; + value_type event_pools; +}; + +struct ccl_event_pool_holder { + ze_event_pool_handle_t get(); + std::shared_ptr<ccl_event_pool> emplace(const std::initializer_list<ccl_device*>& devices, + std::shared_ptr<ccl_event_pool> pool); + + std::vector<std::shared_ptr<ccl_event_pool>> get_event_pool_storage( + std::initializer_list<ccl_device*> devices); + std::vector<std::shared_ptr<ccl_event_pool>> get_event_pool_storage( + std::initializer_list<ccl_device*> devices) const; + + void on_delete(ze_event_pool_handle_t pool_handle, ze_context_handle_t& ctx); + +private: + mutable std::mutex m; + std::map<const ccl_device*, event_pool_array_t> contexts_pool; +}; + +class ccl_event_pool : public cl_base<ze_event_pool_handle_t, ccl_event_pool_holder, ccl_context>, + public std::enable_shared_from_this<ccl_event_pool> { +public: + using base = cl_base<ze_event_pool_handle_t, ccl_event_pool_holder, ccl_context>; + using handle_t = base::handle_t; + using base::owner_t; + using base::owner_ptr_t; + using base::context_t; + using base::context_ptr_t; + using event_ptr = std::shared_ptr<event>; + + static const ze_event_desc_t& get_default_event_desc() { + static ze_event_desc_t def = { + ZE_STRUCTURE_TYPE_EVENT_DESC, + nullptr, + 0, // index + 0, // no additional memory/cache coherency required on signal + ZE_EVENT_SCOPE_FLAG_HOST // ensure memory coherency across device and Host after event completes + }; + return def; + } + + ccl_event_pool(const ze_event_pool_desc_t& descr, + handle_t h, + owner_ptr_t&& holder, + context_ptr_t&& ctx); + ~ccl_event_pool(); + + std::shared_ptr<ccl_event_pool> get_ptr() { + return this->shared_from_this(); + } + + event_ptr create_event(const ze_event_desc_t& descr = ccl_event_pool::get_default_event_desc()); + void on_delete(ze_event_handle_t event_handle, ze_context_handle_t& ctx); + + const ze_event_pool_desc_t& get_pool_description() const; + size_t get_allocated_events() const; + +private: + ze_event_pool_desc_t pool_description; + std::atomic<size_t> allocated_event_count; +}; +} // namespace native diff --git a/include/oneapi/ccl/native_device_api/l0/primitives.hpp b/include/oneapi/ccl/native_device_api/l0/primitives.hpp index a22b2f782..303e15750 100644 --- a/include/oneapi/ccl/native_device_api/l0/primitives.hpp +++ b/include/oneapi/ccl/native_device_api/l0/primitives.hpp @@ -23,6 +23,8 @@ namespace native { struct ccl_device_platform; +class ccl_event_pool; +struct ccl_context; std::string to_string(const ze_result_t result); std::string to_string(ze_memory_type_t type); @@ -68,8 +70,18 @@ template <class elem_t, struct memory; */ -template <class resource_owner, class cl_context> -using event = cl_base<ze_event_handle_t, resource_owner, cl_context>; +struct event : private cl_base<ze_event_handle_t, ccl_event_pool, ccl_context> { + using base = 
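
Putting the new pieces together (the create_event_pool hook on ccl_context, the ccl_event_pool above, and the event wrapper that follows in primitives.hpp), the intended lifecycle looks roughly like this. A sketch only: ctx stands for an already-obtained shared_ptr<ccl_context> and dev for a raw ccl_device*:

    /* Level Zero pool descriptor: one host-visible event slot */
    ze_event_pool_desc_t pool_desc = {
        ZE_STRUCTURE_TYPE_EVENT_POOL_DESC,
        nullptr,
        ZE_EVENT_POOL_FLAG_HOST_VISIBLE,
        1 /* event count */
    };

    auto pool = ctx->create_event_pool({ dev }, pool_desc);

    /* slot 0, using get_default_event_desc(): host-scope signal visibility */
    auto ev = pool->create_event();

    ev->signal();
    ev->wait(); /* default timeout is numeric_limits<uint64_t>::max() */
    /* pool->get_allocated_events() == 1 at this point */
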
cl_base<ze_event_handle_t, ccl_event_pool, ccl_context>; + using base::get_owner; + using base::get_ctx; + using base::handle; + + using base::base; + + bool wait(uint64_t nanosec = std::numeric_limits<uint64_t>::max()) const; + ze_result_t status() const; + void signal(); +}; template <class elem_t, class resource_owner, class cl_context> struct memory /*<elem_t, resource_owner, cl_context>*/ : private cl_base<elem_t*, @@ -80,6 +92,8 @@ struct memory /*<elem_t, resource_owner, cl_context>*/ : private cl_base<elem_t* using base::get_ctx; using base::handle; + using event_t = event; + memory(elem_t* h, size_t count, std::weak_ptr<resource_owner>&& owner, @@ -98,15 +112,16 @@ struct memory /*<elem_t, resource_owner, cl_context>*/ : private cl_base<elem_t* void enqueue_write_sync(const elem_t* src, int n); // async - queue_fence<resource_owner, cl_context> enqueue_write_async( - const std::vector<elem_t>& src, - queue<resource_owner, cl_context>& queue); + event_t enqueue_write_async(const std::vector<elem_t>& src, + queue<resource_owner, cl_context>& queue); + event_t enqueue_write_async(typename std::vector<elem_t>::const_iterator first, + typename std::vector<elem_t>::const_iterator last); template <int N> - queue_fence<resource_owner, cl_context> enqueue_write_async( - const std::array<elem_t, N>& src, - queue<resource_owner, cl_context>& queue); - queue_fence<resource_owner, cl_context> - enqueue_write_async(const elem_t* src, size_t n, queue<resource_owner, cl_context>& queue); + event_t enqueue_write_async(const std::array<elem_t, N>& src, + queue<resource_owner, cl_context>& queue); + event_t enqueue_write_async(const elem_t* src, + size_t n, + queue<resource_owner, cl_context>& queue); // sync memory-copy read std::vector<elem_t> enqueue_read_sync(size_t requested_size = 0) const; diff --git a/include/oneapi/ccl/native_device_api/l0/primitives_impl.hpp b/include/oneapi/ccl/native_device_api/l0/primitives_impl.hpp index 0ccbf3bfb..76137fae4 100644 --- a/include/oneapi/ccl/native_device_api/l0/primitives_impl.hpp +++ b/include/oneapi/ccl/native_device_api/l0/primitives_impl.hpp @@ -27,6 +27,7 @@ namespace native { struct ccl_device; namespace detail { + void copy_memory_sync_unsafe(void* dst, const void* src, size_t size, @@ -37,6 +38,19 @@ void copy_memory_sync_unsafe(void* dst, size_t size, std::weak_ptr<ccl_context> ctx_weak, std::shared_ptr<ccl_context> ctx); + +event copy_memory_async_unsafe(void* dst, + const void* src, + size_t size, + std::weak_ptr<ccl_device> device_weak, + std::shared_ptr<ccl_context> ctx, + queue<ccl_device, ccl_context>& q); +event copy_memory_async_unsafe(void* dst, + const void* src, + size_t size, + std::weak_ptr<ccl_context> ctx_weak, + std::shared_ptr<ccl_context> ctx, + queue<ccl_device, ccl_context>& q); } // namespace detail template <TEMPLATE_DECL_ARG> @@ -66,7 +80,82 @@ template <TEMPLATE_DECL_ARG> size_t memory<TEMPLATE_DEF_ARG>::size() const noexcept { return count() * sizeof(elem_t); } +/* +// async operations +template <TEMPLATE_DECL_ARG> +typename memory<TEMPLATE_DEF_ARG>::event_t +memory<TEMPLATE_DEF_ARG>::enqueue_write_async( + const std::vector<elem_t>& src, + queue<resource_owner, cl_context>& queue) +{ + if (count() < src.size()) { + throw std::length_error( + std::string(__PRETTY_FUNCTION__) + + "\nCannot process 'enqueue_write_async', because memory has not enough size" + + ", expected: " + std::to_string(count()) + + ", requested: " + std::to_string(src.size())); + } + + TODO +} +template <TEMPLATE_DECL_ARG> +typename 
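
With queue_fence replaced by the new event wrapper as the return type, an asynchronous host-to-device write can be awaited, or chained, like any other L0 event. The call shape is sketched below; the definitions are still stubbed out in the impl header that follows, so this reflects the declared interface only (dev_mem and cmd_queue are assumed):

    std::vector<int> host_src(dev_mem.count(), 42);

    /* returns native::event now, not queue_fence<...> */
    auto ev = dev_mem.enqueue_write_async(host_src, cmd_queue);

    /* ... enqueue unrelated work here ... */

    if (!ev.wait(1000000000ULL /* 1 s in ns */)) {
        /* timed out; ev.status() exposes the raw ze_result_t */
    }
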
memory<TEMPLATE_DEF_ARG>::event_t +enqueue_write_async(typename std::vector<elem_t>::const_iterator first, + typename std::vector<elem_t>::const_iterator last, + queue<resource_owner, cl_context>& queue) +{ + size_t requested_size = std::distance(first, last); + if (count() < requested_size) { + throw std::length_error( + std::string(__PRETTY_FUNCTION__) + + "\nCannot process 'enqueue_write_async', because memory has not enough size" + + ", expected: " + std::to_string(count()) + + ", requested range size: " + std::to_string(requested_size)); + } + TODO +} + +template <TEMPLATE_DECL_ARG> +template <int N> +typename memory<TEMPLATE_DEF_ARG>::event_t +memory<TEMPLATE_DEF_ARG>::enqueue_write_async( + const std::array<elem_t, N>& src, + queue<resource_owner, cl_context>& queue) +{ + if (count() < N) { + throw std::length_error( + std::string(__PRETTY_FUNCTION__) + + "\nCannot process 'enqueue_write_async', because memory has not enough size" + + ", expected: " + std::to_string(count()) + + ", requested array count: " + std::to_string(N)); + } + + TODO +} + +template <TEMPLATE_DECL_ARG> +typename memory<TEMPLATE_DEF_ARG>::event_t +memory<TEMPLATE_DEF_ARG>::enqueue_write_async(const elem_t* src, size_t src_elem_count, queue<resource_owner, cl_context>& queue) +{ + if (!src) { + throw std::invalid_argument( + std::string(__PRETTY_FUNCTION__) + + "\nCannot process 'enqueue_write_async', because 'src' is 'nullptr'"); + } + + if (count() < src_elem_count) { + throw std::length_error( + std::string(__PRETTY_FUNCTION__) + + "\nCannot process 'enqueue_write_async', because memory has not enough size" + + ", expected: " + std::to_string(count()) + + ", requested c-array count: " + std::to_string(src_elem_count * sizeof(elem_t))); + } + + TODO +} +*/ +// sync operations template <TEMPLATE_DECL_ARG> void memory<TEMPLATE_DEF_ARG>::enqueue_write_sync(const std::vector<elem_t>& src) { if (count() < src.size()) { @@ -135,11 +224,11 @@ void memory<TEMPLATE_DEF_ARG>::enqueue_write_sync(const elem_t* src, size_t src_ "\nCannot process 'enqueue_write_sync', because 'src' is 'nullptr'"); } - if (count() < src_elem_count * sizeof(elem_t)) { + if (size() < src_elem_count * sizeof(elem_t)) { throw std::length_error( std::string(__PRETTY_FUNCTION__) + "\nCannot process 'enqueue_write_sync', because memory has not enough size" + - ", expected: " + std::to_string(count()) + + ", expected: " + std::to_string(size()) + ", requested: " + std::to_string(src_elem_count * sizeof(elem_t))); } diff --git a/include/oneapi/ccl/types_policy.hpp b/include/oneapi/ccl/types_policy.hpp index 40e3c9228..416a65dfe 100644 --- a/include/oneapi/ccl/types_policy.hpp +++ b/include/oneapi/ccl/types_policy.hpp @@ -15,6 +15,8 @@ */ #pragma once +#include <memory> + namespace ccl { template <class impl_t> class non_copyable { diff --git a/mpi/bin/hydra_bstrap_proxy b/mpi/bin/hydra_bstrap_proxy deleted file mode 100755 index bb46998fe..000000000 Binary files a/mpi/bin/hydra_bstrap_proxy and /dev/null differ diff --git a/mpi/bin/hydra_pmi_proxy b/mpi/bin/hydra_pmi_proxy deleted file mode 100755 index 2b6be410f..000000000 Binary files a/mpi/bin/hydra_pmi_proxy and /dev/null differ diff --git a/mpi/bin/mpiexec b/mpi/bin/mpiexec deleted file mode 120000 index 482a69296..000000000 --- a/mpi/bin/mpiexec +++ /dev/null @@ -1 +0,0 @@ -mpiexec.hydra \ No newline at end of file diff --git a/mpi/bin/mpiexec.hydra b/mpi/bin/mpiexec.hydra deleted file mode 100755 index 317678932..000000000 Binary files a/mpi/bin/mpiexec.hydra and /dev/null differ diff --git 
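
The last hunk above is a units fix: in enqueue_write_sync the left side of the bounds check was in elements (count()) while the right side is in bytes, so capacity was under-reported by a factor of sizeof(elem_t). Since size() == count() * sizeof(elem_t), as defined earlier in this header, both sides are now bytes. A worked case:

    /* memory<int> holding 8 elements: count() == 8, size() == 32 */
    size_t src_elem_count = 6;

    /* old check: count() < src_elem_count * sizeof(int)
                  8       < 24    -> spurious length_error */
    /* new check: size()  < src_elem_count * sizeof(int)
                  32      < 24    -> passes, as it should  */
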
a/mpi/etc/tuning_clx-ap_shm.dat b/mpi/etc/tuning_clx-ap_shm.dat deleted file mode 100755 index 35991b50a..000000000 Binary files a/mpi/etc/tuning_clx-ap_shm.dat and /dev/null differ diff --git a/mpi/lib/libmpi.so b/mpi/lib/libmpi.so deleted file mode 120000 index 9e4b9f431..000000000 --- a/mpi/lib/libmpi.so +++ /dev/null @@ -1 +0,0 @@ -libmpi.so.12.0 \ No newline at end of file diff --git a/mpi/lib/libmpi.so.12 b/mpi/lib/libmpi.so.12 deleted file mode 120000 index 5a0e391d4..000000000 --- a/mpi/lib/libmpi.so.12 +++ /dev/null @@ -1 +0,0 @@ -libmpi.so.12.0.0 \ No newline at end of file diff --git a/mpi/lib/libmpi.so.12.0 b/mpi/lib/libmpi.so.12.0 deleted file mode 120000 index 5a0e391d4..000000000 --- a/mpi/lib/libmpi.so.12.0 +++ /dev/null @@ -1 +0,0 @@ -libmpi.so.12.0.0 \ No newline at end of file diff --git a/mpi/lib/libmpicxx.so b/mpi/lib/libmpicxx.so deleted file mode 120000 index 9e27e2a69..000000000 --- a/mpi/lib/libmpicxx.so +++ /dev/null @@ -1 +0,0 @@ -libmpicxx.so.12.0.0 \ No newline at end of file diff --git a/mpi/lib/libmpicxx.so.12 b/mpi/lib/libmpicxx.so.12 deleted file mode 120000 index 9e27e2a69..000000000 --- a/mpi/lib/libmpicxx.so.12 +++ /dev/null @@ -1 +0,0 @@ -libmpicxx.so.12.0.0 \ No newline at end of file diff --git a/mpi/lib/libmpicxx.so.12.0 b/mpi/lib/libmpicxx.so.12.0 deleted file mode 120000 index 9e27e2a69..000000000 --- a/mpi/lib/libmpicxx.so.12.0 +++ /dev/null @@ -1 +0,0 @@ -libmpicxx.so.12.0.0 \ No newline at end of file diff --git a/mpi/lib/libmpifort.so b/mpi/lib/libmpifort.so deleted file mode 120000 index 3dc64470d..000000000 --- a/mpi/lib/libmpifort.so +++ /dev/null @@ -1 +0,0 @@ -libmpifort.so.12.0.0 \ No newline at end of file diff --git a/mpi/lib/libmpifort.so.12 b/mpi/lib/libmpifort.so.12 deleted file mode 120000 index 3dc64470d..000000000 --- a/mpi/lib/libmpifort.so.12 +++ /dev/null @@ -1 +0,0 @@ -libmpifort.so.12.0.0 \ No newline at end of file diff --git a/mpi/lib/libmpifort.so.12.0 b/mpi/lib/libmpifort.so.12.0 deleted file mode 120000 index 3dc64470d..000000000 --- a/mpi/lib/libmpifort.so.12.0 +++ /dev/null @@ -1 +0,0 @@ -libmpifort.so.12.0.0 \ No newline at end of file diff --git a/ofi/bin/fi_info b/ofi/bin/fi_info deleted file mode 100755 index 1dc3a2707..000000000 Binary files a/ofi/bin/fi_info and /dev/null differ diff --git a/ofi/lib/libfabric.so b/ofi/lib/libfabric.so deleted file mode 120000 index 878a6164e..000000000 --- a/ofi/lib/libfabric.so +++ /dev/null @@ -1 +0,0 @@ -libfabric.so.1 \ No newline at end of file diff --git a/ofi/lib/libfabric.so.1 b/ofi/lib/libfabric.so.1 deleted file mode 100755 index eea8f4068..000000000 Binary files a/ofi/lib/libfabric.so.1 and /dev/null differ diff --git a/ofi/lib/prov/libpsmx2-fi.so b/ofi/lib/prov/libpsmx2-fi.so deleted file mode 100755 index b2afe721e..000000000 Binary files a/ofi/lib/prov/libpsmx2-fi.so and /dev/null differ diff --git a/ofi/lib/prov/librxm-fi.so b/ofi/lib/prov/librxm-fi.so deleted file mode 100755 index 6fe75fd9c..000000000 Binary files a/ofi/lib/prov/librxm-fi.so and /dev/null differ diff --git a/ofi/lib/prov/libshm-fi.so b/ofi/lib/prov/libshm-fi.so deleted file mode 100755 index 71c0c931f..000000000 Binary files a/ofi/lib/prov/libshm-fi.so and /dev/null differ diff --git a/ofi/lib/prov/libsockets-fi.so b/ofi/lib/prov/libsockets-fi.so deleted file mode 100755 index d548a6979..000000000 Binary files a/ofi/lib/prov/libsockets-fi.so and /dev/null differ diff --git a/ofi/lib/prov/libtcp-fi.so b/ofi/lib/prov/libtcp-fi.so deleted file mode 100755 index f9873e173..000000000 Binary files 
a/ofi/lib/prov/libtcp-fi.so and /dev/null differ diff --git a/ofi/lib/prov/libverbs-fi.so b/ofi/lib/prov/libverbs-fi.so deleted file mode 100755 index 471d97946..000000000 Binary files a/ofi/lib/prov/libverbs-fi.so and /dev/null differ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ecf3e257c..356e0b49a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # -#builds ccl +# builds CCL set (EXTENSIONS_SRC) @@ -36,6 +36,7 @@ list (APPEND EXTENSIONS_SRC native_device_api/l0/base.cpp native_device_api/l0/device.cpp native_device_api/l0/context.cpp + native_device_api/l0/event_pool.cpp native_device_api/l0/subdevice.cpp native_device_api/l0/driver.cpp native_device_api/l0/export.cpp @@ -80,16 +81,17 @@ list (APPEND EXTENSIONS_SRC common/comm/l0/topology/ring/process_group_ring_creator.cpp common/comm/l0/topology/topology_construction_utils.cpp - common/comm/l0/context/scaling_ctx/ipc_ctx_session.cpp - common/comm/l0/context/scaling_ctx/ipc_ctx_utils.cpp - common/comm/l0/context/scaling_ctx/ipc_session_key.cpp + common/comm/l0/context/scale/ipc/ipc_ctx_session.cpp + common/comm/l0/context/scale/ipc/ipc_ctx_utils.cpp + common/comm/l0/context/scale/ipc/ipc_session_key.cpp - common/comm/l0/context/scaling_ctx/observer_ctx_session.cpp - common/comm/l0/context/scaling_ctx/observer_session_key.cpp + common/comm/l0/context/scale/base/base_session.cpp + common/comm/l0/context/scale/scale_out/scale_out_session.cpp common/comm/l0/gpu_comm_attr.cpp common/comm/l0/modules/base_entry_module.cpp - common/comm/l0/modules/modules_source_data.cpp) + common/comm/l0/modules/modules_source_data.cpp + common/comm/l0/modules/kernel_utils.cpp) endif(MULTI_GPU_SUPPORT) set(CCL_SRC @@ -117,7 +119,6 @@ set(CCL_SRC ccl_empty_stream.cpp native_device_api/sycl_l0/export.cpp native_device_api/empty/export.cpp - atl/atl.cpp atl/atl_wrapper.cpp atl/mpi/atl_mpi.cpp atl/ofi/atl_ofi.cpp @@ -147,10 +148,12 @@ set(CCL_SRC coll/ccl_reduce_scatter_op_attr.cpp coll/ccl_sparse_allreduce_op_attr.cpp coll/ccl_barrier_op_attr.cpp + coll/coll_param.cpp coll/algorithms/allgatherv.cpp coll/algorithms/allreduce/allreduce.cpp coll/algorithms/allreduce/allreduce_2d.cpp coll/algorithms/allreduce/allreduce_rma.cpp + coll/algorithms/algorithm_utils.cpp coll/algorithms/alltoall.cpp coll/algorithms/alltoallv.cpp coll/algorithms/barrier.cpp @@ -173,19 +176,21 @@ set(CCL_SRC comp/comp.cpp comp/fp16/fp16.cpp comp/fp16/fp16_intrisics.cpp + hwloc/hwloc_wrapper.c sched/sched.cpp sched/extra_sched.cpp sched/master_sched.cpp sched/sched_base.cpp sched/cache/cache.cpp sched/cache/key.cpp + sched/queue/flow_control.cpp sched/queue/strict_queue.cpp sched/queue/queue.cpp sched/entry/coll/coll_entry.cpp sched/entry/coll/coll_entry_helper.cpp + sched/entry/copy/copy_helper.cpp sched/entry/entry.cpp sched/entry/factory/chunked_entry_factory.cpp - sched/entry/sycl_entry_helper.cpp exec/exec.cpp exec/thread/base_thread.cpp exec/thread/listener.cpp @@ -220,18 +225,21 @@ set(CCL_SRC ${EXTENSIONS_SRC}) list(APPEND CCL_INC_DIRS - ${PROJECT_SOURCE_DIR}/include - ${PROJECT_SOURCE_DIR}/mpi/include - ${LIBFABRIC_INCLUDE_DIR} - ${PROJECT_SOURCE_DIR}/src - ${PROJECT_SOURCE_DIR}/src/atl) - -message(STATUS "CCL_INC_DIRS: ${CCL_INC_DIRS}") -message(STATUS "oneCCL lib LIBFABRIC_LIB_DIR: ${LIBFABRIC_LIB_DIR}") -message(STATUS "oneCCL lib LIBFABRIC_INCLUDE_DIR: ${LIBFABRIC_INCLUDE_DIR}") - -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread") 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") + ${PROJECT_SOURCE_DIR}/include + ${MPI_INCLUDE_DIR} + ${LIBFABRIC_INCLUDE_DIR} + ${HWLOC_INCLUDE_DIR} + ${PROJECT_SOURCE_DIR}/src + ${PROJECT_SOURCE_DIR}/src/atl) + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SRC_C_FLAGS} -pthread") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SRC_CXX_FLAGS} -pthread") +set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${SRC_SHARED_LINKER_FLAGS}") + +message(STATUS "SRC C_FLAGS: ${CMAKE_C_FLAGS}") +message(STATUS "SRC CXX_FLAGS: ${CMAKE_CXX_FLAGS}") +message(STATUS "SRC SHARED_LINKER_FLAGS: ${CMAKE_SHARED_LINKER_FLAGS}") +message(STATUS "SRC INC_DIRS: ${CCL_INC_DIRS}") #special library that holds objects only add_library(ccl-objects OBJECT ${CCL_SRC}) @@ -243,19 +251,27 @@ if(COMPUTE_BACKEND_TARGET_NAME) endif() # add library search directory -link_directories(${PROJECT_SOURCE_DIR}/mpi/lib) +link_directories(${MPI_LIB_DIR}) link_directories(${LIBFABRIC_LIB_DIR}) -#shared library +# shared library add_library(ccl SHARED $<TARGET_OBJECTS:ccl-objects>) target_include_directories(ccl PUBLIC ${CCL_INC_DIRS}) # link with release_mt libmpi.so for oneAPI Base toolkit -# libccl.so -> cpu_icc/cpu_gpu_dpcpp -> lib -> latest -> ccl -> mpi -> ... +# libccl.so -> cpu_icc/cpu_gpu_dpcpp -> lib -> latest -> ccl -> mpi -> ... set(ONEAPI_IMPI_RPATH "'$ORIGIN'/../../../../mpi/latest/lib/release_mt/") set_target_properties(ccl PROPERTIES LINK_FLAGS "-Wl,-rpath,${ONEAPI_IMPI_RPATH}") -target_link_libraries(ccl PUBLIC dl pthread ${EXTERNAL_LIBS} ${COMPUTE_BACKEND_TARGET_NAME} fabric mpi) +target_link_libraries(ccl PUBLIC + dl + pthread + fabric + mpi + ${HWLOC_LIB_DIR}/libhwloc.a + ${EXTERNAL_LIBS} + ${COMPUTE_BACKEND_TARGET_NAME}) + if (NOT LIB_SO_VERSION AND NOT LIB_MAJOR_VERSION) set_target_properties(ccl PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CCL_BUILD_DIR}) else() @@ -264,39 +280,40 @@ endif() install(TARGETS ccl LIBRARY DESTINATION ${CCL_INSTALL_LIB}) install(FILES - "../cmake/FindOpenCL.cmake" - "../cmake/Findlevel_zero.cmake" - "../cmake/FindSYCL.cmake" - "../cmake/FindIntelSYCL.cmake" - "../cmake/FindIntelSYCL_level_zero.cmake" - "../cmake/FindComputeCpp.cmake" + "${PROJECT_SOURCE_DIR}/cmake/FindComputeCpp.cmake" + "${PROJECT_SOURCE_DIR}/cmake/FindIntelSYCL.cmake" + "${PROJECT_SOURCE_DIR}/cmake/FindIntelSYCL_level_zero.cmake" + "${PROJECT_SOURCE_DIR}/cmake/Findlevel_zero.cmake" + "${PROJECT_SOURCE_DIR}/cmake/FindNUMA.cmake" + "${PROJECT_SOURCE_DIR}/cmake/FindOpenCL.cmake" + "${PROJECT_SOURCE_DIR}/cmake/FindSYCL.cmake" DESTINATION ${CCL_INSTALL_LIB}) -#static library +# static library add_library(ccl-static STATIC $<TARGET_OBJECTS:ccl-objects>) set_target_properties(ccl-static PROPERTIES OUTPUT_NAME ccl) set_target_properties(ccl-static PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CCL_BUILD_DIR}) install(TARGETS ccl-static ARCHIVE DESTINATION ${CCL_INSTALL_LIB} OPTIONAL) -#headers installation +# API headers install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/ DESTINATION ${CCL_INSTALL_INCLUDE} FILES_MATCHING REGEX ".*\\.(h|hpp)$") -#mpi & ofi rt -file(GLOB mpi_bins "${PROJECT_SOURCE_DIR}/mpi/bin/*") +# MPI and OFI runtimes +file(GLOB mpi_bins "${DEPS_DIR}/mpi/bin/*") install(PROGRAMS ${mpi_bins} DESTINATION ${CCL_INSTALL_BIN}) -install(DIRECTORY ${PROJECT_SOURCE_DIR}/ofi/lib/ +install(DIRECTORY ${DEPS_DIR}/ofi/lib/ DESTINATION ${CCL_INSTALL_LIB}) -install(DIRECTORY ${PROJECT_SOURCE_DIR}/mpi/include/ +install(DIRECTORY ${DEPS_DIR}/mpi/include/ DESTINATION ${CCL_INSTALL_INCLUDE}) -install(DIRECTORY ${PROJECT_SOURCE_DIR}/mpi/lib/ 
+install(DIRECTORY ${DEPS_DIR}/mpi/lib/ DESTINATION ${CCL_INSTALL_LIB}) -install(DIRECTORY ${PROJECT_SOURCE_DIR}/mpi/etc/ +install(DIRECTORY ${DEPS_DIR}/mpi/etc/ DESTINATION ${CCL_INSTALL_ETC}) -install(DIRECTORY ${PROJECT_SOURCE_DIR}/mpi/licensing/ +install(DIRECTORY ${DEPS_DIR}/mpi/licensing/ DESTINATION ${CCL_INSTALL_LICENSE}/mpi/) diff --git a/src/atl/atl.cpp b/src/atl/atl.cpp deleted file mode 100644 index b43525f1f..000000000 --- a/src/atl/atl.cpp +++ /dev/null @@ -1,230 +0,0 @@ -/* - Copyright 2016-2020 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ -#include <algorithm> -#include <assert.h> -#include <cstring> -#include <dirent.h> -#include <dlfcn.h> - -#include "atl/atl.h" - -#define LIB_SUFFIX ".so" -#define ATL_LIB_PREFIX "libccl_atl_" - -static int initialized = 0; -static int should_reserve_addr = 0; - -static int atl_lib_filter(const struct dirent* entry) { - size_t entry_len = strlen(entry->d_name); - size_t sfx_len = strlen(LIB_SUFFIX); - const char* sfx_ptr; - - if (entry_len > sfx_len) { - sfx_ptr = strstr((entry->d_name), LIB_SUFFIX); - - if (strstr((entry->d_name), ATL_LIB_PREFIX) && sfx_ptr && (strlen(sfx_ptr) == sfx_len)) - return 1; - else - return 0; - } - else - return 0; -} - -static void atl_ini_dir(const char* transport_name, - int* argc, - char*** argv, - atl_attr_t* attr, - atl_ctx_t** ctx, - const char* dir, - const char* main_addr) { - CCL_THROW("unexpected path"); - - int n = 0; - char* lib; - void* dlhandle; - struct dirent** liblist = NULL; - typedef atl_status_t (*init_f)(atl_transport_t*); - init_f init_func; - size_t transport_name_len = strlen(transport_name); - - n = scandir(dir, &liblist, atl_lib_filter, NULL); - if (n < 0) - goto libdl_done; - - while (n--) { - if (asprintf(&lib, "%s/%s", dir, liblist[n]->d_name) < 0) - goto libdl_done; - - LOG_DEBUG("opening lib ", lib); - dlhandle = dlopen(lib, RTLD_NOW); - free(liblist[n]); - if (dlhandle == NULL) { - LOG_ERROR("can't open lib ", lib, ", error ", dlerror()); - free(lib); - continue; - } - - init_func = reinterpret_cast<init_f>(dlsym(dlhandle, "atl_ini")); - if (init_func == NULL) { - dlclose(dlhandle); - free(lib); - } - else { - LOG_DEBUG("lib ", lib, " contains necessary symbol"); - free(lib); - - atl_transport_t transport; - atl_status_t ret; - - if ((init_func)(&transport) != ATL_STATUS_SUCCESS) { - dlclose(dlhandle); - continue; - } - - if (strncmp(transport.name, - transport_name, - std::min(transport_name_len, strlen(transport.name)))) { - dlclose(dlhandle); - continue; - } - - if (should_reserve_addr) { - ret = transport.reserve_addr(const_cast<char*>(main_addr)); - } - else { - ret = transport.init(argc, argv, attr, ctx, main_addr, nullptr /* pmi */); - } - if (ret != ATL_STATUS_SUCCESS) { - dlclose(dlhandle); - continue; - } - - break; - } - } - -libdl_done: - while (n-- > 0) - free(liblist[n]); - free(liblist); -} - -/* - Split the given string "s" using the specified delimiter(s) in the string - "delim" and return an array of strings. 
The array is terminated with a NULL - pointer. Returned array should be freed with ofi_free_string_array(). - - Returns NULL on failure. - */ -static char** atl_split_and_alloc(const char* s, const char* delim, size_t* count) { - int i, n; - char* tmp; - char* dup = NULL; - char** arr = NULL; - - if (!s || !delim) - return NULL; - - dup = strdup(s); - if (!dup) - return NULL; - - /* compute the array size */ - n = 1; - for (tmp = dup; *tmp != '\0'; ++tmp) { - for (i = 0; delim[i] != '\0'; ++i) { - if (*tmp == delim[i]) { - ++n; - break; - } - } - } - - /* +1 to leave space for NULL terminating pointer */ - arr = static_cast<char**>(calloc(n + 1, sizeof(*arr))); - if (!arr) - goto cleanup; - - /* set array elts to point inside the dup'ed string */ - for (tmp = dup, i = 0; tmp != NULL; ++i) - arr[i] = strsep(&tmp, delim); - - assert(i == n); - - if (count) - *count = n; - - return arr; - -cleanup: - free(dup); - return NULL; -} - -/* see atl_split_and_alloc() */ -static void atl_free_string_array(char** s) { - /* all strings are allocated from the same strdup'ed slab, so just free - * the first element */ - if (s != NULL) - free(s[0]); - - /* and then the actual array of pointers */ - free(s); -} - -atl_status_t atl_init(const char* transport_name, - int* argc, - char*** argv, - atl_attr_t* attr, - atl_ctx_t** ctx, - const char* main_addr) { - CCL_THROW("unexpected path"); - - const char* transport_dl_dir = NULL; - int n = 0; - char** dirs; - void* dlhandle; - - if (initialized) - return ATL_STATUS_FAILURE; - - dlhandle = dlopen(NULL, RTLD_NOW); - if (dlhandle == NULL) - goto err_dlopen; - - dlclose(dlhandle); - - dirs = atl_split_and_alloc(transport_dl_dir, ":", NULL); - if (dirs) { - for (n = 0; dirs[n]; ++n) { - atl_ini_dir(transport_name, argc, argv, attr, ctx, dirs[n], main_addr); - } - atl_free_string_array(dirs); - } - - return ATL_STATUS_SUCCESS; - -err_dlopen: - return ATL_STATUS_FAILURE; -} - -void atl_main_addr_reserve(char* main_addr) { - CCL_THROW("unexpected path"); - should_reserve_addr = 1; - atl_init("ofi", NULL, NULL, NULL, NULL, main_addr); - should_reserve_addr = 0; -} diff --git a/src/atl/atl.h b/src/atl/atl.h index 66895cd61..c515fa483 100644 --- a/src/atl/atl.h +++ b/src/atl/atl.h @@ -30,7 +30,7 @@ class iatl { virtual atl_status_t atl_init(int* argc, char*** argv, - atl_attr_t* att, + atl_attr_t* attr, const char* main_addr, std::unique_ptr<ipmi>& pmi) = 0; diff --git a/src/atl/atl_def.h b/src/atl/atl_def.h index 8675046c1..5fd6ab20c 100644 --- a/src/atl/atl_def.h +++ b/src/atl/atl_def.h @@ -48,6 +48,15 @@ #define ATL_OFI_INI ATL_EXT_INI #define ATL_MPI_INI ATL_EXT_INI +#define ATL_CALL(func, err_action) \ + do { \ + atl_status_t status = func; \ + if (status != FI_SUCCESS) { \ + LOG_ERROR(#func "\n fails with status: ", status); \ + err_action; \ + } \ + } while (0) + class ipmi; typedef struct atl_ctx atl_ctx_t; @@ -100,15 +109,29 @@ typedef enum { ATL_REDUCTION_CUSTOM } atl_reduction_t; +typedef enum { ATL_MNIC_NONE, ATL_MNIC_LOCAL, ATL_MNIC_GLOBAL } atl_mnic_t; + typedef struct { - size_t ep_count; - int enable_shm; - size_t tag_bits; - uint64_t max_tag; - int enable_rma; - size_t max_order_waw_size; - int sync_coll; - int extra_ep; + struct { + int enable_shm; + int enable_rma; + int enable_device_buf; + int enable_sync_coll; + int enable_extra_ep; + size_t ep_count; + atl_mnic_t mnic_type; + size_t mnic_count; + } in; + struct { + int enable_shm; + int enable_rma; + int enable_device_buf; + atl_mnic_t mnic_type; + size_t mnic_count; + size_t tag_bits; + uint64_t 
max_tag; + size_t max_order_waw_size; + } out; } atl_attr_t; typedef struct { diff --git a/src/atl/atl_wrapper.cpp b/src/atl/atl_wrapper.cpp index 5ceaea8f0..fc1bed457 100644 --- a/src/atl/atl_wrapper.cpp +++ b/src/atl/atl_wrapper.cpp @@ -28,14 +28,20 @@ static std::list<std::shared_ptr<iatl>> transports{}; static ccl_executor* executor; atl_attr_t atl_wrapper::attr = { - 1, /* ep_count */ - 1, /* enable_shm */ - 64, /* tag_bits */ - 0xFFFFFFFFFFFFFFFF, /* max_tag */ - 0, /* enable_rma */ - 0, /* max_order_waw_size */ - 0, /* sync_coll */ - 0 /* extra_ep */ + /* in */ + { + 0, /* enable_shm */ + 0, /* enable_rma */ + 0, /* enable_device_buf */ + 0, /* enable_sync_coll */ + 0, /* enable_extra_ep */ + 1, /* ep_count */ + ATL_MNIC_NONE, /* mnic_type */ + 1 /* mnic_count */ + }, + + /* out */ + {} }; void atl_wrapper::set_internal_env(const atl_attr_t& attr) { @@ -149,7 +155,7 @@ atl_wrapper::atl_wrapper(int total_rank_count, init_transport(); } void atl_wrapper::init_transport() { - LOG_DEBUG("init ATL, requested ep_count ", attr.ep_count); + LOG_DEBUG("init ATL, requested ep_count ", attr.in.ep_count); static std::mutex memory_mutex; { std::lock_guard<std::mutex> lock(memory_mutex); @@ -160,7 +166,7 @@ void atl_wrapper::init_transport() { } } eps = transport->atl_get_eps(); - tag = std::unique_ptr<ccl_atl_tag>(new ccl_atl_tag(attr.tag_bits, attr.max_tag)); + tag = std::unique_ptr<ccl_atl_tag>(new ccl_atl_tag(attr.out.tag_bits, attr.out.max_tag)); if (pmi) { threads_per_process = pmi->get_threads_per_process(); @@ -177,13 +183,25 @@ void atl_wrapper::init_transport() { if (rank == 0) { tag->print(); - LOG_INFO("atl-parameters:"); - LOG_INFO(" ep_count: ", attr.ep_count); - LOG_INFO(" enable_shm: ", attr.enable_shm); - LOG_INFO(" enable_rma: ", attr.enable_rma); - LOG_INFO(" max_order_waw_size: ", attr.max_order_waw_size); - LOG_INFO(" sync_coll: ", attr.sync_coll); - LOG_INFO(" extra_ep: ", attr.extra_ep); + LOG_INFO("atl-in-attrs:"); + LOG_INFO(" enable_shm: ", attr.in.enable_shm); + LOG_INFO(" enable_rma: ", attr.in.enable_rma); + LOG_INFO(" enable_device_buf: ", attr.in.enable_device_buf); + LOG_INFO(" enable_sync_coll: ", attr.in.enable_sync_coll); + LOG_INFO(" enable_extra_ep: ", attr.in.enable_extra_ep); + LOG_INFO(" ep_count: ", attr.in.ep_count); + LOG_INFO(" mnic_type: ", attr.in.mnic_type); + LOG_INFO(" mnic_count: ", attr.in.mnic_count); + + LOG_INFO("atl-out-attrs:"); + LOG_INFO(" enable_shm: ", attr.out.enable_shm); + LOG_INFO(" enable_rma: ", attr.out.enable_rma); + LOG_INFO(" enable_device_buf: ", attr.out.enable_device_buf); + LOG_INFO(" mnic_type: ", attr.out.mnic_type); + LOG_INFO(" mnic_count: ", attr.out.mnic_count); + LOG_INFO(" tag_bits: ", attr.out.tag_bits); + LOG_INFO(" max_tag: ", attr.out.max_tag); + LOG_INFO(" max_order_waw_size: ", attr.out.max_order_waw_size); } if ((!pmi) || (pmi && pmi->get_local_thread_idx() == 0)) { diff --git a/src/atl/atl_wrapper.h b/src/atl/atl_wrapper.h index aee1ad744..5edad0aa3 100644 --- a/src/atl/atl_wrapper.h +++ b/src/atl/atl_wrapper.h @@ -40,14 +40,6 @@ class atl_wrapper { const std::vector<int>& ranks, std::shared_ptr<ikvs_wrapper> k); - // atl_status_t - // atl_init(int* argc, char*** argv, - // atl_attr_t* att, - // const char* main_addr) - // { - // return transport->atl_init(argc, argv, att, main_addr, pmi); - // } - atl_status_t atl_main_addr_reserve(char* main_addr) { if (!pmi) return ATL_STATUS_UNSUPPORTED; diff --git a/src/atl/mpi/atl_mpi.hpp b/src/atl/mpi/atl_mpi.hpp index ab02a5da8..1a98e3419 100644 --- 
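
The attribute overhaul running through atl_def.h and atl_wrapper turns atl_attr_t into a request/response pair: the caller fills attr.in, and the transport reports what it actually granted in attr.out. Sketched against the defaults shown above:

    atl_attr_t attr;
    memset(&attr, 0, sizeof(attr));
    attr.in.ep_count = 1;
    attr.in.mnic_type = ATL_MNIC_NONE;
    attr.in.mnic_count = 1;

    /* after a successful atl_init(), e.g. with the MPI transport:
         attr.out.tag_bits          == 32
         attr.out.max_tag           == MPI_TAG_UB value (or 0 if unset)
         attr.out.enable_device_buf == attr.in.enable_device_buf
                                       & <library device_buf support> */
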
a/src/atl/mpi/atl_mpi.hpp +++ b/src/atl/mpi/atl_mpi.hpp @@ -24,7 +24,7 @@ class atl_mpi final : public iatl { atl_status_t atl_init(int* argc, char*** argv, - atl_attr_t* att, + atl_attr_t* attr, const char* main_addr, std::unique_ptr<ipmi>& pmi) override; diff --git a/src/atl/mpi/atl_mpi_impl.cpp b/src/atl/mpi/atl_mpi_impl.cpp index 9a312d514..923d538bb 100644 --- a/src/atl/mpi/atl_mpi_impl.cpp +++ b/src/atl/mpi/atl_mpi_impl.cpp @@ -33,16 +33,22 @@ #define ATL_MPI_PM_KEY "atl-mpi" -#define EP_IDX_KEY "ep_idx" -#define NIC_IDX_KEY "pref_nic" -#define NIC_COUNT_KEY "num_nics" -#define CLOSE_NIC_IDX_KEY "pref_close_nic" -#define CLOSE_NIC_COUNT_KEY "num_close_nics" +#define EP_IDX_KEY "ep_idx" + +#define GLOBAL_NIC_IDX_KEY "pref_nic" +#define GLOBAL_NIC_COUNT_KEY "num_nics" +#define LOCAL_NIC_IDX_KEY "pref_close_nic" +#define LOCAL_NIC_COUNT_KEY "num_close_nics" #define RET2ATL(ret) (ret != MPI_SUCCESS) ? ATL_STATUS_FAILURE : ATL_STATUS_SUCCESS typedef enum { ATL_MPI_LIB_IMPI, ATL_MPI_LIB_MPICH, ATL_MPI_LIB_NONE } atl_mpi_lib_type_t; +typedef struct { + atl_mpi_lib_type_t type; + int device_buf; +} atl_mpi_lib_attr_t; + typedef struct { atl_mpi_lib_type_t type; const char* name; @@ -56,6 +62,9 @@ typedef struct { /* minimal expected version of library, mandatory */ int min_version_value; + /* minimal expected version of library with device_buf support, mandatory */ + int min_device_buf_version_value; + /* string prefix before library kind, optional */ const char* kind_prefix; @@ -66,9 +75,16 @@ typedef struct { #define MPI_LIB_INFO_MAX_COUNT 3 static atl_mpi_lib_info_t mpi_lib_infos[MPI_LIB_INFO_MAX_COUNT] = { - { ATL_MPI_LIB_IMPI, "impi", "Intel(R) MPI Library", NULL, 2019, "library kind:", "release_mt" }, - { ATL_MPI_LIB_MPICH, "mpich", "MPICH Custom Information:", "drop", 34, NULL, NULL }, - { ATL_MPI_LIB_NONE, "none", "", NULL, 0, NULL, NULL }, + { ATL_MPI_LIB_IMPI, + "impi", + "Intel(R) MPI Library", + NULL, + 2019, + 2021, + "library kind:", + "release_mt" }, + { ATL_MPI_LIB_MPICH, "mpich", "MPICH Custom Information:", "drop", 34, -1, NULL, NULL }, + { ATL_MPI_LIB_NONE, "none", "", NULL, 0, -1, NULL, NULL }, }; #ifdef CCL_BF16_COMPILER @@ -102,20 +118,22 @@ typedef struct { typedef struct atl_mpi_global_data { int is_external_init; size_t ctx_count; - atl_mpi_lib_type_t mpi_lib_type; int extra_ep; - size_t nic_count; - size_t close_nic_count; + atl_mnic_t mnic_type; + size_t mnic_count; + atl_mpi_lib_attr_t mpi_lib_attr; atl_mpi_bf16_data_t bf16; atl_mpi_fp16_data_t fp16; atl_mpi_global_data() : is_external_init(0), ctx_count(0), - mpi_lib_type(ATL_MPI_LIB_NONE), extra_ep(0), - nic_count(1), - close_nic_count(1) { + mnic_type(ATL_MNIC_NONE), + mnic_count(1) { + mpi_lib_attr.type = ATL_MPI_LIB_NONE; + mpi_lib_attr.device_buf = 0; + bf16.dtype = MPI_DATATYPE_NULL; bf16.sum_op = MPI_OP_NULL; bf16.prod_op = MPI_OP_NULL; @@ -536,8 +554,9 @@ static MPI_Op atl2mpi_op(atl_reduction_t rtype, MPI_Datatype dtype) { } } -atl_mpi_lib_type_t atl_mpi_get_lib_type() { - atl_mpi_lib_type_t lib_type = ATL_MPI_LIB_NONE; +atl_mpi_lib_attr_t atl_mpi_get_lib_attr() { + atl_mpi_lib_attr_t lib_attr = { ATL_MPI_LIB_NONE, 0 }; + char mpi_version[MPI_MAX_LIBRARY_VERSION_STRING] = { 0 }; int mpi_version_len = -1, i; atl_mpi_lib_info_t* final_info = NULL; @@ -548,7 +567,7 @@ atl_mpi_lib_type_t atl_mpi_get_lib_type() { if ((ret != MPI_SUCCESS) || (mpi_version_len < 0) || (mpi_version_len > MPI_MAX_LIBRARY_VERSION_STRING)) { LOG_WARN("can not retrieve MPI version, mpi_version_len ", mpi_version_len, ", ret", ret); - 
return ATL_MPI_LIB_NONE; + return lib_attr; } /* remove trailing spaces at the end for more compact log */ @@ -557,12 +576,25 @@ atl_mpi_lib_type_t atl_mpi_get_lib_type() { LOG_DEBUG("MPI version: ", mpi_version); + /* for filtering */ + char* lib_type_env = getenv("CCL_ATL_MPI"); + for (i = 0; i < MPI_LIB_INFO_MAX_COUNT; i++) { atl_mpi_lib_info_t* info = &(mpi_lib_infos[i]); if (info->type == ATL_MPI_LIB_NONE) continue; + if (lib_type_env) { + if (strcmp(lib_type_env, info->name)) { + LOG_DEBUG("library ", info->name, " is filtered out by user input ", lib_type_env); + continue; + } + else { + LOG_DEBUG("use lib_type = ", lib_type_env, " because it is requested explicitly"); + } + } + CCL_THROW_IF_NOT(info->version_prefix_1, "empty version_prefix_1"); CCL_THROW_IF_NOT(info->min_version_value >= 0, "unexpected minimal version"); @@ -628,7 +660,6 @@ atl_mpi_lib_type_t atl_mpi_get_lib_type() { " (min version) ", (info->kind_value ? info->kind_value : ""), "\n"); - continue; } } else { @@ -649,50 +680,49 @@ atl_mpi_lib_type_t atl_mpi_get_lib_type() { version_value, ") is higher or equal to minimal expected version (", info->min_version_value, - ") " - "and kind matches with expected kind"); - break; - } - } + ")"); - /* user input has higher priority */ - char* lib_type_env = NULL; - if ((lib_type_env = getenv("CCL_ATL_MPI")) != NULL) { - final_info = NULL; - for (i = 0; i < MPI_LIB_INFO_MAX_COUNT; i++) { - atl_mpi_lib_info_t* info = &(mpi_lib_infos[i]); + lib_attr.type = final_info->type; + lib_attr.device_buf = + (final_info->min_device_buf_version_value >= version_value) ? 1 : 0; - if (!strcmp(lib_type_env, info->name)) { - final_info = info; - LOG_DEBUG("set lib_type = ", lib_type_env, " because it is requested explicitly"); - break; - } + break; } } if (final_info) { LOG_DEBUG("MPI library type: ", final_info->name); - lib_type = final_info->type; } else { LOG_DEBUG("MPI library type: none"); - lib_type = ATL_MPI_LIB_NONE; } - return lib_type; + return lib_attr; } size_t atl_mpi_get_ep_count(const atl_attr_t& attr) { - size_t mpi_ep_count = attr.ep_count; - if (attr.extra_ep) - mpi_ep_count += attr.extra_ep; + size_t mpi_ep_count = attr.in.ep_count; + if (attr.in.enable_extra_ep) + mpi_ep_count += attr.in.enable_extra_ep; return mpi_ep_count; } +size_t atl_mpi_get_ep_idx(size_t ep_idx) { + size_t mpi_ep_idx = ep_idx; + if (global_data.extra_ep) + mpi_ep_idx += global_data.extra_ep; + return mpi_ep_idx; +} + /* set these knobs without detection of MPI library type */ atl_status_t atl_mpi_set_base_env(const atl_attr_t& attr) { setenv("PSM2_MULTI_EP", "1", 0); setenv("FI_OFI_RXM_USE_HASH", "0", 0); + +#ifdef CCL_ENABLE_SYCL + setenv("FI_SHM_DISABLE_CMA", "1", 0); +#endif /* CCL_ENABLE_SYCL */ + setenv("MPIR_CVAR_DEFAULT_THREAD_LEVEL", "MPI_THREAD_MULTIPLE", 0); /* request IMPI level append library kind into MPI_Get_library_version output */ @@ -701,24 +731,34 @@ atl_status_t atl_mpi_set_base_env(const atl_attr_t& attr) { return ATL_STATUS_SUCCESS; } -atl_status_t atl_mpi_set_impi_env(const atl_attr_t& attr) { +atl_status_t atl_mpi_set_impi_env(const atl_attr_t& attr, const atl_mpi_lib_attr_t& lib_attr) { char ep_count_str[MPI_MAX_INFO_VAL] = { 0 }; snprintf(ep_count_str, MPI_MAX_INFO_VAL, "%zu", atl_mpi_get_ep_count(attr)); + if (attr.in.ep_count) + setenv("I_MPI_OFI_ISEND_INJECT_THRESHOLD", "0", 0); + +#ifdef CCL_ENABLE_SYCL + setenv("I_MPI_SHM_CMA", "0", 0); + if (attr.in.enable_device_buf && lib_attr.device_buf) { + setenv("I_MPI_OFFLOAD", "2", 0); + setenv("I_MPI_OFFLOAD_TOPOLIB", "l0", 
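
The new getenv("CCL_ATL_MPI") check above gives users an explicit override of library detection: only the mpi_lib_infos entry whose name matches the environment value is considered. In practice (values taken from the table earlier in this file):

    /* accepted values match mpi_lib_infos[].name: "impi" or "mpich" */
    setenv("CCL_ATL_MPI", "impi", 1);
    /* every non-matching entry is then skipped with the debug message
       "library <name> is filtered out by user input <value>" */
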
0); + setenv("I_MPI_OFFLOAD_QUEUE_CACHE", "1", 0); + setenv("I_MPI_OFFLOAD_LIST_CACHE", "1", 0); + if (attr.in.ep_count > 1) { + /* try to set global lock level before vci level + because setenv is invoked with overwrite=0 */ + setenv("I_MPI_THREAD_LOCK_LEVEL", "global", 0); + } + } +#endif /* CCL_ENABLE_SYCL */ + setenv("I_MPI_THREAD_SPLIT", "1", 0); setenv("I_MPI_THREAD_RUNTIME", "generic", 0); setenv("I_MPI_THREAD_MAX", ep_count_str, 0); setenv("I_MPI_THREAD_ID_KEY", EP_IDX_KEY, 0); setenv("I_MPI_THREAD_LOCK_LEVEL", "vci", 0); - if (attr.ep_count) - setenv("I_MPI_OFI_ISEND_INJECT_THRESHOLD", "0", 0); - - auto& env = ccl::global_data::env(); - if (env.log_level >= ccl_log_level::info) { - setenv("I_MPI_DEBUG", "4", 0); - } - return ATL_STATUS_SUCCESS; } @@ -756,18 +796,20 @@ atl_status_t atl_mpi_set_mpich_env(const atl_attr_t& attr) { setenv("MPIR_CVAR_CH4_OFI_MAX_VCIS", ep_count_str, 0); setenv("MPIR_CVAR_CH4_ASYNC_PROGRESS_ID_KEY", EP_IDX_KEY, 0); setenv("MPIR_CVAR_CH4_OFI_ENABLE_SCALABLE_ENDPOINTS", "1", 0); - setenv("MPIR_CVAR_CH4_OFI_ENABLE_NIC_SELECTION", "1", 0); + + if (attr.in.mnic_type != ATL_MNIC_NONE) { + setenv("MPIR_CVAR_CH4_OFI_ENABLE_NIC_SELECTION", "1", 0); + auto& env = ccl::global_data::env(); + if (env.log_level >= ccl_log_level::info) { + setenv("MPIR_CVAR_CH4_OFI_DUMP_NIC_SETTINGS", "1", 0); + } + } setenv("FI_PSM2_DELAY", "0", 0); setenv("FI_PSM2_TIMEOUT", "0", 0); setenv("FI_PSM2_NAME_SERVER", "0", 0); setenv("HFI_NO_CPUAFFINITY", "1", 0); - auto& env = ccl::global_data::env(); - if (env.log_level >= ccl_log_level::info) { - setenv("MPIR_CVAR_CH4_OFI_DUMP_NIC_SETTINGS", "1", 0); - } - return ATL_STATUS_SUCCESS; } @@ -781,12 +823,12 @@ atl_status_t atl_mpi_check_mpich_env(const atl_attr_t& attr) { } atl_status_t atl_mpi_set_env(const atl_attr_t& attr) { - if (global_data.mpi_lib_type != ATL_MPI_LIB_NONE) { + if (global_data.mpi_lib_attr.type != ATL_MPI_LIB_NONE) { /* library type was already detected and env was set, make sanity check */ - if (global_data.mpi_lib_type == ATL_MPI_LIB_IMPI) { + if (global_data.mpi_lib_attr.type == ATL_MPI_LIB_IMPI) { return atl_mpi_check_impi_env(attr); } - else if (global_data.mpi_lib_type == ATL_MPI_LIB_MPICH) { + else if (global_data.mpi_lib_attr.type == ATL_MPI_LIB_MPICH) { return atl_mpi_check_mpich_env(attr); } return ATL_STATUS_SUCCESS; @@ -794,18 +836,17 @@ atl_status_t atl_mpi_set_env(const atl_attr_t& attr) { atl_mpi_set_base_env(attr); - atl_mpi_lib_type_t type = atl_mpi_get_lib_type(); + atl_mpi_lib_attr_t mpi_lib_attr = atl_mpi_get_lib_attr(); - if (type == ATL_MPI_LIB_NONE) { - /* nothing to do */ + if (mpi_lib_attr.type == ATL_MPI_LIB_NONE) { return ATL_STATUS_SUCCESS; } - if (type == ATL_MPI_LIB_IMPI) { - atl_mpi_set_impi_env(attr); + if (mpi_lib_attr.type == ATL_MPI_LIB_IMPI) { + atl_mpi_set_impi_env(attr, mpi_lib_attr); atl_mpi_check_impi_env(attr); } - else if (type == ATL_MPI_LIB_MPICH) { + else if (mpi_lib_attr.type == ATL_MPI_LIB_MPICH) { atl_mpi_set_mpich_env(attr); atl_mpi_check_mpich_env(attr); } @@ -819,7 +860,7 @@ atl_status_t atl_mpi_set_env(const atl_attr_t& attr) { LOG_DEBUG("set CCL-MPI specific environment"); } - global_data.mpi_lib_type = type; + global_data.mpi_lib_attr = mpi_lib_attr; return ATL_STATUS_SUCCESS; } @@ -838,9 +879,6 @@ atl_mpi_comm_info_t atl_mpi_get_comm_info(MPI_Comm comm, const char* key) { } size_t atl_mpi_get_nic_count(const char* nic_count_key) { - if (global_data.mpi_lib_type != ATL_MPI_LIB_MPICH) - return 1; - size_t count = 1; atl_mpi_comm_info_t info = 
atl_mpi_get_comm_info(MPI_COMM_WORLD, nic_count_key); CCL_THROW_IF_NOT(info.found, "MPI comm key ", nic_count_key, " was not set"); @@ -867,7 +905,7 @@ void atl_mpi_check_comm_info(MPI_Comm comm, const char* key, const char* expecte } void atl_mpi_check_comm_ep_idx(MPI_Comm comm, size_t expected_idx) { - if (global_data.mpi_lib_type == ATL_MPI_LIB_NONE) + if (global_data.mpi_lib_attr.type == ATL_MPI_LIB_NONE) return; char expected_idx_str[MPI_MAX_INFO_VAL] = { 0 }; @@ -876,9 +914,6 @@ void atl_mpi_check_comm_ep_idx(MPI_Comm comm, size_t expected_idx) { } void atl_mpi_check_comm_nic_idx(MPI_Comm comm, size_t expected_idx, const char* nic_idx_key) { - if (global_data.mpi_lib_type != ATL_MPI_LIB_MPICH) - return; - char expected_idx_str[MPI_MAX_INFO_VAL] = { 0 }; snprintf(expected_idx_str, MPI_MAX_INFO_VAL, "%zu", expected_idx); atl_mpi_check_comm_info(comm, nic_idx_key, expected_idx_str); @@ -887,7 +922,7 @@ void atl_mpi_check_comm_nic_idx(MPI_Comm comm, size_t expected_idx, const char* #ifdef ENABLE_DEBUG inline void atl_mpi_check_ep(atl_ep_t* ep) { atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep); - atl_mpi_check_comm_ep_idx(mpi_ep->mpi_comm, ep->idx); + atl_mpi_check_comm_ep_idx(mpi_ep->mpi_comm, atl_mpi_get_ep_idx(ep->idx)); } #else #define atl_mpi_check_ep(ep) @@ -1090,6 +1125,7 @@ static atl_status_t atl_mpi_ep_allreduce(atl_ep_t* ep, mpi_req->native_req = MPI_REQUEST_NULL; } else { + //printf("atl_mpi: send_buf %p, recv_buf %p\n", send_buf, recv_buf); ret = MPI_Iallreduce((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf, recv_buf, count, @@ -1441,13 +1477,14 @@ static atl_comp_ops_t atl_mpi_ep_comp_ops = { .wait = atl_mpi_ep_wait, static atl_status_t atl_mpi_ep_init(atl_mpi_ctx_t* mpi_ctx, size_t idx, atl_ep_t** ep) { int ret; - ssize_t mpi_ep_idx = idx; - /* select NIC index from local NICs only */ - size_t nic_idx = (idx % global_data.close_nic_count); + ssize_t mpi_ep_idx = atl_mpi_get_ep_idx(idx); + char mpi_ep_idx_str[MPI_MAX_INFO_VAL] = { 0 }; + size_t nic_idx = 0; char nic_idx_str[MPI_MAX_INFO_VAL] = { 0 }; - char mpi_ep_idx_str[MPI_MAX_INFO_VAL] = { 0 }; + const char* nic_idx_key = + (global_data.mnic_type == ATL_MNIC_GLOBAL) ? 
GLOBAL_NIC_IDX_KEY : LOCAL_NIC_IDX_KEY; atl_mpi_ep_t* mpi_ep = (atl_mpi_ep_t*)calloc(1, sizeof(atl_mpi_ep_t)); if (!mpi_ep) @@ -1460,16 +1497,17 @@ static atl_status_t atl_mpi_ep_init(atl_mpi_ctx_t* mpi_ctx, size_t idx, atl_ep_t MPI_Info info; MPI_Info_create(&info); - /* set NIC index */ - snprintf(nic_idx_str, MPI_MAX_INFO_VAL, "%zu", nic_idx); - MPI_Info_set(info, CLOSE_NIC_IDX_KEY, nic_idx_str); - /* set EP index */ - if (global_data.extra_ep) - mpi_ep_idx += global_data.extra_ep; snprintf(mpi_ep_idx_str, MPI_MAX_INFO_VAL, "%zu", mpi_ep_idx); MPI_Info_set(info, EP_IDX_KEY, mpi_ep_idx_str); + if (global_data.mnic_type != ATL_MNIC_NONE) { + /* set NIC index */ + nic_idx = (idx % global_data.mnic_count); + snprintf(nic_idx_str, MPI_MAX_INFO_VAL, "%zu", nic_idx); + MPI_Info_set(info, nic_idx_key, nic_idx_str); + } + MPI_Comm_set_info(mpi_ep->mpi_comm, info); if (mpi_ctx->progress_mode == ATL_PROGRESS_POLL) { @@ -1479,14 +1517,18 @@ static atl_status_t atl_mpi_ep_init(atl_mpi_ctx_t* mpi_ctx, size_t idx, atl_ep_t MPI_Comm_set_info(mpi_ep->dummy_comm, info); MPI_Irecv(NULL, 0, MPI_CHAR, 0, 0, mpi_ep->dummy_comm, &(mpi_ep->dummy_req.native_req)); - atl_mpi_check_comm_nic_idx(mpi_ep->dummy_comm, nic_idx, CLOSE_NIC_IDX_KEY); atl_mpi_check_comm_ep_idx(mpi_ep->dummy_comm, mpi_ep_idx); + if (global_data.mnic_type != ATL_MNIC_NONE) { + atl_mpi_check_comm_nic_idx(mpi_ep->dummy_comm, nic_idx, nic_idx_key); + } } MPI_Info_free(&info); - atl_mpi_check_comm_nic_idx(mpi_ep->mpi_comm, nic_idx, CLOSE_NIC_IDX_KEY); atl_mpi_check_comm_ep_idx(mpi_ep->mpi_comm, mpi_ep_idx); + if (global_data.mnic_type != ATL_MNIC_NONE) { + atl_mpi_check_comm_nic_idx(mpi_ep->mpi_comm, nic_idx, nic_idx_key); + } LOG_DEBUG("atl-mpi-ep: ", idx, ", ep_idx ", mpi_ep_idx, ", nic_idx ", nic_idx); @@ -1541,7 +1583,7 @@ static atl_status_t atl_mpi_init(int* argc, if (!global_data.is_external_init) { ret = MPI_Init_thread(argc, argv, required_thread_level, &provided_thread_level); if (provided_thread_level < required_thread_level) { - LOG_ERROR("unexpected MPI thread level: requested ", + LOG_ERROR("unexpected MPI thread level: required ", required_thread_level, ", provided ", provided_thread_level); @@ -1552,23 +1594,40 @@ static atl_status_t atl_mpi_init(int* argc, LOG_DEBUG("MPI was initialized externaly"); MPI_Query_thread(&provided_thread_level); if (provided_thread_level < required_thread_level) { - LOG_ERROR("MPI was initialized externaly but with unexpected thread level: " - "requested ", - required_thread_level, - ", provided ", - provided_thread_level); - goto err_init; + LOG_WARN("MPI was initialized externaly but with unexpected thread level: " + "required ", + required_thread_level, + ", provided ", + provided_thread_level); } } if (ret) goto err_init; - if (global_data.mpi_lib_type == ATL_MPI_LIB_NONE) - global_data.mpi_lib_type = atl_mpi_get_lib_type(); - global_data.extra_ep = attr->extra_ep; - global_data.nic_count = atl_mpi_get_nic_count(NIC_COUNT_KEY); - global_data.close_nic_count = atl_mpi_get_nic_count(CLOSE_NIC_COUNT_KEY); + if (global_data.mpi_lib_attr.type == ATL_MPI_LIB_NONE) + global_data.mpi_lib_attr = atl_mpi_get_lib_attr(); + + global_data.extra_ep = attr->in.enable_extra_ep; + + global_data.mnic_type = attr->in.mnic_type; + if (global_data.mpi_lib_attr.type != ATL_MPI_LIB_MPICH) { + /* only MPICH supports multi-NIC */ + global_data.mnic_type = ATL_MNIC_NONE; + } + + if (global_data.mnic_type == ATL_MNIC_LOCAL) { + global_data.mnic_count = atl_mpi_get_nic_count(LOCAL_NIC_COUNT_KEY); + } + else if 
(global_data.mnic_type == ATL_MNIC_GLOBAL) { + global_data.mnic_count = atl_mpi_get_nic_count(GLOBAL_NIC_COUNT_KEY); + } + else if (global_data.mnic_type == ATL_MNIC_NONE) { + global_data.mnic_count = 1; + } + global_data.mnic_count = std::min(global_data.mnic_count, attr->in.mnic_count); + global_data.mnic_count = std::min(global_data.mnic_count, attr->in.ep_count); + global_data.mnic_count = std::max(global_data.mnic_count, (size_t)(1)); if (atl_mpi_bf16_init() == ATL_STATUS_FAILURE) { atl_mpi_bf16_finalize(); @@ -1597,8 +1656,8 @@ static atl_status_t atl_mpi_init(int* argc, ctx->ops = &atl_mpi_ops; ctx->mr_ops = &atl_mpi_mr_ops; - ctx->ep_count = attr->ep_count; - ctx->eps = (atl_ep_t**)calloc(1, sizeof(void*) * attr->ep_count); + ctx->ep_count = attr->in.ep_count; + ctx->eps = (atl_ep_t**)calloc(1, sizeof(void*) * attr->in.ep_count); if (!ctx->eps) goto err_after_init; @@ -1610,23 +1669,25 @@ static atl_status_t atl_mpi_init(int* argc, else { mpi_ctx->progress_mode = ATL_PROGRESS_CHECK; } - mpi_ctx->sync_coll = attr->sync_coll; + mpi_ctx->sync_coll = attr->in.enable_sync_coll; if (coord->global_idx == 0) { if (global_data.ctx_count == 1) { LOG_INFO("atl-mpi-global:") LOG_INFO(" is_external_init: ", global_data.is_external_init); - LOG_INFO(" mpi_lib_type: ", mpi_lib_infos[global_data.mpi_lib_type].name); + LOG_INFO(" mpi_lib_attr.type: ", mpi_lib_infos[global_data.mpi_lib_attr.type].name); + LOG_INFO(" mpi_lib_attr.device_buf: ", global_data.mpi_lib_attr.device_buf); LOG_INFO(" extra_ep: ", global_data.extra_ep); - LOG_INFO(" nic_count: ", global_data.nic_count); - LOG_INFO(" close_nic_count: ", global_data.close_nic_count); + LOG_INFO(" mnic_type: ", global_data.mnic_type); + if (global_data.mnic_type != ATL_MNIC_NONE) + LOG_INFO(" mnic_count: ", global_data.mnic_count); } LOG_INFO("atl-mpi-ctx: ", (global_data.ctx_count - 1)); LOG_INFO(" progress_mode: ", mpi_ctx->progress_mode); LOG_INFO(" sync_coll: ", mpi_ctx->sync_coll); } - for (i = 0; i < attr->ep_count; i++) { + for (i = 0; i < attr->in.ep_count; i++) { ret = atl_mpi_ep_init(mpi_ctx, i, &(ctx->eps[i])); if (ret) goto err_ep_dup; @@ -1636,15 +1697,20 @@ static atl_status_t atl_mpi_init(int* argc, MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_TAG_UB, &tag_ub_ptr, &is_tag_ub_set); - attr->tag_bits = 32; - attr->max_tag = (is_tag_ub_set) ? *((int*)tag_ub_ptr) : 0; - attr->enable_rma = 0; - attr->max_order_waw_size = 0; + /* report actual attributes back to upper level */ + attr->out.enable_shm = 0; + attr->out.enable_rma = 0; + attr->out.enable_device_buf = attr->in.enable_device_buf & global_data.mpi_lib_attr.device_buf; + attr->out.mnic_type = global_data.mnic_type; + attr->out.mnic_count = global_data.mnic_count; + attr->out.tag_bits = 32; + attr->out.max_tag = (is_tag_ub_set) ? *((int*)tag_ub_ptr) : 0; + attr->out.max_order_waw_size = 0; return ATL_STATUS_SUCCESS; err_ep_dup: - for (i = 0; i < attr->ep_count; i++) { + for (i = 0; i < attr->in.ep_count; i++) { atl_mpi_ep_t* mpi_ep = container_of(ctx->eps[i], atl_mpi_ep_t, ep); if (ctx->eps[i] && mpi_ep) { diff --git a/src/atl/ofi/atl_ofi_impl.cpp b/src/atl/ofi/atl_ofi_impl.cpp index 77f8391d5..ea34fef84 100644 --- a/src/atl/ofi/atl_ofi_impl.cpp +++ b/src/atl/ofi/atl_ofi_impl.cpp @@ -14,12 +14,14 @@ limitations under the License.
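The atl_mpi hunks above replace the old fixed close-NIC scheme with a clamped multi-NIC count: the value reported by the MPI library is capped by the requested mnic_count and by ep_count, then floored at 1, and each endpoint later picks its NIC by modulo (idx % global_data.mnic_count). A minimal standalone sketch of that selection logic, with hypothetical names (mnic_cfg, clamp_mnic_count and nic_idx_for_ep are illustrative, not part of the patch):

    #include <algorithm>
    #include <cstddef>

    struct mnic_cfg {
        size_t lib_nic_count; // NIC count reported by the MPI library
        size_t requested_count; // attr->in.mnic_count
        size_t ep_count; // attr->in.ep_count
    };

    // cap by request and endpoint count, never drop below one NIC
    inline size_t clamp_mnic_count(const mnic_cfg& cfg) {
        size_t count = std::min(cfg.lib_nic_count, cfg.requested_count);
        count = std::min(count, cfg.ep_count);
        return std::max(count, (size_t)1);
    }

    // endpoints are spread over NICs round-robin, as in atl_mpi_ep_init
    inline size_t nic_idx_for_ep(size_t ep_idx, size_t mnic_count) {
        return ep_idx % mnic_count;
    }

Capping by ep_count matters because with the round-robin mapping a NIC beyond the endpoint count could never be selected anyway.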
*/ #include <assert.h> +#include <dlfcn.h> #include <inttypes.h> #include <math.h> #include <rdma/fabric.h> #include <rdma/fi_cm.h> #include <rdma/fi_tagged.h> #include <rdma/fi_rma.h> +#include <sstream> #include <stdio.h> #include <stdint.h> #include <stdlib.h> @@ -30,23 +32,30 @@ #include <errno.h> #include "atl.h" +#include "hwloc/hwloc_wrapper.h" + +#define ATL_OFI_BASE_PM_KEY "atl-ofi" +#define ATL_OFI_FI_ADDR_PM_KEY ATL_OFI_BASE_PM_KEY "-fiaddr" +#define ATL_OFI_HOSTNAME_PM_KEY ATL_OFI_BASE_PM_KEY "-hostname" -#define ATL_OFI_BASE_PM_KEY "atl-ofi" -#define ATL_OFI_FI_ADDR_PM_KEY ATL_OFI_BASE_PM_KEY "-fiaddr" -#define ATL_OFI_HOSTNAME_PM_KEY ATL_OFI_BASE_PM_KEY "-hostname" #define ATL_OFI_TIMEOUT_SEC_ENV "ATL_OFI_TIMEOUT_SEC" #define ATL_OFI_MAX_RETRY_COUNT_ENV "ATL_OFI_MAX_RETRY_COUNT" + #define ATL_OFI_DEFAULT_TIMEOUT_SEC 60 #define ATL_OFI_MAX_RETRY_COUNT 10000 #define ATL_OFI_MAX_HOSTNAME_LEN 64 #define ATL_OFI_WAIT_SEC 10 #define ATL_OFI_CQ_READ_ITERS 10000 #define ATL_OFI_CQ_BUNCH_SIZE 8 + #define ATL_OFI_MAX_PROV_ENV_LEN 128 #define ATL_OFI_PMI_PROV_MULTIPLIER 100 #define ATL_OFI_PMI_PROC_MULTIPLIER (ATL_OFI_PMI_PROV_MULTIPLIER * 10) -#define ATL_OFI_MAX_PROV_COUNT 2 /* NW and SHM providers */ -#define ATL_OFI_SHM_PROV_NAME "shm" +#define ATL_OFI_MAX_NW_PROV_COUNT 32 +#define ATL_OFI_MAX_PROV_COUNT (ATL_OFI_MAX_NW_PROV_COUNT + 1) /* NW and SHM providers */ +#define ATL_OFI_MAX_ACTIVE_PROV_COUNT \ + 2 /* by current scheme each EP may use only SHM and 1 NW prov */ +#define ATL_OFI_SHM_PROV_NAME "shm" #ifndef PRId64 #define PRId64 "lld" @@ -150,6 +159,7 @@ } atl_ofi_prov_ep_t; typedef struct { + size_t idx; struct fi_info* info; struct fid_fabric* fabric; struct fid_domain* domain; @@ -167,11 +177,15 @@ fi_addr_t* addr_table; size_t addr_len; int first_proc_idx; - } atl_ofi_prov_t; typedef struct { atl_ep_t ep; + + /* used to make progress only on really used providers */ + size_t active_prov_count; + size_t active_prov_idxs[ATL_OFI_MAX_ACTIVE_PROV_COUNT]; + } atl_ofi_ep_t; typedef struct { @@ -179,10 +193,13 @@ pm_rt_desc_t* pm_rt; atl_ofi_prov_t provs[ATL_OFI_MAX_PROV_COUNT]; size_t prov_count; + size_t nw_prov_count; + size_t nw_prov_first_idx; size_t shm_prov_idx; - size_t nw_prov_idx; size_t max_retry_count; atl_progress_mode_t progress_mode; + atl_mnic_t mnic_type; + size_t mnic_count; } atl_ofi_ctx_t; typedef struct { @@ -196,9 +213,10 @@ typedef struct atl_ofi_global_data { size_t ctx_count; int is_env_inited; + void* dlhandle; char prov_env_copy[ATL_OFI_MAX_PROV_ENV_LEN]; - atl_ofi_global_data() : ctx_count(0), is_env_inited(0) { + atl_ofi_global_data() : ctx_count(0), is_env_inited(0), dlhandle(NULL) { memset(prov_env_copy, 0, sizeof(prov_env_copy)); } } atl_ofi_global_data_t; @@ -217,28 +235,48 @@ static void atl_ofi_print_coord(atl_proc_coord_t* coord) { "]"); } +static std::string atl_ofi_get_nic_name(const struct fi_info* prov) { + std::stringstream ss; + //ss << prov->fabric_attr->prov_name << ":" << prov->fabric_attr->name << ":" << prov->domain_attr->name; + ss << prov->fabric_attr->prov_name << ":" << prov->domain_attr->name; + return ss.str(); +} + static inline atl_ofi_prov_t* atl_ofi_get_prov(atl_ep_t* ep, int peer_proc_idx, size_t msg_size) { size_t prov_idx; atl_ofi_ctx_t* ofi_ctx = container_of(ep->ctx, atl_ofi_ctx_t, ctx); - if (ofi_ctx->prov_count == 1) { - prov_idx = 0; - } - else { - CCL_THROW_IF_NOT(ofi_ctx->prov_count == ATL_OFI_MAX_PROV_COUNT, - "unexpected prov_count ", - 
ofi_ctx->prov_count); + CCL_THROW_IF_NOT(ofi_ctx->prov_count <= ATL_OFI_MAX_PROV_COUNT, + "unexpected prov_count ", + ofi_ctx->prov_count); + + atl_proc_coord_t* coord = &(ep->ctx->coord); + int my_node_idx = coord->global_idx / coord->local_count; + int peer_node_idx = peer_proc_idx / coord->local_count; - atl_proc_coord_t* coord = &(ep->ctx->coord); - int my_node_idx = coord->global_idx / coord->local_count; - int peer_node_idx = peer_proc_idx / coord->local_count; + int has_shm = (ofi_ctx->prov_count == ofi_ctx->nw_prov_count + 1) ? 1 : 0; - if ((my_node_idx == peer_node_idx) && - (msg_size <= ofi_ctx->provs[ofi_ctx->shm_prov_idx].max_msg_size)) - prov_idx = ofi_ctx->shm_prov_idx; - else - prov_idx = ofi_ctx->nw_prov_idx; + if (has_shm && (my_node_idx == peer_node_idx) && + (msg_size <= ofi_ctx->provs[ofi_ctx->shm_prov_idx].max_msg_size)) { + prov_idx = ofi_ctx->shm_prov_idx; } + else { + size_t nw_prov_offset = ep->idx % ofi_ctx->nw_prov_count; + prov_idx = ofi_ctx->nw_prov_first_idx + nw_prov_offset; + } + + LOG_DEBUG("get_prov: ep_idx ", + ep->idx, + ", prov_idx ", + prov_idx, + ", my_node_idx ", + my_node_idx, + ", peer_node_idx ", + peer_node_idx, + ", msg_size ", + msg_size, + ", has_shm ", + has_shm); /* TODO: add segmentation logic */ CCL_THROW_IF_NOT(msg_size <= ofi_ctx->provs[prov_idx].max_msg_size, @@ -377,7 +415,7 @@ static atl_status_t atl_ofi_prov_update_addr_table(atl_ofi_ctx_t* ofi_ctx, return ATL_STATUS_SUCCESS; LOG_DEBUG("name ", - prov->info->fabric_attr->prov_name, + atl_ofi_get_nic_name(prov->info), ", is_shm ", prov->is_shm, ", addr_len ", @@ -791,7 +829,7 @@ static atl_status_t atl_ofi_prov_ep_init(atl_ofi_prov_t* prov, size_t ep_idx) { err: atl_ofi_prov_ep_destroy(prov, ep); - return RET2ATL(ret); + return ATL_STATUS_FAILURE; } static atl_status_t atl_ofi_try_to_drain_cq_err(struct fid_cq* cq) { @@ -903,7 +941,7 @@ static atl_status_t atl_ofi_adjust_env(const atl_attr_t& attr) { memcpy(global_data.prov_env_copy, prov_env, strlen(prov_env)); } - if (attr.enable_shm) { + if (attr.in.enable_shm) { /* add shm provider in the list of allowed providers */ if (prov_env && !strstr(prov_env, ATL_OFI_SHM_PROV_NAME)) { /* whether single provider will be in the final env variable */ @@ -925,7 +963,7 @@ static atl_status_t atl_ofi_adjust_env(const atl_attr_t& attr) { snprintf(prov_env_new, prov_env_new_size, "%s,%s", prov_env, ATL_OFI_SHM_PROV_NAME); } - LOG_INFO("ATL/SHM is requested, modify FI_PROVIDER: old value: ", + LOG_INFO("atl-ofi-shm is requested, modify FI_PROVIDER: old value: ", prov_env, ", new value: ", prov_env_new); @@ -951,6 +989,23 @@ static atl_status_t atl_ofi_set_env(const atl_attr_t& attr) { setenv("HFI_NO_CPUAFFINITY", "1", 0); setenv("PSM2_MULTI_EP", "1", 0); + setenv("FI_PSM3_DELAY", "0", 0); + setenv("FI_PSM3_TIMEOUT", "0", 0); + setenv("FI_PSM3_LOCK_LEVEL", "1", 0); + setenv("FI_PSM3_NAME_SERVER", "0", 0); + setenv("PSM3_NO_CPUAFFINITY", "1", 0); + setenv("PSM3_RDMA", "2", 0); + setenv("PSM3_MR_CACHE_MODE", "0", 0); //TODO temporary + setenv("PSM3_MULTI_EP", "1", 0); + if (attr.in.mnic_type == ATL_MNIC_NONE) + setenv("PSM3_NIC", "any", 0); + + char* hydra_uuid_env = getenv("I_MPI_HYDRA_UUID"); + if (hydra_uuid_env) { + setenv("FI_PSM2_UUID", hydra_uuid_env, 0); + setenv("FI_PSM3_UUID", hydra_uuid_env, 0); + } + setenv("FI_OFI_RXM_USE_HASH", "0", 0); setenv("FI_OFI_RXM_RX_SIZE", "8192", 0); setenv("FI_OFI_RXM_TX_SIZE", "8192", 0); @@ -960,8 +1015,22 @@ static atl_status_t atl_ofi_set_env(const atl_attr_t& attr) { setenv("FI_SHM_TX_SIZE", "8192", 0); 
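Every tuning default in the surrounding setenv block is applied with overwrite disabled (the trailing 0), so a value the user has already exported always takes precedence over the library's choice. A tiny sketch of that semantic, using one real variable from this block:

    #include <cstdio>
    #include <cstdlib>

    int main() {
        // overwrite == 0: set only if the variable is not already present
        setenv("FI_OFI_RXM_RX_SIZE", "8192", 0);
        // overwrite == 1 would clobber a user-provided value instead
        std::printf("FI_OFI_RXM_RX_SIZE = %s\n", std::getenv("FI_OFI_RXM_RX_SIZE"));
        return 0;
    }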
setenv("FI_SHM_RX_SIZE", "8192", 0); +#ifdef CCL_ENABLE_SYCL + setenv("FI_SHM_DISABLE_CMA", "1", 0); +#endif /* CCL_ENABLE_SYCL */ + atl_ofi_adjust_env(attr); + /* + load libfabric symbols into global namespace + to workaround issue with undefined symbols + in case of out-of-tree providers, like OFI/PSM3 + */ + global_data.dlhandle = dlopen("libfabric.so", RTLD_GLOBAL | RTLD_NOW); + if (global_data.dlhandle == NULL) { + CCL_THROW("dlopen (libfabric.so): ", dlerror()); + } + global_data.is_env_inited = 1; return ATL_STATUS_SUCCESS; @@ -989,6 +1058,14 @@ static atl_status_t atl_ofi_finalize(atl_ctx_t* ctx) { } if (global_data.ctx_count == 0) { + if (global_data.dlhandle) { + dlclose(global_data.dlhandle); + } + + if (hwloc_is_initialized()) { + CCL_THROW_IF_NOT(hwloc_finalize() == HWLOC_SUCCESS, "failed to finalize hwloc"); + } + if (ctx->coord.global_idx == 0) { LOG_INFO("finalized last atl-ofi ctx"); } @@ -1420,17 +1497,14 @@ static inline atl_status_t atl_ofi_ep_progress(atl_ep_t* ep, atl_ofi_req_t* req ssize_t ret; size_t idx; struct fi_cq_tagged_entry entries[ATL_OFI_CQ_BUNCH_SIZE]; + atl_ofi_ep_t* ofi_ep = container_of(ep, atl_ofi_ep_t, ep); + atl_ofi_ctx_t* ofi_ctx = container_of(ep->ctx, atl_ofi_ctx_t, ctx); + size_t ep_idx = ep->idx; - atl_ofi_ctx_t* ofi_ctx; - size_t ep_idx; - - ofi_ctx = container_of(ep->ctx, atl_ofi_ctx_t, ctx); - ep_idx = ep->idx; - - /* ensure progress for all providers */ - for (idx = 0; idx < ofi_ctx->prov_count; idx++) { + /* ensure progress for all active providers */ + for (idx = 0; idx < ofi_ep->active_prov_count; idx++) { atl_ofi_prov_ep_t* prov_ep; - prov_ep = &(ofi_ctx->provs[idx].eps[ep_idx]); + prov_ep = &(ofi_ctx->provs[ofi_ep->active_prov_idxs[idx]].eps[ep_idx]); do { ret = fi_cq_read(prov_ep->cq, entries, ATL_OFI_CQ_BUNCH_SIZE); if (ret > 0) @@ -1513,27 +1587,297 @@ static atl_comp_ops_t atl_ofi_ep_comp_ops = { .wait = atl_ofi_ep_wait, .poll = atl_ofi_ep_poll, .check = atl_ofi_ep_check }; +static atl_status_t atl_ofi_get_prov_list(atl_ctx_t* ctx, + const char* prov_name, + struct fi_info* base_hints, + struct fi_info** out_prov_list) { + struct fi_info* hints = NULL; + struct fi_info* prov_list = NULL; + ssize_t ret = 0; + int fi_version = FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION); + const char* prov_name_str = (prov_name) ? prov_name : "<default>"; + + hints = fi_dupinfo(base_hints); + if (!hints) { + LOG_ERROR("fi_dupinfo error"); + goto err; + } + + *out_prov_list = NULL; + + LOG_DEBUG("request providers with name: ", prov_name_str); + + hints->fabric_attr->prov_name = (prov_name) ? 
strdup(prov_name) : NULL; + + ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, hints, &prov_list); + if (ret || !prov_list) { + LOG_ERROR("fi_getinfo error: ret ", ret, ", providers ", (void*)prov_list); + goto err; + } + + if (prov_list->domain_attr->max_ep_tx_ctx > 1) { + hints->ep_attr->tx_ctx_cnt = ctx->ep_count; + hints->ep_attr->rx_ctx_cnt = ctx->ep_count; + } + else { + hints->ep_attr->tx_ctx_cnt = 1; + hints->ep_attr->rx_ctx_cnt = 1; + } + + fi_freeinfo(prov_list); + prov_list = NULL; + + ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, hints, &prov_list); + if (ret || !prov_list) { + LOG_ERROR("fi_getinfo error, prov_name ", prov_name_str); + goto err; + } + + fi_freeinfo(hints); + hints = NULL; + + *out_prov_list = prov_list; + return ATL_STATUS_SUCCESS; + +err: + LOG_ERROR("can't create providers for name ", prov_name_str); + return ATL_STATUS_FAILURE; +} + +static atl_status_t atl_ofi_prov_init(atl_ctx_t* ctx, + struct fi_info* info, + atl_ofi_prov_t* prov, + ipmi* pmi) { + struct fi_av_attr av_attr; + size_t ep_idx = 0; + ssize_t ret = 0; + + memset(&av_attr, 0, sizeof(av_attr)); + + atl_ofi_ctx_t* ofi_ctx = container_of(ctx, atl_ofi_ctx_t, ctx); + + if (ctx->coord.global_idx == 0) { + LOG_INFO("provider: ", info->fabric_attr->prov_name); + LOG_INFO(" nic: ", atl_ofi_get_nic_name(info)); + LOG_INFO(" mr_mode: ", info->domain_attr->mr_mode); + LOG_INFO(" threading: ", info->domain_attr->threading); + LOG_INFO(" tx_ctx_cnt: ", info->domain_attr->tx_ctx_cnt); + LOG_INFO(" max_ep_tx_ctx: ", info->domain_attr->max_ep_tx_ctx); + LOG_INFO(" max_msg_size: ", info->ep_attr->max_msg_size); + } + + prov->info = fi_dupinfo(info); + + if (!prov->info) { + LOG_ERROR("fi_dupinfo error"); + goto err; + } + + prov->max_msg_size = info->ep_attr->max_msg_size; + + ATL_OFI_CALL(fi_fabric(info->fabric_attr, &prov->fabric, NULL), ret, goto err); + + ATL_OFI_CALL(fi_domain(prov->fabric, info, &prov->domain, NULL), ret, goto err); + + av_attr.type = FI_AV_TABLE; + av_attr.rx_ctx_bits = prov->rx_ctx_bits = (int)ceil(log2(prov->info->ep_attr->rx_ctx_cnt)); + + ATL_OFI_CALL(fi_av_open(prov->domain, &av_attr, &prov->av, NULL), ret, goto err); + + if (info->domain_attr->max_ep_tx_ctx > 1) { + ATL_OFI_CALL(fi_scalable_ep(prov->domain, info, &prov->sep, NULL), ret, goto err); + ATL_OFI_CALL(fi_scalable_ep_bind(prov->sep, &prov->av->fid, 0), ret, goto err); + } + + prov->eps = (atl_ofi_prov_ep_t*)calloc(1, sizeof(atl_ofi_prov_ep_t) * ctx->ep_count); + if (!prov->eps) { + LOG_ERROR("can't allocate prov->eps"); + goto err; + } + + for (ep_idx = 0; ep_idx < ctx->ep_count; ep_idx++) { + ret = atl_ofi_prov_ep_init(prov, ep_idx); + if (ret) { + LOG_ERROR("atl_ofi_prov_ep_init error"); + goto err; + } + } + + if (prov->sep) { + fi_enable(prov->sep); + } + + /* TODO: make separate function to be called on CCL comm creation */ + ret = atl_ofi_prov_eps_connect(ofi_ctx, prov->idx, pmi); + if (ret) { + LOG_ERROR("atl_ofi_prov_eps_connect error, prov_idx ", prov->idx); + goto err; + } + + return ATL_STATUS_SUCCESS; + +err: + LOG_ERROR("can't init provider ", atl_ofi_get_nic_name(info)); + return ATL_STATUS_FAILURE; +} + +/* determine if NIC has already been included in others */ +static int atl_ofi_nic_already_used(const struct fi_info* prov, + struct fi_info** others, + size_t nic_count) { + for (size_t i = 0; i < nic_count; i++) { + if (prov->nic->bus_attr->bus_type == FI_BUS_PCI && + others[i]->nic->bus_attr->bus_type == FI_BUS_PCI) { + struct fi_pci_attr pci = prov->nic->bus_attr->attr.pci; + struct fi_pci_attr other_pci 
= others[i]->nic->bus_attr->attr.pci; + LOG_DEBUG("compare nic ", + prov->fabric_attr->prov_name, + " pci ", + (int)pci.domain_id, + ":", + (int)pci.bus_id, + ":", + (int)pci.device_id, + ":", + (int)pci.function_id, + " with nic ", + others[i]->fabric_attr->prov_name, + " pci ", + (int)other_pci.domain_id, + ":", + (int)other_pci.bus_id, + ":", + (int)other_pci.device_id, + ":", + (int)other_pci.function_id); + if (pci.domain_id == other_pci.domain_id && pci.bus_id == other_pci.bus_id && + pci.device_id == other_pci.device_id && pci.function_id == other_pci.function_id) + return 1; + } + else { + LOG_DEBUG("compare nic ", + atl_ofi_get_nic_name(prov), + " with nic ", + atl_ofi_get_nic_name(others[i])); + if (!strcmp(prov->domain_attr->name, others[i]->domain_attr->name)) + return 1; + } + } + return 0; +} + +/* return true if the NIC is bound to the same socket as calling process */ +static int atl_ofi_is_nic_local(struct fi_info* info) { + if (info->nic->bus_attr->bus_type == FI_BUS_PCI) { + struct fi_pci_attr pci = info->nic->bus_attr->attr.pci; + return hwloc_is_dev_close_by_pci(pci.domain_id, pci.bus_id, pci.device_id, pci.function_id); + } + return 0; +} + +static atl_status_t atl_ofi_open_nw_provs(atl_ctx_t* ctx, struct fi_info* base_hints, ipmi* pmi) { + struct fi_info* prov_list = NULL; + size_t idx = 0, prov_idx = 0; + char* prov_name = NULL; + atl_ofi_prov_t* prov = NULL; + + atl_ofi_ctx_t* ofi_ctx = container_of(ctx, atl_ofi_ctx_t, ctx); + + if (strlen(global_data.prov_env_copy) && !strstr(global_data.prov_env_copy, ",")) + prov_name = global_data.prov_env_copy; + else + prov_name = NULL; + + ATL_CALL(atl_ofi_get_prov_list(ctx, prov_name, base_hints, &prov_list), goto err); + + if (ofi_ctx->mnic_type == ATL_MNIC_NONE) { + prov_idx = ofi_ctx->nw_prov_first_idx; + prov = &ofi_ctx->provs[prov_idx]; + prov->idx = prov_idx; + prov->is_shm = 0; + ATL_CALL(atl_ofi_prov_init(ctx, prov_list, prov, pmi), goto err); + ofi_ctx->nw_prov_count++; + } + else { + /* calculate the number of NICs */ + struct fi_info* prov_iter = prov_list; + struct fi_info* filtered_prov_list[ATL_OFI_MAX_NW_PROV_COUNT]; + size_t nic_count = 0; + struct fid_nic* nic = NULL; + + while (prov_iter && (nic_count < ofi_ctx->mnic_count)) { + nic = prov_iter->nic; + if (nic) { + LOG_DEBUG("check nic ", atl_ofi_get_nic_name(prov_iter)); + if (!atl_ofi_nic_already_used(prov_iter, filtered_prov_list, nic_count)) { + int is_local = atl_ofi_is_nic_local(prov_iter); + LOG_DEBUG("nic ", atl_ofi_get_nic_name(prov_iter), ", is_local ", is_local); + + if (ofi_ctx->mnic_type == ATL_MNIC_GLOBAL || + (ofi_ctx->mnic_type == ATL_MNIC_LOCAL && is_local)) { + LOG_INFO("found suitable nic ", atl_ofi_get_nic_name(prov_iter)); + filtered_prov_list[nic_count] = fi_dupinfo(prov_iter); + nic_count++; + } + } + else { + LOG_DEBUG("nic ", atl_ofi_get_nic_name(prov_iter), " already used"); + } + } + prov_iter = prov_iter->next; + } + + if (nic_count == 0) { + LOG_INFO("can not find nic(s) according to mnic_type ", + ofi_ctx->mnic_type, + ", use first available nic ", + atl_ofi_get_nic_name(prov_list)); + ofi_ctx->nw_prov_count = 1; + filtered_prov_list[0] = fi_dupinfo(prov_list); + } + else { + LOG_INFO("found ", nic_count, " nic(s) according to mnic_type ", ofi_ctx->mnic_type); + ofi_ctx->nw_prov_count = nic_count; + } + + for (idx = 0; idx < ofi_ctx->nw_prov_count; idx++) { + prov_idx = ofi_ctx->nw_prov_first_idx + idx; + prov = &ofi_ctx->provs[prov_idx]; + prov->idx = prov_idx; + prov->is_shm = 0; + ATL_CALL(atl_ofi_prov_init(ctx, 
filtered_prov_list[idx], prov, pmi), goto err); + } + + for (idx = 0; idx < ofi_ctx->nw_prov_count; idx++) { + fi_freeinfo(filtered_prov_list[idx]); + } + } + ofi_ctx->prov_count += ofi_ctx->nw_prov_count; + + fi_freeinfo(prov_list); + + return ATL_STATUS_SUCCESS; + +err: + LOG_ERROR("can not open network providers"); + return ATL_STATUS_FAILURE; +} + static atl_status_t atl_ofi_init(int* argc, char*** argv, atl_attr_t* attr, atl_ctx_t** out_ctx, const char* main_addr, ipmi* pmi) { - struct fi_info *providers, *base_hints, *prov_hints; - struct fi_av_attr av_attr; + struct fi_info *prov_list = NULL, *base_hints = NULL, *prov_hints = NULL; int fi_version; - ssize_t ret; - size_t idx, ep_idx; - - providers = NULL; - base_hints = NULL; - prov_hints = NULL; - - memset(&av_attr, 0, sizeof(av_attr)); - - ret = 0; - idx = 0; - ep_idx = 0; + ssize_t ret = 0; + size_t idx = 0, ep_idx = 0, prov_idx = 0; + char* prov_name = NULL; + atl_ofi_prov_t* prov = NULL; + char *max_retry_count_env = NULL, *progress_mode_env = NULL; + int open_nw_provs = 1; + int enable_shm = 0; CCL_THROW_IF_NOT((sizeof(atl_ofi_req_t) <= sizeof(atl_req_t) - offsetof(atl_req_t, internal)), "unexpected offset: atl_ofi_request size ", @@ -1561,8 +1905,8 @@ static atl_status_t atl_ofi_init(int* argc, ctx->ops = &atl_ofi_ops; ctx->mr_ops = &atl_ofi_mr_ops; - ctx->ep_count = attr->ep_count; - ctx->eps = (atl_ep**)calloc(1, sizeof(void*) * attr->ep_count); + ctx->ep_count = attr->in.ep_count; + ctx->eps = (atl_ep**)calloc(1, sizeof(void*) * attr->in.ep_count); if (!ctx->eps) goto err; @@ -1609,233 +1953,78 @@ static atl_status_t atl_ofi_init(int* argc, goto err; } - if (!attr->enable_shm) { + if (!attr->in.enable_shm) { LOG_ERROR( "shm provider is requested through FI_PROVIDER but not requested from CCL level"); goto err; } } + atl_ofi_print_coord(coord); - if (attr->enable_shm) { + enable_shm = attr->in.enable_shm; + if (enable_shm) { prov_hints = fi_dupinfo(base_hints); prov_hints->fabric_attr->prov_name = strdup(ATL_OFI_SHM_PROV_NAME); - ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, prov_hints, &providers); - if (ret || !providers) { - attr->enable_shm = 0; + ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, prov_hints, &prov_list); + if (ret || !prov_list) { + enable_shm = 0; LOG_INFO("shm provider is requested but not available"); } else { LOG_INFO("shm provider is requested and available"); } - fi_freeinfo(providers); - providers = NULL; + fi_freeinfo(prov_list); + prov_list = NULL; fi_freeinfo(prov_hints); prov_hints = NULL; - - if (attr->enable_shm) { - /* TODO: tmp code to detect CMA, remove when OFI/shm will have runtime detection */ - int scope = 0, fret; - FILE* file; - file = fopen("/proc/sys/kernel/yama/ptrace_scope", "r"); - if (file) { - fret = fscanf(file, "%d", &scope); - if (fret != 1) { - LOG_ERROR("error getting value from ptrace_scope"); - scope = 1; - } - fret = fclose(file); - if (fret) { - LOG_ERROR("error closing ptrace_scope file"); - scope = 1; - } - } - - if (!file && (errno != ENOENT)) { - LOG_ERROR("can't open ptrace_scope file, disable shm provider"); - attr->enable_shm = 0; - } - else if (scope) { - LOG_ERROR("ptrace_scope > 0, disable shm provider"); - attr->enable_shm = 0; - } - } } - attr->tag_bits = 64; - attr->max_tag = 0xFFFFFFFFFFFFFFFF; - - if (coord->global_count == coord->local_count) { - ofi_ctx->prov_count = 1; - ofi_ctx->provs[0].is_shm = (attr->enable_shm) ? 
1 : 0; - } - else { - if (attr->enable_shm) { - ofi_ctx->prov_count = 2; - ofi_ctx->shm_prov_idx = 0; - ofi_ctx->nw_prov_idx = 1; - ofi_ctx->provs[ofi_ctx->shm_prov_idx].is_shm = 1; - ofi_ctx->provs[ofi_ctx->nw_prov_idx].is_shm = 0; - } - else { - ofi_ctx->prov_count = 1; - ofi_ctx->provs[0].is_shm = 0; + ofi_ctx->prov_count = 0; + ofi_ctx->nw_prov_count = 0; + ofi_ctx->shm_prov_idx = 0; + ofi_ctx->nw_prov_first_idx = (enable_shm) ? 1 : 0; + ofi_ctx->mnic_type = attr->in.mnic_type; + ofi_ctx->mnic_count = std::min(attr->in.mnic_count, (size_t)(ATL_OFI_MAX_NW_PROV_COUNT)); + ofi_ctx->mnic_count = std::min(ofi_ctx->mnic_count, attr->in.ep_count); + ofi_ctx->mnic_count = std::max(ofi_ctx->mnic_count, (size_t)(1)); + + if ((ofi_ctx->mnic_type != ATL_MNIC_NONE) && !hwloc_is_initialized()) { + hwloc_status_t hwloc_status = hwloc_init(); + if (hwloc_status != HWLOC_SUCCESS) { + ofi_ctx->mnic_type = ATL_MNIC_NONE; + ofi_ctx->mnic_count = 1; + LOG_WARN("can't init hwloc, disable multi-nic") } } - if (attr->enable_rma && (ofi_ctx->prov_count > 1)) { - LOG_INFO("RMA and multiple providers requested both, disable RMA"); - attr->enable_rma = 0; + /* open SHM provider */ + if (enable_shm) { + prov_idx = ofi_ctx->shm_prov_idx; + prov_name = strdup(ATL_OFI_SHM_PROV_NAME); + prov = &ofi_ctx->provs[prov_idx]; + prov->idx = prov_idx; + prov->is_shm = 1; + ATL_CALL(atl_ofi_get_prov_list(ctx, prov_name, base_hints, &prov_list), goto err); + ATL_CALL(atl_ofi_prov_init(ctx, prov_list, prov, pmi), goto err); + free(prov_name); + fi_freeinfo(prov_list); + ofi_ctx->prov_count++; } - if (coord->global_idx == 0) - LOG_INFO("prov_count ", ofi_ctx->prov_count); - - for (idx = 0; idx < ofi_ctx->prov_count; idx++) { - atl_ofi_prov_t* prov; - prov = &ofi_ctx->provs[idx]; - - prov_hints = fi_dupinfo(base_hints); - - char* prov_name = NULL; - - if (prov->is_shm) - prov_name = strdup(ATL_OFI_SHM_PROV_NAME); - else { - if (strlen(global_data.prov_env_copy) && !strstr(global_data.prov_env_copy, ",")) - prov_name = strdup(global_data.prov_env_copy); - else - prov_name = NULL; - } - - LOG_DEBUG("request provider: idx ", - idx, - ", name ", - (prov_name) ? 
prov_name : "<default>", - ", is_shm ", - prov->is_shm); - - prov_hints->fabric_attr->prov_name = prov_name; - - ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, prov_hints, &providers); - - if (ret || !providers) { - LOG_ERROR( - "fi_getinfo erro: ret ", ret, ", providers ", (void*)providers, ", prov_idx ", idx); - goto err; - } - - if (providers->domain_attr->max_ep_tx_ctx > 1) { - prov_hints->ep_attr->tx_ctx_cnt = attr->ep_count; - prov_hints->ep_attr->rx_ctx_cnt = attr->ep_count; - } - else { - prov_hints->ep_attr->tx_ctx_cnt = 1; - prov_hints->ep_attr->rx_ctx_cnt = 1; - } - - fi_freeinfo(providers); - providers = NULL; - - if (attr->enable_rma) { - LOG_INFO("try to enable RMA"); - prov_hints->caps |= FI_RMA | FI_READ | FI_WRITE | FI_REMOTE_READ | FI_REMOTE_WRITE; - prov_hints->domain_attr->mr_mode = FI_MR_UNSPEC; - // TODO: - //hints->domain_attr->mr_mode = FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR | FI_MR_LOCAL | FI_MR_BASIC; - } - - ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, prov_hints, &providers); - if (ret || !providers) { - if (attr->enable_rma) { - attr->enable_rma = 0; - LOG_INFO("try without RMA"); - prov_hints->caps = FI_TAGGED; - prov_hints->domain_attr->mr_mode = FI_MR_UNSPEC; - ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, prov_hints, &providers); - if (ret || !providers) { - LOG_ERROR("fi_getinfo error (rma fallback), prov_idx ", idx); - goto err; - } - } - else { - LOG_ERROR("fi_getinfo error (main path), prov_idx ", idx); - goto err; - } - } - - /* use first provider from the list of providers */ - prov->info = fi_dupinfo(providers); - struct fi_info* prov_info; - prov_info = prov->info; - if (!prov_info) { - LOG_ERROR("fi_dupinfo error"); - goto err; - } - - fi_freeinfo(providers); - providers = NULL; - - attr->max_order_waw_size = - (idx == 0) ? 
prov_info->ep_attr->max_order_waw_size - : MIN(attr->max_order_waw_size, prov_info->ep_attr->max_order_waw_size); - - prov->max_msg_size = prov_info->ep_attr->max_msg_size; - - if (coord->global_idx == 0) { - LOG_INFO("provider: ", prov_info->fabric_attr->prov_name); - LOG_INFO(" mr_mode: ", prov_info->domain_attr->mr_mode); - LOG_INFO(" threading: ", prov_info->domain_attr->threading); - LOG_INFO(" tx_ctx_cnt: ", prov_info->domain_attr->tx_ctx_cnt); - LOG_INFO(" max_ep_tx_ctx: ", prov_info->domain_attr->max_ep_tx_ctx); - LOG_INFO(" max_msg_size: ", prov_info->ep_attr->max_msg_size); - } - - ATL_OFI_CALL(fi_fabric(prov_info->fabric_attr, &prov->fabric, NULL), ret, goto err); - - ATL_OFI_CALL(fi_domain(prov->fabric, prov_info, &prov->domain, NULL), ret, goto err); - - av_attr.type = FI_AV_TABLE; - av_attr.rx_ctx_bits = prov->rx_ctx_bits = (int)ceil(log2(prov_hints->ep_attr->rx_ctx_cnt)); - - ATL_OFI_CALL(fi_av_open(prov->domain, &av_attr, &prov->av, NULL), ret, goto err); - - if (prov_info->domain_attr->max_ep_tx_ctx > 1) { - ATL_OFI_CALL(fi_scalable_ep(prov->domain, prov_info, &prov->sep, NULL), ret, goto err); - ATL_OFI_CALL(fi_scalable_ep_bind(prov->sep, &prov->av->fid, 0), ret, goto err); - } - - prov->eps = (atl_ofi_prov_ep_t*)calloc(1, sizeof(atl_ofi_prov_ep_t) * attr->ep_count); - if (!prov->eps) { - LOG_ERROR("can't allocate prov->eps"); - goto err; - } - - for (ep_idx = 0; ep_idx < attr->ep_count; ep_idx++) { - ret = atl_ofi_prov_ep_init(prov, ep_idx); - if (ret) { - LOG_ERROR("atl_ofi_prov_ep_init error"); - goto err; - } - } - - if (prov->sep) { - fi_enable(prov->sep); - } - - ret = atl_ofi_prov_eps_connect(ofi_ctx, idx, pmi); - if (ret) { - LOG_ERROR("atl_ofi_prov_eps_connect error, prov_idx ", idx); - goto err; - } + /* open NW provider(s) */ + if (prov_env && !strcmp(prov_env, ATL_OFI_SHM_PROV_NAME) && enable_shm) { + open_nw_provs = 0; + } - fi_freeinfo(prov_hints); - prov_hints = NULL; - } /* prov loop */ + if (open_nw_provs) { + ATL_CALL(atl_ofi_open_nw_provs(ctx, base_hints, pmi), goto err); + ofi_ctx->mnic_count = ofi_ctx->nw_prov_count; + } - for (ep_idx = 0; ep_idx < attr->ep_count; ep_idx++) { + for (ep_idx = 0; ep_idx < ctx->ep_count; ep_idx++) { atl_ofi_ep_t* ofi_ep; ofi_ep = (atl_ofi_ep_t*)calloc(1, sizeof(atl_ofi_ep_t)); if (!ofi_ep) { @@ -1852,12 +2041,31 @@ static atl_status_t atl_ofi_init(int* argc, ep->rma_ops = &atl_ofi_ep_rma_ops; ep->comp_ops = &atl_ofi_ep_comp_ops; + ofi_ep->active_prov_count = 0; + if (enable_shm) { + ofi_ep->active_prov_idxs[ofi_ep->active_prov_count] = ofi_ctx->shm_prov_idx; + ofi_ep->active_prov_count++; + } + if (open_nw_provs) { + ofi_ep->active_prov_idxs[ofi_ep->active_prov_count] = + ofi_ctx->nw_prov_first_idx + ep_idx % ofi_ctx->nw_prov_count; + ofi_ep->active_prov_count++; + } + CCL_THROW_IF_NOT(ofi_ep->active_prov_count, "no active providers for ep_idx ", ep_idx); + + if (coord->global_idx == 0) { + std::stringstream ss; + for (idx = 0; idx < ofi_ep->active_prov_count; idx++) { + ss << ofi_ep->active_prov_idxs[idx] << " "; + } + LOG_INFO("ep_idx: ", ep_idx, ", active_prov_idxs: ", ss.str()); + } + ctx->eps[ep_idx] = ep; } pmi->pmrt_barrier(); - char* max_retry_count_env; max_retry_count_env = getenv(ATL_OFI_MAX_RETRY_COUNT_ENV); if (max_retry_count_env) { ofi_ctx->max_retry_count = safe_c_strtol(max_retry_count_env, NULL, 10); @@ -1873,7 +2081,6 @@ static atl_status_t atl_ofi_init(int* argc, ofi_ctx->progress_mode = ATL_PROGRESS_POLL; } - char* progress_mode_env; progress_mode_env = getenv(ATL_PROGRESS_MODE_ENV); if 
(progress_mode_env) { ofi_ctx->progress_mode = static_cast<atl_progress_mode_t>(atoi(progress_mode_env)); @@ -1882,6 +2089,12 @@ static atl_status_t atl_ofi_init(int* argc, if (coord->global_idx == 0) { LOG_INFO("atl-ofi-ctx:"); LOG_INFO(" new ctx_count: ", global_data.ctx_count); + LOG_INFO(" prov_count: ", ofi_ctx->prov_count); + LOG_INFO(" nw_prov_count: ", ofi_ctx->nw_prov_count); + LOG_INFO(" nw_prov_first_idx: ", ofi_ctx->nw_prov_first_idx); + LOG_INFO(" mnic_type: ", ofi_ctx->mnic_type); + if (ofi_ctx->mnic_type != ATL_MNIC_NONE) + LOG_INFO(" mnic_count: ", ofi_ctx->mnic_count); LOG_INFO(" max_retry_count: ", ofi_ctx->max_retry_count); LOG_INFO(" progress_mode: ", ofi_ctx->progress_mode); } @@ -1891,13 +2104,23 @@ static atl_status_t atl_ofi_init(int* argc, fi_freeinfo(base_hints); base_hints = NULL; + /* report actual attributes back to upper level */ + attr->out.enable_shm = enable_shm; + attr->out.enable_rma = 0; + attr->out.enable_device_buf = 0; + attr->out.mnic_type = ofi_ctx->mnic_type; + attr->out.mnic_count = ofi_ctx->mnic_count; + attr->out.tag_bits = 64; + attr->out.max_tag = 0xFFFFFFFFFFFFFFFF; + attr->out.max_order_waw_size = 0; + return ATL_STATUS_SUCCESS; err: LOG_ERROR("can't find suitable provider"); - if (providers) { - fi_freeinfo(providers); + if (prov_list) { + fi_freeinfo(prov_list); } if (base_hints) { @@ -1911,7 +2134,7 @@ static atl_status_t atl_ofi_init(int* argc, if (ctx != NULL) atl_ofi_finalize(ctx); - return RET2ATL(ret); + return ATL_STATUS_FAILURE; } atl_status_t atl_ofi_main_addr_reserve(char* main_addr) { diff --git a/src/atl/util/pm/pmi_rt/pmi/CMakeLists.txt b/src/atl/util/pm/pmi_rt/pmi/CMakeLists.txt old mode 100755 new mode 100644 diff --git a/src/ccl_api_functions.cpp b/src/ccl_api_functions.cpp index a8979efa9..280a720b0 100644 --- a/src/ccl_api_functions.cpp +++ b/src/ccl_api_functions.cpp @@ -53,12 +53,6 @@ void register_gpu_module(std::string kernels_path) { LOG_INFO("SPIRV kernels directory: ", kernels_path); - /* - * TODO: - * Important: Fix kernels data types generations, then uncoment - * the registration module. 
- */ - load_gpu_module( kernels_path + "ring_allgatherv.spv", ccl::device_topology_type::ring, ccl_coll_allgatherv); load_gpu_module( @@ -214,7 +208,6 @@ event allgatherv(const void* send_buf, const stream& op_stream, const allgatherv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->allgatherv( send_buf, send_count, recv_buf, recv_counts, dtype, disp(op_stream), attr, deps); @@ -228,7 +221,6 @@ event allgatherv(const void* send_buf, const communicator& comm, const allgatherv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->allgatherv( send_buf, send_count, recv_buf, recv_counts, dtype, disp(default_stream), attr, deps); @@ -243,7 +235,6 @@ event allgatherv(const void* send_buf, const stream& op_stream, const allgatherv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->allgatherv( send_buf, send_count, recv_bufs, recv_counts, dtype, disp(op_stream), attr, deps); @@ -257,7 +248,6 @@ event allgatherv(const void* send_buf, const communicator& comm, const allgatherv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->allgatherv( send_buf, send_count, recv_bufs, recv_counts, dtype, disp(default_stream), attr, deps); @@ -272,7 +262,6 @@ event allgatherv(const BufferType* send_buf, const stream& op_stream, const allgatherv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->allgatherv( send_buf, send_count, recv_buf, recv_counts, disp(op_stream), attr, deps); @@ -286,7 +275,6 @@ event allgatherv(const BufferType* send_buf, const communicator& comm, const allgatherv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->allgatherv( send_buf, send_count, recv_buf, recv_counts, disp(default_stream), attr, deps); @@ -301,7 +289,6 @@ event allgatherv(const BufferType* send_buf, const stream& op_stream, const allgatherv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->allgatherv( send_buf, send_count, recv_bufs, recv_counts, disp(op_stream), attr, deps); @@ -315,7 +302,6 @@ event allgatherv(const BufferType* send_buf, const communicator& comm, const allgatherv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->allgatherv( send_buf, send_count, recv_bufs, recv_counts, disp(default_stream), attr, deps); @@ -330,7 +316,6 @@ event allgatherv(const BufferObjectType& send_buf, const stream& op_stream, const allgatherv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->allgatherv( send_buf, send_count, recv_buf, recv_counts, disp(op_stream), attr, deps); @@ -344,7 +329,6 @@ event allgatherv(const BufferObjectType& send_buf, const communicator& comm, const allgatherv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->allgatherv( send_buf, send_count, recv_buf, recv_counts, disp(default_stream), attr, deps); @@ -359,7 +343,6 @@ event allgatherv(const BufferObjectType& send_buf, const stream& op_stream, const allgatherv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->allgatherv( send_buf, send_count, recv_bufs, recv_counts, disp(op_stream), attr, deps); @@ -373,7 +356,6 @@ event allgatherv(const BufferObjectType& 
send_buf, const communicator& comm, const allgatherv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->allgatherv( send_buf, send_count, recv_bufs, recv_counts, disp(default_stream), attr, deps); @@ -389,7 +371,6 @@ event allreduce(const void* send_buf, const stream& op_stream, const allreduce_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->allreduce( send_buf, recv_buf, count, dtype, reduction, disp(op_stream), attr, deps); @@ -403,7 +384,6 @@ event allreduce(const void* send_buf, const communicator& comm, const allreduce_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->allreduce( send_buf, recv_buf, count, dtype, reduction, disp(default_stream), attr, deps); @@ -418,7 +398,6 @@ event allreduce(const BufferType* send_buf, const stream& op_stream, const allreduce_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->allreduce(send_buf, recv_buf, count, reduction, disp(op_stream), attr, deps); } @@ -431,7 +410,6 @@ event allreduce(const BufferType* send_buf, const communicator& comm, const allreduce_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->allreduce( send_buf, recv_buf, count, reduction, disp(default_stream), attr, deps); @@ -446,7 +424,6 @@ event allreduce(const BufferObjectType& send_buf, const stream& op_stream, const allreduce_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->allreduce(send_buf, recv_buf, count, reduction, disp(op_stream), attr, deps); } @@ -459,7 +436,6 @@ event allreduce(const BufferObjectType& send_buf, const communicator& comm, const allreduce_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->allreduce( send_buf, recv_buf, count, reduction, disp(default_stream), attr, deps); @@ -474,7 +450,6 @@ event alltoall(const void* send_buf, const stream& op_stream, const alltoall_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoall(send_buf, recv_buf, count, dtype, disp(op_stream), attr, deps); } @@ -486,7 +461,6 @@ event alltoall(const void* send_buf, const communicator& comm, const alltoall_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoall(send_buf, recv_buf, count, dtype, disp(default_stream), attr, deps); } @@ -499,7 +473,6 @@ event alltoall(const vector_class<void*>& send_buf, const stream& op_stream, const alltoall_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoall(send_buf, recv_buf, count, dtype, disp(op_stream), attr, deps); } @@ -512,7 +485,6 @@ event alltoall(const BufferType* send_buf, const stream& op_stream, const alltoall_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoall(send_buf, recv_buf, count, disp(op_stream), attr, deps); } @@ -524,7 +496,6 @@ event alltoall(const BufferType* send_buf, const communicator& comm, const alltoall_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoall(send_buf, recv_buf, count, disp(default_stream), attr, deps); } @@ -537,7 +508,6 @@ event alltoall(const vector_class<BufferType*>& send_buf, const stream& op_stream, const 
alltoall_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoall(send_buf, recv_buf, count, disp(op_stream), attr, deps); } @@ -549,7 +519,6 @@ event alltoall(const vector_class<BufferType*>& send_buf, const communicator& comm, const alltoall_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoall(send_buf, recv_buf, count, disp(default_stream), attr, deps); } @@ -562,7 +531,6 @@ event alltoall(const BufferObjectType& send_buf, const stream& op_stream, const alltoall_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoall(send_buf, recv_buf, count, disp(op_stream), attr, deps); } @@ -574,7 +542,6 @@ event alltoall(const BufferObjectType& send_buf, const communicator& comm, const alltoall_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoall(send_buf, recv_buf, count, disp(default_stream), attr, deps); } @@ -587,7 +554,6 @@ event alltoall(const vector_class<reference_wrapper_class<BufferObjectType>>& se const stream& op_stream, const alltoall_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoall(send_buf, recv_buf, count, disp(op_stream), attr, deps); } @@ -599,7 +565,6 @@ event alltoall(const vector_class<reference_wrapper_class<BufferObjectType>>& se const communicator& comm, const alltoall_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoall(send_buf, recv_buf, count, disp(default_stream), attr, deps); } @@ -614,7 +579,6 @@ event alltoallv(const void* send_buf, const stream& op_stream, const alltoallv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoallv( send_buf, send_counts, recv_buf, recv_counts, dtype, disp(op_stream), attr, deps); @@ -628,7 +592,6 @@ event alltoallv(const void* send_buf, const communicator& comm, const alltoallv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoallv( send_buf, send_counts, recv_buf, recv_counts, dtype, disp(default_stream), attr, deps); @@ -643,7 +606,6 @@ event alltoallv(const vector_class<void*>& send_bufs, const stream& op_stream, const alltoallv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoallv( send_bufs, send_counts, recv_bufs, recv_counts, dtype, disp(op_stream), attr, deps); @@ -657,7 +619,6 @@ event alltoallv(const vector_class<void*>& send_bufs, const communicator& comm, const alltoallv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoallv( send_bufs, send_counts, recv_bufs, recv_counts, dtype, disp(default_stream), attr, deps); @@ -672,7 +633,6 @@ event alltoallv(const BufferType* send_buf, const stream& op_stream, const alltoallv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoallv( send_buf, send_counts, recv_buf, recv_counts, disp(op_stream), attr, deps); @@ -686,7 +646,6 @@ event alltoallv(const BufferType* send_buf, const communicator& comm, const alltoallv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoallv( send_buf, send_counts, recv_buf, recv_counts, disp(default_stream), attr, deps); @@ -701,7 +660,6 @@ 
event alltoallv(const vector_class<BufferType*>& send_bufs, const stream& op_stream, const alltoallv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoallv( send_bufs, send_counts, recv_bufs, recv_counts, disp(op_stream), attr, deps); @@ -715,7 +673,6 @@ event alltoallv(const vector_class<BufferType*>& send_bufs, const communicator& comm, const alltoallv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoallv( send_bufs, send_counts, recv_bufs, recv_counts, disp(default_stream), attr, deps); @@ -730,7 +687,6 @@ event alltoallv(const BufferObjectType& send_buf, const stream& op_stream, const alltoallv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoallv( send_buf, send_counts, recv_buf, recv_counts, disp(op_stream), attr, deps); @@ -744,7 +700,6 @@ event alltoallv(const BufferObjectType& send_buf, const communicator& comm, const alltoallv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoallv( send_buf, send_counts, recv_buf, recv_counts, disp(default_stream), attr, deps); @@ -759,7 +714,6 @@ event alltoallv(const vector_class<reference_wrapper_class<BufferObjectType>>& s const stream& op_stream, const alltoallv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoallv( send_bufs, send_counts, recv_bufs, recv_counts, disp(op_stream), attr, deps); @@ -773,7 +727,6 @@ event alltoallv(const vector_class<reference_wrapper_class<BufferObjectType>>& s const communicator& comm, const alltoallv_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->alltoallv( send_bufs, send_counts, recv_bufs, recv_counts, disp(default_stream), attr, deps); @@ -784,13 +737,11 @@ event barrier(const communicator& comm, const stream& op_stream, const barrier_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->barrier(disp(op_stream), attr, deps); } event barrier(const communicator& comm, const barrier_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->barrier(disp(default_stream), attr, deps); } @@ -804,7 +755,6 @@ event broadcast(void* buf, const stream& op_stream, const broadcast_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->bcast(buf, count, dtype, root, disp(op_stream), attr, deps); } @@ -816,7 +766,6 @@ event broadcast(void* buf, const communicator& comm, const broadcast_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->bcast(buf, count, dtype, root, disp(default_stream), attr, deps); } @@ -831,7 +780,6 @@ event broadcast(BufferType* buf, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->bcast(buf, count, root, disp(op_stream), attr, deps); } @@ -845,7 +793,6 @@ event broadcast(BufferType* buf, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->bcast(buf, count, root, disp(default_stream), attr, deps); } @@ -858,7 +805,6 @@ event broadcast(BufferObjectType& buf, const stream& op_stream, const broadcast_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->bcast(buf, count, root, disp(op_stream), attr, deps); 
} @@ -870,7 +816,6 @@ event broadcast(BufferObjectType& buf, const communicator& comm, const broadcast_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->bcast(buf, count, root, disp(default_stream), attr, deps); } @@ -886,7 +831,6 @@ event reduce(const void* send_buf, const stream& op_stream, const reduce_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->reduce( send_buf, recv_buf, count, dtype, reduction, root, disp(op_stream), attr, deps); @@ -901,7 +845,6 @@ event reduce(const void* send_buf, const communicator& comm, const reduce_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->reduce( send_buf, recv_buf, count, dtype, reduction, root, disp(default_stream), attr, deps); @@ -917,7 +860,6 @@ event reduce(const BufferType* send_buf, const stream& op_stream, const reduce_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->reduce( send_buf, recv_buf, count, reduction, root, disp(op_stream), attr, deps); @@ -932,7 +874,6 @@ event reduce(const BufferType* send_buf, const communicator& comm, const reduce_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->reduce( send_buf, recv_buf, count, reduction, root, disp(default_stream), attr, deps); @@ -948,7 +889,6 @@ event reduce(const BufferObjectType& send_buf, const stream& op_stream, const reduce_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->reduce( send_buf, recv_buf, count, reduction, root, disp(op_stream), attr, deps); @@ -963,7 +903,6 @@ event reduce(const BufferObjectType& send_buf, const communicator& comm, const reduce_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->reduce( send_buf, recv_buf, count, reduction, root, disp(default_stream), attr, deps); @@ -979,7 +918,6 @@ event reduce_scatter(const void* send_buf, const stream& op_stream, const reduce_scatter_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->reduce_scatter( send_buf, recv_buf, recv_count, dtype, reduction, disp(op_stream), attr, deps); @@ -993,7 +931,6 @@ event reduce_scatter(const void* send_buf, const communicator& comm, const reduce_scatter_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->reduce_scatter( send_buf, recv_buf, recv_count, dtype, reduction, disp(default_stream), attr, deps); @@ -1008,7 +945,6 @@ event reduce_scatter(const BufferType* send_buf, const stream& op_stream, const reduce_scatter_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->reduce_scatter( send_buf, recv_buf, recv_count, reduction, disp(op_stream), attr, deps); @@ -1022,7 +958,6 @@ event reduce_scatter(const BufferType* send_buf, const communicator& comm, const reduce_scatter_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->reduce_scatter( send_buf, recv_buf, recv_count, reduction, disp(default_stream), attr, deps); @@ -1037,7 +972,6 @@ event reduce_scatter(const BufferObjectType& send_buf, const stream& op_stream, const reduce_scatter_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->reduce_scatter( send_buf, recv_buf, 
recv_count, reduction, disp(op_stream), attr, deps); @@ -1051,7 +985,6 @@ event reduce_scatter(const BufferObjectType& send_buf, const communicator& comm, const reduce_scatter_attr& attr, const vector_class<event>& deps) { - CHECK_DEPS(deps); impl_dispatch disp; return disp(comm)->reduce_scatter( send_buf, recv_buf, recv_count, reduction, disp(default_stream), attr, deps); @@ -1077,7 +1010,6 @@ ccl::event sparse_allreduce(const void* send_ind_buf, const ccl::stream& op_stream, const ccl::sparse_allreduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { - CHECK_DEPS(deps); ccl::impl_dispatch disp; return disp(comm)->sparse_allreduce(send_ind_buf, send_ind_count, @@ -1109,7 +1041,6 @@ ccl::event sparse_allreduce(const void* send_ind_buf, const ccl::communicator& comm, const ccl::sparse_allreduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { - CHECK_DEPS(deps); ccl::impl_dispatch disp; return disp(comm)->sparse_allreduce(send_ind_buf, send_ind_count, @@ -1141,7 +1072,6 @@ ccl::event sparse_allreduce(const IndexBufferType* send_ind_buf, const ccl::stream& op_stream, const ccl::sparse_allreduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { - CHECK_DEPS(deps); ccl::impl_dispatch disp; return disp(comm)->sparse_allreduce(send_ind_buf, send_ind_count, @@ -1170,7 +1100,6 @@ ccl::event sparse_allreduce(const IndexBufferType* send_ind_buf, const ccl::communicator& comm, const ccl::sparse_allreduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { - CHECK_DEPS(deps); ccl::impl_dispatch disp; return disp(comm)->sparse_allreduce(send_ind_buf, send_ind_count, @@ -1202,7 +1131,6 @@ ccl::event sparse_allreduce(const IndexBufferType* send_ind_buf, // const ccl::sparse_allreduce_attr& attr, // const ccl::vector_class<ccl::event>& deps) // { -// CHECK_DEPS(deps); // ccl::impl_dispatch disp; // return disp(comm)->sparse_allreduce(send_ind_buf, send_ind_count, // send_val_buf, send_val_count, @@ -1227,7 +1155,6 @@ ccl::event sparse_allreduce(const IndexBufferType* send_ind_buf, // const ccl::sparse_allreduce_attr& attr, // const ccl::vector_class<ccl::event>& deps) // { -// CHECK_DEPS(deps); // ccl::impl_dispatch disp; // return disp(comm)->sparse_allreduce(send_ind_buf, send_ind_count, // send_val_buf, send_val_count, diff --git a/src/ccl_app_api_event.cpp b/src/ccl_app_api_event.cpp index 2b0dba1e9..28c699c05 100644 --- a/src/ccl_app_api_event.cpp +++ b/src/ccl_app_api_event.cpp @@ -74,15 +74,6 @@ event CCL_API event::create_from_native(native_t& native_event) { return impl_value_t(new native_event_impl(std::move(ev))); } -event CCL_API event::create_from_native(native_handle_t native_event_handle, context_t context) { - auto version = utils::get_library_version(); - - auto ev = std::unique_ptr<ccl_event>(new ccl_event(native_event_handle, context, version)); - ev->build_from_params(); - - return impl_value_t(new native_event_impl(std::move(ev))); -} - } // namespace v1 } // namespace ccl diff --git a/src/coll/algorithms/algorithm_utils.cpp b/src/coll/algorithms/algorithm_utils.cpp new file mode 100644 index 000000000..98214594a --- /dev/null +++ b/src/coll/algorithms/algorithm_utils.cpp @@ -0,0 +1,42 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#include "coll/algorithms/algorithms_enum.hpp" + +bool ccl_coll_type_is_reduction(ccl_coll_type ctype) { + switch (ctype) { + case ccl_coll_allreduce: + case ccl_coll_reduce: + case ccl_coll_reduce_scatter: return true; + default: return false; + } +} + +const char* ccl_coll_type_to_str(ccl_coll_type type) { + switch (type) { + case ccl_coll_allgatherv: return "allgatherv"; + case ccl_coll_allreduce: return "allreduce"; + case ccl_coll_alltoall: return "alltoall"; + case ccl_coll_alltoallv: return "alltoallv"; + case ccl_coll_barrier: return "barrier"; + case ccl_coll_bcast: return "bcast"; + case ccl_coll_reduce: return "reduce"; + case ccl_coll_reduce_scatter: return "reduce_scatter"; + case ccl_coll_sparse_allreduce: return "sparse_allreduce"; + case ccl_coll_internal: return "internal"; + default: return "unknown"; + } + return "unknown"; +} diff --git a/src/coll/algorithms/algorithms_enum.hpp b/src/coll/algorithms/algorithms_enum.hpp index 07e725e8d..7d52cbc30 100644 --- a/src/coll/algorithms/algorithms_enum.hpp +++ b/src/coll/algorithms/algorithms_enum.hpp @@ -16,6 +16,8 @@ #pragma once #include "common/utils/enums.hpp" +#include "oneapi/ccl/types.hpp" + #define CCL_COLL_LIST \ ccl_coll_allgatherv, ccl_coll_allreduce, ccl_coll_alltoall, ccl_coll_alltoallv, \ ccl_coll_barrier, ccl_coll_bcast, ccl_coll_reduce, ccl_coll_reduce_scatter, \ @@ -117,6 +119,24 @@ enum ccl_coll_type { ccl_coll_last_value }; +// Currently ccl_coll_type is used in both compile-time and run-time contexts, so +// need to have both versions of the check. +// It's possible to have a constexpr function, but it requires some features from c++14 +// (e.g. 
multiple returns in constexpr functions) + +template <ccl_coll_type ctype, class Enable = void> +struct is_reduction_coll_type : std::false_type {}; + +// Reduction types +template <ccl_coll_type ctype> +struct is_reduction_coll_type< + ctype, + typename std::enable_if<ctype == ccl_coll_allreduce || ctype == ccl_coll_reduce || + ctype == ccl_coll_reduce_scatter>::type> : std::true_type {}; + +bool ccl_coll_type_is_reduction(ccl_coll_type ctype); +const char* ccl_coll_type_to_str(ccl_coll_type type); + #define CCL_COLL_TYPE_LIST \ ccl_coll_type::ccl_coll_allgatherv, ccl_coll_type::ccl_coll_allreduce, \ ccl_coll_type::ccl_coll_alltoall, ccl_coll_type::ccl_coll_alltoallv, \ @@ -124,23 +144,6 @@ enum ccl_coll_type { ccl_coll_type::ccl_coll_reduce, ccl_coll_type::ccl_coll_reduce_scatter, \ ccl_coll_type::ccl_coll_sparse_allreduce -inline const char* ccl_coll_type_to_str(ccl_coll_type type) { - switch (type) { - case ccl_coll_allgatherv: return "allgatherv"; - case ccl_coll_allreduce: return "allreduce"; - case ccl_coll_alltoall: return "alltoall"; - case ccl_coll_alltoallv: return "alltoallv"; - case ccl_coll_barrier: return "barrier"; - case ccl_coll_bcast: return "bcast"; - case ccl_coll_reduce: return "reduce"; - case ccl_coll_reduce_scatter: return "reduce_scatter"; - case ccl_coll_sparse_allreduce: return "sparse_allreduce"; - case ccl_coll_internal: return "internal"; - default: return "unknown"; - } - return "unknown"; -} - enum ccl_coll_reduction { sum, prod, @@ -152,11 +155,10 @@ enum ccl_coll_reduction { }; #define REDUCE_TYPES \ - ccl_coll_reduction::sum, ccl_coll_reduction::prod, ccl_coll_reduction::min, \ - ccl_coll_reduction::max /*, ccl_coll_reduction::custom*/ + ccl::reduction::sum, ccl::reduction::prod, ccl::reduction::min, \ + ccl::reduction::max /*, ccl::reduction::custom*/ -using ccl_coll_reductions = utils::enum_to_str<static_cast<int>(ccl_coll_reduction::last_value)>; -inline const std::string reduction_to_str(ccl_coll_reduction reduction_type) { - return ccl_coll_reductions({ "sum", "prod", "min", "max" }) - .choose(reduction_type, "INVALID_VALUE"); +using ccl_reductions = utils::enum_to_str<static_cast<int>(ccl::reduction::custom)>; +inline const std::string reduction_to_str(ccl::reduction reduction_type) { + return ccl_reductions({ "sum", "prod", "min", "max" }).choose(reduction_type, "INVALID_VALUE"); } diff --git a/src/coll/algorithms/allreduce/allreduce_rma.cpp b/src/coll/algorithms/allreduce/allreduce_rma.cpp index a74c95d92..a91357f6d 100644 --- a/src/coll/algorithms/allreduce/allreduce_rma.cpp +++ b/src/coll/algorithms/allreduce/allreduce_rma.cpp @@ -354,7 +354,7 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched, entry->set_field_fn<ccl_sched_entry_field_dst_mr>( rma_ring_allreduce_get_remote_rs_dst_buf_mr, ar_handler); - if (block_count * dtype.size() > atl_wrapper::attr.max_order_waw_size) + if (block_count * dtype.size() > atl_wrapper::attr.out.max_order_waw_size) sched->add_barrier(); entry = entry_factory::make_entry<write_entry>( @@ -415,7 +415,7 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched, entry->set_field_fn<ccl_sched_entry_field_dst_mr>(rma_ring_allreduce_get_remote_recv_buf_mr, ar_handler); - if (block_count * dtype.size() > atl_wrapper::attr.max_order_waw_size) + if (block_count * dtype.size() > atl_wrapper::attr.out.max_order_waw_size) sched->add_barrier(); entry = entry_factory::make_entry<write_entry>( diff --git a/src/coll/algorithms/alltoallv.cpp b/src/coll/algorithms/alltoallv.cpp index f9675483b..a411440d7 
100644 --- a/src/coll/algorithms/alltoallv.cpp +++ b/src/coll/algorithms/alltoallv.cpp @@ -40,6 +40,17 @@ ccl::status ccl_coll_build_direct_alltoallv(ccl_sched* sched, return ccl::status::success; } +ccl::status ccl_coll_add_scatter_alltoallv_plain_barriers(std::vector<ccl_sched*>& scheds) { + if (ccl::global_data::env().alltoall_scatter_plain) { + ssize_t max_ops = ccl::global_data::env().alltoall_scatter_max_ops; + for (auto s : scheds) { + if (s->entries_count() % max_ops == 0) + s->add_barrier(); + } + } + return ccl::status::success; +} + ccl::status ccl_coll_add_scatter_alltoallv_barriers(std::vector<ccl_sched*>& scheds, size_t sched_idx) { ssize_t max_ops = ccl::global_data::env().alltoall_scatter_max_ops; @@ -48,12 +59,7 @@ ccl::status ccl_coll_add_scatter_alltoallv_barriers(std::vector<ccl_sched*>& sch if (scheds[sched_idx]->entries_count() % max_ops == 0) scheds[sched_idx]->add_barrier(); - if (ccl::global_data::env().alltoall_scatter_plain) { - for (auto s : scheds) { - if (s->entries_count() % max_ops == 0) - s->add_barrier(); - } - } + ccl_coll_add_scatter_alltoallv_plain_barriers(scheds); } return ccl::status::success; @@ -277,8 +283,6 @@ ccl::status ccl_coll_build_scatter_alltoallv(ccl_master_sched* main_sched, entry_factory::make_chunked_recv_entry( scheds, sched_idx, recv_buf, recv_counts[src], dtype, src, comm); - - ccl_coll_add_scatter_alltoallv_barriers(scheds, sched_idx); } for (int idx = 0; idx < comm_size; idx++) { @@ -300,8 +304,6 @@ ccl::status ccl_coll_build_scatter_alltoallv(ccl_master_sched* main_sched, dtype, dst, comm); - - ccl_coll_add_scatter_alltoallv_barriers(scheds, sched_idx); } if (!inplace) @@ -345,6 +347,13 @@ ccl::status ccl_coll_build_scatter_barrier_alltoallv(ccl_master_sched* main_sche size_t total_send_count = 0, total_recv_count = 0; size_t total_send_bytes = 0, total_recv_bytes = 0; + ssize_t max_ops = ccl::global_data::env().alltoall_scatter_max_ops; + if (max_ops != CCL_ENV_SIZET_NOT_SPECIFIED) { + for (size_t idx = 0; idx < sched_count; idx++) { + scheds[idx]->flow_control.set_max_credits(max_ops); + } + } + bool inplace = (coll_param.send_buf && (coll_param.send_buf == coll_param.recv_buf)) ? 
true : false; @@ -419,8 +428,6 @@ ccl::status ccl_coll_build_scatter_barrier_alltoallv(ccl_master_sched* main_sche entry_factory::make_chunked_recv_entry( recv_scheds, sched_idx, recv_buf, recv_counts[src], dtype, src, comm); - - ccl_coll_add_scatter_alltoallv_barriers(recv_scheds, sched_idx); } for (int idx = 0; idx < comm_size; idx++) { @@ -442,8 +449,6 @@ ccl::status ccl_coll_build_scatter_barrier_alltoallv(ccl_master_sched* main_sche dtype, dst, comm); - - ccl_coll_add_scatter_alltoallv_barriers(send_scheds, sched_idx); } if (!inplace) diff --git a/src/coll/algorithms/sparse_allreduce/sparse_allreduce.hpp b/src/coll/algorithms/sparse_allreduce/sparse_allreduce.hpp index 2daba7276..4e24e172a 100644 --- a/src/coll/algorithms/sparse_allreduce/sparse_allreduce.hpp +++ b/src/coll/algorithms/sparse_allreduce/sparse_allreduce.hpp @@ -333,7 +333,8 @@ ccl::status sparse_reduce_ring(const void* ctx) { for (size_t idx = 0; idx < sa_handler->send_count[0]; idx++) { auto it = sa_handler->iv_map->find(rcv_i[idx]); if (it != sa_handler->iv_map->end()) { - ccl_comp_reduce((void*)(rcv_v + idx * sa_handler->val_dim_cnt), + ccl_comp_reduce(sa_handler->sched, + (void*)(rcv_v + idx * sa_handler->val_dim_cnt), sa_handler->val_dim_cnt, snd_v + it->second[0], nullptr, @@ -548,7 +549,7 @@ ccl::status ccl_coll_build_sparse_allreduce_ring(ccl_sched* sched, /* get value dimension */ size_t val_dim_cnt = send_val_count / send_ind_count; - CCL_ASSERT(recv_ind_buf && recv_ind_buf, "recv buffers are null"); + CCL_ASSERT(recv_ind_buf && recv_val_buf, "recv buffers are null"); CCL_ASSERT(recv_ind_count && recv_val_count, "recv counts are null"); void** r_ind_buf = recv_ind_buf; @@ -767,7 +768,7 @@ ccl::status ccl_coll_build_sparse_allreduce_mask(ccl_sched* sched, /* get value dimension */ size_t val_dim_cnt = send_val_count / send_ind_count; - CCL_ASSERT(recv_ind_buf && recv_ind_buf, "recv buffers are null"); + CCL_ASSERT(recv_ind_buf && recv_val_buf, "recv buffers are null"); CCL_ASSERT(recv_ind_count && recv_val_count, "recv counts are null"); void** r_ind_buf = recv_ind_buf; @@ -1077,7 +1078,7 @@ ccl::status ccl_coll_build_sparse_allreduce_3_allgatherv(ccl_sched* sched, /* get value dimension */ size_t val_dim_cnt = send_val_count / send_ind_count; - CCL_ASSERT(recv_ind_buf && recv_ind_buf, "recv buffers are null"); + CCL_ASSERT(recv_ind_buf && recv_val_buf, "recv buffers are null"); CCL_ASSERT(recv_ind_count && recv_val_count, "recv counts are null"); void** r_ind_buf = recv_ind_buf; diff --git a/src/coll/coll.cpp b/src/coll/coll.cpp index 5e19b3901..fe1622f9e 100644 --- a/src/coll/coll.cpp +++ b/src/coll/coll.cpp @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/
+#include <numeric>
+
 #include "oneapi/ccl/types.hpp"
 #include "oneapi/ccl/aliases.hpp"
@@ -45,10 +47,12 @@
 #include "coll/ccl_reduce_op_attr.hpp"
 #include "coll/ccl_reduce_scatter_op_attr.hpp"
 #include "coll/ccl_sparse_allreduce_op_attr.hpp"
+#include "coll/coll_param.hpp"
 
 #include "common/global/global.hpp"
 
 #include "coll/algorithms/algorithms.hpp"
+#include "coll/algorithms/algorithms_enum.hpp"
 #include "coll/algorithms/allreduce/allreduce_2d.hpp"
 #include "coll/algorithms/sparse_allreduce/sparse_allreduce.hpp"
 #include "coll/selection/selection.hpp"
@@ -116,8 +120,66 @@ ccl_coll_attr::ccl_coll_attr(const ccl::sparse_allreduce_attr& attr) {
     sparse_coalesce_mode = attr.get<ccl::sparse_allreduce_attr_id::coalesce_mode>();
 }
 
+static void ccl_coll_validate_and_adjust(const ccl_coll_param& param) {
+    // not a SYCL path, no validation needed
+    if (param.stream == nullptr) {
+        return;
+    }
+
+    // skip validation if it was requested explicitly (e.g. for sycl::buffer)
+    if (param.skip_validation) {
+        return;
+    }
+
+#ifdef CCL_ENABLE_SYCL
+    std::vector<void*> bufs = {};
+
+    switch (param.ctype) {
+        case ccl_coll_alltoallv: {
+            // if the sum of the counts is 0, the buf pointer could be anything,
+            // including nullptr or an invalid pointer; we should neither validate nor dereference it
+            // TODO: make const void*
+            if (std::accumulate(param.send_counts, param.send_counts + param.comm->size(), 0) > 0) {
+                bufs.push_back((void*)(param.send_buf));
+            }
+
+            if (std::accumulate(param.recv_counts, param.recv_counts + param.comm->size(), 0) > 0) {
+                bufs.push_back((void*)(param.recv_buf));
+            }
+            break;
+        }
+        case ccl_coll_allreduce:
+        case ccl_coll_allgatherv:
+        case ccl_coll_alltoall:
+        case ccl_coll_reduce:
+        case ccl_coll_reduce_scatter:
+            bufs = { (void*)param.send_buf, (void*)param.recv_buf };
+            break;
+        case ccl_coll_bcast: bufs = { (void*)param.recv_buf }; break;
+        case ccl_coll_sparse_allreduce:
+            bufs = { (void*)param.sparse_param.send_ind_buf,
+                     (void*)param.sparse_param.send_val_buf,
+                     (void*)param.sparse_param.recv_ind_buf,
+                     (void*)param.sparse_param.recv_val_buf };
+            break;
+        default:
+            // everything else (e.g. barrier) carries no user buffers and requires no validation
+            return;
+    }
+
+    auto q = param.stream->get_native_stream();
+    CCL_THROW_IF_NOT(
+        native::detail::check_assoc_device_memory(bufs, q.get_device(), q.get_context()) !=
+            native::detail::usm_support_mode::prohibited,
+        "unsupported usm type");
+#endif /* CCL_ENABLE_SYCL */
+}
+
 /* param is not const because param.comm can be updated for unordered colls */
 static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr& attr) {
+    // perform validation and adjustment if necessary
+    ccl_coll_validate_and_adjust(param);
+
     ccl::global_data& data = ccl::global_data::get();
 
     /* 1.
decide whether schedule should be postponed (this includes caching and staring) */ @@ -689,7 +751,9 @@ ccl_request* ccl_allgatherv_impl(const void* send_buf, ccl::datatype dtype, const ccl_coll_attr& attr, ccl_comm* comm, - const ccl_stream* stream) { + const ccl_stream* stream, + const std::vector<ccl::event>& deps, + bool skip_validation) { ccl_coll_param param{}; param.ctype = ccl_coll_allgatherv; @@ -700,6 +764,8 @@ ccl_request* ccl_allgatherv_impl(const void* send_buf, param.dtype = ccl::global_data::get().dtypes->get(dtype); param.stream = stream; param.comm = comm; + param.skip_validation = skip_validation; + copy_deps(deps, param.deps); auto req = ccl_coll_create(param, attr); LOG_DEBUG("coll ", ccl_coll_type_to_str(param.ctype), " created, req ", req); @@ -713,7 +779,9 @@ ccl_request* ccl_allreduce_impl(const void* send_buf, ccl::reduction reduction, const ccl_coll_attr& attr, ccl_comm* comm, - const ccl_stream* stream) { + const ccl_stream* stream, + const std::vector<ccl::event>& deps, + bool skip_validation) { ccl_coll_param param{}; param.ctype = ccl_coll_allreduce; @@ -724,6 +792,8 @@ ccl_request* ccl_allreduce_impl(const void* send_buf, param.reduction = reduction; param.stream = stream; param.comm = comm; + param.skip_validation = skip_validation; + copy_deps(deps, param.deps); auto req = ccl_coll_create(param, attr); LOG_DEBUG("coll ", ccl_coll_type_to_str(param.ctype), " created, req ", req, " count ", count); @@ -736,7 +806,9 @@ ccl_request* ccl_alltoall_impl(const void* send_buf, ccl::datatype dtype, const ccl_coll_attr& attr, ccl_comm* comm, - const ccl_stream* stream) { + const ccl_stream* stream, + const std::vector<ccl::event>& deps, + bool skip_validation) { ccl_coll_param param{}; param.ctype = ccl_coll_alltoall; @@ -746,6 +818,8 @@ ccl_request* ccl_alltoall_impl(const void* send_buf, param.dtype = ccl::global_data::get().dtypes->get(dtype); param.stream = stream; param.comm = comm; + param.skip_validation = skip_validation; + copy_deps(deps, param.deps); auto req = ccl_coll_create(param, attr); LOG_DEBUG("coll ", ccl_coll_type_to_str(param.ctype), " created, req ", req, " count ", count); @@ -759,7 +833,9 @@ ccl_request* ccl_alltoallv_impl(const void* send_buf, ccl::datatype dtype, const ccl_coll_attr& attr, ccl_comm* comm, - const ccl_stream* stream) { + const ccl_stream* stream, + const std::vector<ccl::event>& deps, + bool skip_validation) { ccl_coll_param param{}; param.ctype = ccl_coll_alltoallv; @@ -770,6 +846,8 @@ ccl_request* ccl_alltoallv_impl(const void* send_buf, param.dtype = ccl::global_data::get().dtypes->get(dtype); param.stream = stream; param.comm = comm; + param.skip_validation = skip_validation; + copy_deps(deps, param.deps); auto req = ccl_coll_create(param, attr); LOG_DEBUG("coll ", ccl_coll_type_to_str(param.ctype), " created, req ", req); @@ -784,7 +862,8 @@ ccl_request* ccl_allreduce_gpu_impl(const void* send_buf, ccl::reduction reduction, const ccl_coll_attr& attr, ccl_comm* comm, - const ccl_stream* stream) { + const ccl_stream* stream, + const std::vector<ccl::event>& deps) { ccl_coll_param param{}; param.ctype = ccl_coll_allreduce; @@ -795,6 +874,7 @@ ccl_request* ccl_allreduce_gpu_impl(const void* send_buf, param.reduction = reduction; param.stream = stream; param.comm = comm; + copy_deps(deps, param.deps); auto req = ccl_gpu_coll_create(param, attr); LOG_DEBUG( @@ -802,13 +882,18 @@ ccl_request* ccl_allreduce_gpu_impl(const void* send_buf, return req; } -void ccl_barrier_impl(ccl_comm* comm, const ccl_stream* stream) { +void 
ccl_barrier_impl(ccl_comm* comm, + const ccl_stream* stream, + const std::vector<ccl::event>& deps, + bool skip_validation) { ccl_coll_param param{}; param.ctype = ccl_coll_barrier; param.dtype = ccl_datatype_int8; param.stream = stream; param.comm = comm; + param.skip_validation = skip_validation; + copy_deps(deps, param.deps); ccl_coll_attr attr{}; attr.synchronous = 1; @@ -829,16 +914,20 @@ ccl_request* ccl_broadcast_impl(void* buf, int root, const ccl_coll_attr& attr, ccl_comm* comm, - const ccl_stream* stream) { + const ccl_stream* stream, + const std::vector<ccl::event>& deps, + bool skip_validation) { ccl_coll_param param{}; param.ctype = ccl_coll_bcast; - param.buf = buf; + param.send_buf = param.recv_buf = buf; param.count = count; param.dtype = ccl::global_data::get().dtypes->get(dtype); param.root = root; param.stream = stream; param.comm = comm; + param.skip_validation = skip_validation; + copy_deps(deps, param.deps); auto req = ccl_coll_create(param, attr); LOG_DEBUG("coll ", ccl_coll_type_to_str(param.ctype), " created, req ", req); @@ -853,7 +942,9 @@ ccl_request* ccl_reduce_impl(const void* send_buf, int root, const ccl_coll_attr& attr, ccl_comm* comm, - const ccl_stream* stream) { + const ccl_stream* stream, + const std::vector<ccl::event>& deps, + bool skip_validation) { ccl_coll_param param{}; param.ctype = ccl_coll_reduce; @@ -865,6 +956,8 @@ ccl_request* ccl_reduce_impl(const void* send_buf, param.root = root; param.stream = stream; param.comm = comm; + param.skip_validation = skip_validation; + copy_deps(deps, param.deps); auto req = ccl_coll_create(param, attr); LOG_DEBUG("coll ", ccl_coll_type_to_str(param.ctype), " created, req ", req); @@ -878,7 +971,9 @@ ccl_request* ccl_reduce_scatter_impl(const void* send_buf, ccl::reduction reduction, const ccl_coll_attr& attr, ccl_comm* comm, - const ccl_stream* stream) { + const ccl_stream* stream, + const std::vector<ccl::event>& deps, + bool skip_validation) { ccl_coll_param param{}; param.ctype = ccl_coll_reduce_scatter; @@ -889,6 +984,8 @@ ccl_request* ccl_reduce_scatter_impl(const void* send_buf, param.reduction = reduction; param.stream = stream; param.comm = comm; + param.skip_validation = skip_validation; + copy_deps(deps, param.deps); auto req = ccl_coll_create(param, attr); LOG_DEBUG("coll ", ccl_coll_type_to_str(param.ctype), " created, req ", req); @@ -908,7 +1005,9 @@ ccl_request* ccl_sparse_allreduce_impl(const void* send_ind_buf, ccl::reduction reduction, const ccl_coll_attr& attr, ccl_comm* comm, - const ccl_stream* stream) { + const ccl_stream* stream, + const std::vector<ccl::event>& deps, + bool skip_validation) { ccl_coll_param param{}; param.ctype = ccl_coll_sparse_allreduce; @@ -925,6 +1024,8 @@ ccl_request* ccl_sparse_allreduce_impl(const void* send_ind_buf, param.reduction = reduction; param.stream = stream; param.comm = comm; + param.skip_validation = skip_validation; + copy_deps(deps, param.deps); ccl_coll_attr internal_attr(attr); internal_attr.to_cache = 0; /* skip to_cache flag, unsupported yet */ diff --git a/src/coll/coll.hpp b/src/coll/coll.hpp index 69a8cc7ff..76601f01a 100644 --- a/src/coll/coll.hpp +++ b/src/coll/coll.hpp @@ -108,7 +108,9 @@ ccl_request* ccl_allgatherv_impl(const void* send_buf, ccl::datatype dtype, const ccl_coll_attr& attr, ccl_comm* comm, - const ccl_stream* stream); + const ccl_stream* stream, + const std::vector<ccl::event>& deps, + bool skip_validation = false); ccl_request* ccl_allreduce_impl(const void* send_buf, void* recv_buf, @@ -117,7 +119,9 @@ ccl_request* 
ccl_allreduce_impl(const void* send_buf, ccl::reduction reduction, const ccl_coll_attr& attr, ccl_comm* comm, - const ccl_stream* stream); + const ccl_stream* stream, + const std::vector<ccl::event>& deps, + bool skip_validation = false); template <class gpu_device_type> ccl_request* ccl_allreduce_gpu_impl(const void* send_buf, void* recv_buf, @@ -126,7 +130,8 @@ ccl_request* ccl_allreduce_gpu_impl(const void* send_buf, ccl::reduction reduction, const ccl_coll_attr& attr, ccl_comm* comm, - const ccl_stream* stream); + const ccl_stream* stream, + const std::vector<ccl::event>& deps); ccl_request* ccl_alltoall_impl(const void* send_buf, void* recv_buf, @@ -134,7 +139,9 @@ ccl_request* ccl_alltoall_impl(const void* send_buf, ccl::datatype dtype, const ccl_coll_attr& attr, ccl_comm* comm, - const ccl_stream* stream); + const ccl_stream* stream, + const std::vector<ccl::event>& deps, + bool skip_validation = false); ccl_request* ccl_alltoallv_impl(const void* send_buf, const size_t* send_counts, @@ -143,9 +150,14 @@ ccl_request* ccl_alltoallv_impl(const void* send_buf, ccl::datatype dtype, const ccl_coll_attr& attr, ccl_comm* comm, - const ccl_stream* stream); + const ccl_stream* stream, + const std::vector<ccl::event>& deps, + bool skip_validation = false); -void ccl_barrier_impl(ccl_comm* comm, const ccl_stream* stream); +void ccl_barrier_impl(ccl_comm* comm, + const ccl_stream* stream, + const std::vector<ccl::event>& deps, + bool skip_validation = false); ccl_request* ccl_broadcast_impl(void* buf, size_t count, @@ -153,7 +165,9 @@ ccl_request* ccl_broadcast_impl(void* buf, int root, const ccl_coll_attr& attr, ccl_comm* comm, - const ccl_stream* stream); + const ccl_stream* stream, + const std::vector<ccl::event>& deps, + bool skip_validation = false); ccl_request* ccl_reduce_impl(const void* send_buf, void* recv_buf, @@ -163,7 +177,9 @@ ccl_request* ccl_reduce_impl(const void* send_buf, int root, const ccl_coll_attr& attr, ccl_comm* comm, - const ccl_stream* stream); + const ccl_stream* stream, + const std::vector<ccl::event>& deps, + bool skip_validation = false); ccl_request* ccl_reduce_scatter_impl(const void* send_buf, void* recv_buf, @@ -172,7 +188,9 @@ ccl_request* ccl_reduce_scatter_impl(const void* send_buf, ccl::reduction reduction, const ccl_coll_attr& attr, ccl_comm* comm, - const ccl_stream* stream); + const ccl_stream* stream, + const std::vector<ccl::event>& deps, + bool skip_validation = false); ccl_request* ccl_sparse_allreduce_impl(const void* send_ind_buf, size_t send_ind_count, @@ -187,4 +205,6 @@ ccl_request* ccl_sparse_allreduce_impl(const void* send_ind_buf, ccl::reduction reduction, const ccl_coll_attr& attr, ccl_comm* comm, - const ccl_stream* stream); + const ccl_stream* stream, + const std::vector<ccl::event>& deps, + bool skip_validation = false); diff --git a/src/coll/coll_param.cpp b/src/coll/coll_param.cpp new file mode 100644 index 000000000..a050cdfd1 --- /dev/null +++ b/src/coll/coll_param.cpp @@ -0,0 +1,68 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ +#include "coll/coll_param.hpp" + +bool operator==(const coll_param_gpu& lhs, const coll_param_gpu& rhs) { + CCL_ASSERT((lhs.is_reduction() && rhs.is_reduction()) || + (!lhs.is_reduction() && !rhs.is_reduction())); + + bool res = + lhs.get_coll_type() == rhs.get_coll_type() && lhs.get_datatype() == rhs.get_datatype(); + + if (lhs.is_reduction()) { + res = res && (lhs.get_reduction() == rhs.get_reduction()); + } + + return res; +} + +void copy_deps(const std::vector<ccl::event>& in, std::vector<ccl::event>& out) { +#ifdef CCL_ENABLE_SYCL + out.clear(); + for (size_t idx = 0; idx < in.size(); idx++) { + try { + auto sycl_event = in[idx].get_native(); + out.push_back(ccl::create_event(sycl_event)); + } + catch (ccl::exception&) { + } + } +#else /* CCL_ENABLE_SYCL */ + CCL_THROW_IF_NOT(in.size() == 0, "host deps are not supported yet"); +#endif /* CCL_ENABLE_SYCL */ +} + +ccl_coll_param::ccl_coll_param(const ccl_coll_param& other) { + ctype = other.ctype; + send_buf = other.send_buf; + recv_buf = other.recv_buf; + count = other.count; + send_count = other.send_count; + send_counts = other.send_counts; + recv_counts = other.recv_counts; + dtype = other.dtype; + reduction = other.reduction; + root = other.root; + stream = other.stream; + copy_deps(other.deps, deps); + comm = other.comm; + sparse_param = other.sparse_param; + +#ifdef CCL_ENABLE_SYCL + device_send_buf = other.device_send_buf; + device_recv_buf = other.device_recv_buf; +#endif /* CCL_ENABLE_SYCL */ +} diff --git a/src/coll/coll_param.hpp b/src/coll/coll_param.hpp index 34b857bd5..927e0ca3f 100644 --- a/src/coll/coll_param.hpp +++ b/src/coll/coll_param.hpp @@ -17,14 +17,7 @@ #include "coll/algorithms/algorithms_enum.hpp" #include "common/datatype/datatype.hpp" - -#include "oneapi/ccl/type_traits.hpp" -#include "oneapi/ccl/stream_attr_ids.hpp" -#include "oneapi/ccl/stream_attr_ids_traits.hpp" -#include "oneapi/ccl/stream.hpp" -#include "oneapi/ccl/coll_attr_ids.hpp" -#include "oneapi/ccl/coll_attr_ids_traits.hpp" -#include "oneapi/ccl/coll_attr.hpp" +#include "oneapi/ccl.hpp" class ccl_comm; @@ -44,10 +37,10 @@ using ccl_sycl_buffer_one_dim_types = std::tuple<ccl_sycl_typed_buffer_t<int8_t> ccl_sycl_typed_buffer_t<uint32_t>, ccl_sycl_typed_buffer_t<int64_t>, ccl_sycl_typed_buffer_t<uint64_t>, - ccl_sycl_typed_buffer_t<float>, //unsupported + ccl_sycl_typed_buffer_t<uint16_t>, ccl_sycl_typed_buffer_t<float>, ccl_sycl_typed_buffer_t<double>, - ccl_sycl_typed_buffer_t<float>>; //unsupported + ccl_sycl_typed_buffer_t<uint16_t>>; #endif /* CCL_ENABLE_SYCL */ #define CCL_INVALID_PROC_IDX (-1) @@ -99,9 +92,10 @@ struct ccl_coll_sparse_param { ccl_datatype itype; }; +void copy_deps(const std::vector<ccl::event>& in, std::vector<ccl::event>& out); + struct ccl_coll_param { ccl_coll_type ctype; - void* buf; const void* send_buf; void* recv_buf; size_t count; @@ -112,16 +106,61 @@ struct ccl_coll_param { ccl::reduction reduction; int root; const ccl_stream* stream; + std::vector<ccl::event> deps; ccl_comm* comm; ccl_coll_sparse_param sparse_param; + bool skip_validation; #ifdef CCL_ENABLE_SYCL - ccl_sycl_buffer_t* sycl_send_buf; - ccl_sycl_buffer_t* sycl_recv_buf; - ccl_sycl_buffer_t* sycl_buf; + ccl_sycl_buffer_t* device_send_buf; + ccl_sycl_buffer_t* device_recv_buf; #endif /* CCL_ENABLE_SYCL */ + + ccl_coll_param() {} + ccl_coll_param(const ccl_coll_param& other); }; +class coll_param_gpu { + ccl_coll_type ctype; + ccl::datatype dtype; 
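+    // reduction op; only meaningful for reduction collectives, access is
+    // guarded by get_reduction() below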
+    ccl::reduction red;
+
+public:
+    coll_param_gpu(ccl_coll_type ctype, ccl::datatype dtype, ccl::reduction red)
+            : ctype{ ctype },
+              dtype{ dtype },
+              red{ red } {}
+
+    coll_param_gpu(ccl_coll_type ctype, ccl::datatype dtype)
+            : ctype{ ctype },
+              dtype{ dtype },
+              red{ (ccl::reduction)-1 } {
+        assert(!is_reduction() && "This constructor is invalid for reduction types");
+    }
+
+    ccl_coll_type get_coll_type() const {
+        return ctype;
+    }
+
+    ccl::datatype get_datatype() const {
+        return dtype;
+    }
+
+    bool is_reduction() const {
+        return ccl_coll_type_is_reduction(get_coll_type());
+    }
+
+    ccl::reduction get_reduction() const {
+        if (!is_reduction()) {
+            throw ccl::exception(
+                "get_reduction() is not supported for non-reduction collective types, e.g. bcast");
+        }
+        return red;
+    }
+};
+
+bool operator==(const coll_param_gpu& lhs, const coll_param_gpu& rhs);
+
 /* explicitly split coll_param and coll_param_copy
    to separate coll_param structure which is used
    for interaction between different modules
diff --git a/src/coll/selection/selector_allreduce.cpp b/src/coll/selection/selector_allreduce.cpp
index ae8c8ce74..101adaf5f 100644
--- a/src/coll/selection/selector_allreduce.cpp
+++ b/src/coll/selection/selector_allreduce.cpp
@@ -61,7 +61,7 @@ bool ccl_algorithm_selector_helper<ccl_coll_allreduce_algo>::can_use(
 
     if (algo == ccl_coll_allreduce_rabenseifner && (int)param.count < param.comm->pof2())
         can_use = false;
-    else if (algo == ccl_coll_allreduce_ring_rma && !atl_wrapper::attr.enable_rma)
+    else if (algo == ccl_coll_allreduce_ring_rma && !atl_wrapper::attr.out.enable_rma)
         can_use = false;
     else if (algo == ccl_coll_allreduce_starlike && !(param.count / param.comm->size()))
         can_use = false;
diff --git a/src/common/comm/comm_interface.hpp b/src/common/comm/comm_interface.hpp
index e0c7eaf67..02d30b0e8 100644
--- a/src/common/comm/comm_interface.hpp
+++ b/src/common/comm/comm_interface.hpp
@@ -109,6 +109,8 @@ struct gpu_comm_attr;
     COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, uint64_t); \
     COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, float); \
     COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, double); \
+    COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, ccl::bfloat16); \
+    COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, ccl::float16); \
     COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(COMM, int32_t, float); \
     COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(COMM, int32_t, ccl::bfloat16); \
     COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(COMM, int64_t, float); \
diff --git a/src/common/comm/compiler_comm_interface_dispatcher.cpp b/src/common/comm/compiler_comm_interface_dispatcher.cpp
index cd688160b..6734322bf 100644
--- a/src/common/comm/compiler_comm_interface_dispatcher.cpp
+++ b/src/common/comm/compiler_comm_interface_dispatcher.cpp
@@ -140,7 +140,7 @@ communicator_interface_dispatcher::create_communicator_from_unified_device(
     // Use process class if not specified otherwise
     // TODO: implement a proper dispatching for other types
     if (preferred_topology_group == ccl::group_split_type::undetermined) {
-        preferred_topology_group = ccl::group_split_type::process;
+        preferred_topology_group = ccl::group_split_type::cluster;
     }
 
     // read comm split attributes
diff --git a/src/common/comm/host_communicator/host_communicator.cpp b/src/common/comm/host_communicator/host_communicator.cpp
index ad135c487..0f7c5cd9d 100644
--- a/src/common/comm/host_communicator/host_communicator.cpp
+++ b/src/common/comm/host_communicator/host_communicator.cpp
@@ -176,7 +176,7 @@ ccl::event host_communicator::barrier_impl(const
ccl::stream::impl_value_t& op_s const ccl::vector_class<ccl::event>& deps) { // TODO what exactly we need to do with 'attr' here? - ccl_barrier_impl(comm_impl.get(), op_stream.get()); + ccl_barrier_impl(comm_impl.get(), op_stream.get(), deps); // TODO what exactly we need to return here? ccl_barrier_impl() is void func ccl_request* req = nullptr; @@ -192,8 +192,15 @@ ccl::event host_communicator::allgatherv_impl(const void* send_buf, const ccl::stream::impl_value_t& stream, const ccl::allgatherv_attr& attr, const ccl::vector_class<ccl::event>& deps) { - ccl_request* req = ccl_allgatherv_impl( - send_buf, send_count, recv_buf, recv_counts.data(), dtype, attr, comm_impl.get(), nullptr); + ccl_request* req = ccl_allgatherv_impl(send_buf, + send_count, + recv_buf, + recv_counts.data(), + dtype, + attr, + comm_impl.get(), + nullptr, + deps); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } @@ -206,9 +213,20 @@ ccl::event host_communicator::allgatherv_impl(const void* send_buf, const ccl::stream::impl_value_t& stream, const ccl::allgatherv_attr& attr, const ccl::vector_class<ccl::event>& deps) { - // TODO not implemented - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented"); - return {}; + ccl_coll_attr internal_attr(attr); + internal_attr.vector_buf = 1; + + ccl_request* req = ccl_allgatherv_impl(reinterpret_cast<const void*>(send_buf), + send_count, + (void*)(recv_bufs.data()), + recv_counts.data(), + dtype, + internal_attr, + comm_impl.get(), + nullptr, + deps); + + return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } /* allreduce */ @@ -221,7 +239,7 @@ ccl::event host_communicator::allreduce_impl(const void* send_buf, const ccl::allreduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { ccl_request* req = ccl_allreduce_impl( - send_buf, recv_buf, count, dtype, reduction, attr, comm_impl.get(), nullptr); + send_buf, recv_buf, count, dtype, reduction, attr, comm_impl.get(), nullptr, deps); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } @@ -235,7 +253,7 @@ ccl::event host_communicator::alltoall_impl(const void* send_buf, const ccl::alltoall_attr& attr, const ccl::vector_class<ccl::event>& deps) { ccl_request* req = - ccl_alltoall_impl(send_buf, recv_buf, count, dtype, attr, comm_impl.get(), nullptr); + ccl_alltoall_impl(send_buf, recv_buf, count, dtype, attr, comm_impl.get(), nullptr, deps); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } @@ -268,7 +286,8 @@ ccl::event host_communicator::alltoallv_impl(const void* send_buf, dtype, attr, comm_impl.get(), - nullptr); + nullptr, + deps); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } @@ -294,7 +313,8 @@ ccl::event host_communicator::broadcast_impl(void* buf, const ccl::stream::impl_value_t& stream, const ccl::broadcast_attr& attr, const ccl::vector_class<ccl::event>& deps) { - ccl_request* req = ccl_broadcast_impl(buf, count, dtype, root, attr, comm_impl.get(), nullptr); + ccl_request* req = + ccl_broadcast_impl(buf, count, dtype, root, attr, comm_impl.get(), nullptr, deps); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } @@ -310,7 +330,7 @@ ccl::event host_communicator::reduce_impl(const void* send_buf, const ccl::reduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { ccl_request* req = ccl_reduce_impl( - send_buf, recv_buf, count, dtype, reduction, root, attr, comm_impl.get(), nullptr); + send_buf, recv_buf, count, dtype, reduction, root, attr, 
comm_impl.get(), nullptr, deps); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } @@ -325,7 +345,7 @@ ccl::event host_communicator::reduce_scatter_impl(const void* send_buf, const ccl::reduce_scatter_attr& attr, const ccl::vector_class<ccl::event>& deps) { ccl_request* req = ccl_reduce_scatter_impl( - send_buf, recv_buf, recv_count, dtype, reduction, attr, comm_impl.get(), nullptr); + send_buf, recv_buf, recv_count, dtype, reduction, attr, comm_impl.get(), nullptr, deps); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } @@ -358,7 +378,8 @@ ccl::event host_communicator::sparse_allreduce_impl(const void* send_ind_buf, reduction, attr, comm_impl.get(), - nullptr); + nullptr, + deps); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } diff --git a/src/common/comm/host_communicator/host_communicator_impl.hpp b/src/common/comm/host_communicator/host_communicator_impl.hpp index d71640c00..a958a117a 100644 --- a/src/common/comm/host_communicator/host_communicator_impl.hpp +++ b/src/common/comm/host_communicator/host_communicator_impl.hpp @@ -42,7 +42,8 @@ ccl::event host_communicator::allgatherv_impl(const buffer_type* send_buf, ccl::native_type_info<buffer_type>::dtype, attr, comm_impl.get(), - nullptr); + nullptr, + deps); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } @@ -50,7 +51,7 @@ ccl::event host_communicator::allgatherv_impl(const buffer_type* send_buf, template <class buffer_type> ccl::event host_communicator::allgatherv_impl(const buffer_type* send_buf, size_t send_count, - ccl::vector_class<buffer_type*>& recv_buf, + ccl::vector_class<buffer_type*>& recv_bufs, const ccl::vector_class<size_t>& recv_counts, const ccl::stream::impl_value_t& stream, const ccl::allgatherv_attr& attr, @@ -60,12 +61,13 @@ ccl::event host_communicator::allgatherv_impl(const buffer_type* send_buf, ccl_request* req = ccl_allgatherv_impl(reinterpret_cast<const void*>(send_buf), send_count, - (void*)(recv_buf.data()), + (void*)(recv_bufs.data()), recv_counts.data(), ccl::native_type_info<buffer_type>::dtype, internal_attr, comm_impl.get(), - nullptr); + nullptr, + deps); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } @@ -113,7 +115,8 @@ ccl::event host_communicator::allreduce_impl(const buffer_type* send_buf, reduction, attr, comm_impl.get(), - nullptr); + nullptr, + deps); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } @@ -145,7 +148,8 @@ ccl::event host_communicator::alltoall_impl(const buffer_type* send_buf, ccl::native_type_info<buffer_type>::dtype, attr, comm_impl.get(), - nullptr); + nullptr, + deps); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } @@ -203,7 +207,8 @@ ccl::event host_communicator::alltoallv_impl(const buffer_type* send_buf, ccl::native_type_info<buffer_type>::dtype, attr, comm_impl.get(), - nullptr); + nullptr, + deps); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } @@ -261,7 +266,8 @@ ccl::event host_communicator::broadcast_impl(buffer_type* buf, root, attr, comm_impl.get(), - nullptr); + nullptr, + deps); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } @@ -296,7 +302,8 @@ ccl::event host_communicator::reduce_impl(const buffer_type* send_buf, root, attr, comm_impl.get(), - nullptr); + nullptr, + deps); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } @@ -331,7 +338,8 @@ ccl::event host_communicator::reduce_scatter_impl(const buffer_type* 
send_buf,
                                                  reduction,
                                                  attr,
                                                  comm_impl.get(),
-                                                 nullptr);
+                                                 nullptr,
+                                                 deps);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
 
@@ -376,7 +384,8 @@ ccl::event host_communicator::sparse_allreduce_impl(const index_buffer_type* sen
                                                     reduction,
                                                     attr,
                                                     comm_impl.get(),
-                                                    nullptr);
+                                                    nullptr,
+                                                    deps);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
diff --git a/src/common/comm/l0/communicator/base_communicator.hpp b/src/common/comm/l0/communicator/base_communicator.hpp
index 8d03784c9..ef315d4a1 100644
--- a/src/common/comm/l0/communicator/base_communicator.hpp
+++ b/src/common/comm/l0/communicator/base_communicator.hpp
@@ -19,7 +19,6 @@
 #include "common/comm/comm_interface.hpp"
 //TODO #include "sched/gpu_sched.hpp"
 #include "common/comm/l0/comm_context_id.hpp"
-#include "common/comm/l0/modules/kernel_params.hpp"
 
 struct base_communicator : public ccl::communicator_interface {
     //TODO using group_comm_storage = native::specific_indexed_device_storage;
diff --git a/src/common/comm/l0/communicator/device_group/device_a2a_communicator.cpp b/src/common/comm/l0/communicator/device_group/device_a2a_communicator.cpp
index 286ef7802..42b8afe38 100644
--- a/src/common/comm/l0/communicator/device_group/device_a2a_communicator.cpp
+++ b/src/common/comm/l0/communicator/device_group/device_a2a_communicator.cpp
@@ -86,8 +86,75 @@ ccl::event device_group_a2a_communicator::allreduce_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::allreduce_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
+    using namespace native;
+
+    static constexpr ccl::group_split_type group_id = base_t::topology_type();
+    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
+    if (!is_ready()) {
+        throw ccl::exception(std::string(
+            "Device communicator for group_id: " + ::to_string(group_id) +
+            " is not ready yet. Not all communicators in the group have been created. Please create them before use"));
+    }
+    int comm_rank = rank();
+    LOG_DEBUG("communicator for device idx: ", get_device_path(), ", rank idx: ", comm_rank);
+
+    //TODO make const!
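+    // note: INDIRECT buffers capture the address of the user pointer (&send_buf)
+    // rather than its current value, so the actual pointer is resolved only when
+    // the schedule entry executes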
+ ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf), + count * ccl::get_datatype_size(dtype), + 0, + ccl_buffer_type::INDIRECT); + ccl_buffer recv_entry_buffer( + &recv_buf, count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT); + + using community_t = typename device_community_container<class_id>::element_type; + community_t community = device_community_impl.get_topology(); + + const auto& in_process_gpu_storage = community->get_devices<ccl_gpu_comm>(); + const auto& virtual_process_gpu_storage = community->get_devices<ccl_virtual_gpu_comm>(); + ; + + device_group_scheduler::schedule_ptr schedule; + + //source for collective operation is real gpu or virtual gpu + auto real_device_it = in_process_gpu_storage.find(comm_rank); + if (real_device_it != in_process_gpu_storage.end()) { + LOG_DEBUG("Invoke: ", real_device_it->second->to_string()); + + /* TODO + + using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, ccl_gpu_comm, group_id>; + + schedule = + ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(*device_community_impl, + real_device_it->second,send_entry_buffer, + recv_entry_buffer, + count, + reduction); + */ + } + else { + auto virtual_device_it = virtual_process_gpu_storage.find(comm_rank); + if (virtual_device_it != virtual_process_gpu_storage.end()) { + LOG_DEBUG("Invoke: ", virtual_device_it->second->to_string()); + /* TODO + + using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, ccl_virtual_gpu_comm, group_id>; + + schedule = + ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(*device_community_impl, + virtual_device_it->second,send_entry_buffer, + recv_entry_buffer, + count, + reduction); + */ + } + } + + //if sched is not ready - send NULL + if (schedule) { + LOG_DEBUG("Device group finalized"); + } + return std::unique_ptr<ccl::event_impl>(new ccl::gpu_event_impl(std::move(schedule))); } /* alltoall */ diff --git a/src/common/comm/l0/communicator/device_group/device_a2a_communicator_impl.hpp b/src/common/comm/l0/communicator/device_group/device_a2a_communicator_impl.hpp index c93cd6472..f509c5d4e 100644 --- a/src/common/comm/l0/communicator/device_group/device_a2a_communicator_impl.hpp +++ b/src/common/comm/l0/communicator/device_group/device_a2a_communicator_impl.hpp @@ -87,75 +87,14 @@ ccl::event device_group_a2a_communicator::allreduce_impl( const ccl::stream::impl_value_t& stream, const ccl::allreduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { - using namespace native; - - static constexpr ccl::group_split_type group_id = base_t::topology_type(); - static constexpr ccl::device_topology_type class_id = base_t::topology_class(); - if (!is_ready()) { - throw ccl::exception(std::string( - "Device communicator for group_id: " + ::to_string(group_id) + - " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); - } - int comm_rank = rank(); - LOG_DEBUG("communicator for device idx: ", get_device_path(), ", rank idx: ", comm_rank); - - //TODO make const! 
- ccl_buffer send_entry_buffer(const_cast<buffer_type**>(&send_buf), - count * sizeof(buffer_type), - 0, - ccl_buffer_type::INDIRECT); - ccl_buffer recv_entry_buffer( - &recv_buf, count * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT); - - using community_t = typename device_community_container<class_id>::element_type; - community_t community = device_community_impl.get_topology(); - - const auto& in_process_gpu_storage = community->get_devices<ccl_gpu_comm>(); - const auto& virtual_process_gpu_storage = community->get_devices<ccl_virtual_gpu_comm>(); - ; - - device_group_scheduler::schedule_ptr schedule; - - //source for collective operation is real gpu or virtual gpu - auto real_device_it = in_process_gpu_storage.find(comm_rank); - if (real_device_it != in_process_gpu_storage.end()) { - LOG_DEBUG("Invoke: ", real_device_it->second->to_string()); - - /* TODO - - using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, ccl_gpu_comm, group_id>; - - schedule = - ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(*device_community_impl, - real_device_it->second,send_entry_buffer, - recv_entry_buffer, - count, - reduction); - */ - } - else { - auto virtual_device_it = virtual_process_gpu_storage.find(comm_rank); - if (virtual_device_it != virtual_process_gpu_storage.end()) { - LOG_DEBUG("Invoke: ", virtual_device_it->second->to_string()); - /* TODO - - using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, ccl_virtual_gpu_comm, group_id>; - - schedule = - ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(*device_community_impl, - virtual_device_it->second,send_entry_buffer, - recv_entry_buffer, - count, - reduction); - */ - } - } - - //if sched is not ready - send NULL - if (schedule) { - LOG_DEBUG("Device group finalized"); - } - return std::unique_ptr<ccl::event_impl>(new ccl::gpu_event_impl(std::move(schedule))); + return allreduce_impl(static_cast<const void*>(send_buf), + static_cast<void*>(recv_buf), + count, + ccl::native_type_info<buffer_type>::dtype, + reduction, + stream, + attr, + deps); } template <class buffer_type> diff --git a/src/common/comm/l0/communicator/device_group/device_communicator_utils.hpp b/src/common/comm/l0/communicator/device_group/device_communicator_utils.hpp index 668efce1d..93ae547cc 100644 --- a/src/common/comm/l0/communicator/device_group/device_communicator_utils.hpp +++ b/src/common/comm/l0/communicator/device_group/device_communicator_utils.hpp @@ -17,10 +17,9 @@ #include "common/comm/l0/devices/devices_declaration.hpp" #include "common/comm/l0/device_community.hpp" -template <class kernel_params, - ccl::group_split_type group_id, +template <ccl::group_split_type group_id, ccl::device_topology_type class_id, - template <class, class, ccl::group_split_type> + template <class, ccl::group_split_type> class algorithm> struct communication_device_expander { template <class device_t, class... 
Args> @@ -31,7 +30,7 @@ struct communication_device_expander { if (comm_device) { LOG_DEBUG("Invoke: ", comm_device->to_string()); - using gpu_entry = algorithm<kernel_params, device_t, group_id>; + using gpu_entry = algorithm<device_t, group_id>; schedule = ctx->scheduler_impl ->submit_entry<gpu_entry, ccl_sched_add_back, group_id, class_id>( @@ -42,10 +41,9 @@ struct communication_device_expander { std::unique_ptr<ccl_gpu_sched> schedule; }; -template <class kernel_params, - ccl::group_split_type group_id, +template <ccl::group_split_type group_id, ccl::device_topology_type class_id, - template <class, class, ccl::group_split_type> + template <class, ccl::group_split_type> class algorithm, class... Args> std::unique_ptr<ccl::event_impl> do_collective_op( @@ -55,7 +53,7 @@ std::unique_ptr<ccl::event_impl> do_collective_op( typename native::device_community_container<class_id>::element_type community, native::ccl_driver_context_ptr native_context, Args&&... args) { - communication_device_expander<kernel_params, group_id, class_id, algorithm> expander; + communication_device_expander<group_id, class_id, algorithm> expander; ccl_tuple_for_each_args(communication_device, expander, ctx, @@ -68,66 +66,3 @@ std::unique_ptr<ccl::event_impl> do_collective_op( return std::unique_ptr<ccl::event_impl>( new ccl::gpu_shared_event_impl(std::move(expander.schedule))); } - -template <class buffer_type, - ccl::group_split_type group_id, - ccl::device_topology_type class_id, - template <class, class, ccl::group_split_type> - class algorithm, - class... Args> -std::unique_ptr<ccl::event_impl> do_collective_op_reductions( - ccl::reduction reduction, - native::device_variant_t<native::ccl_gpu_comm, native::ccl_virtual_gpu_comm>& - communication_device, - std::shared_ptr<native::device_group_context>& ctx, - typename native::device_community_container<class_id>::element_type community, - native::ccl_driver_context_ptr native_context, - Args&&... 
args) {
-    switch (reduction) {
-        case ccl::reduction::sum:
-            return do_collective_op<
-                kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::sum>,
-                group_id,
-                class_id,
-                algorithm>(
-                communication_device, ctx, community, native_context, std::forward<Args>(args)...);
-            break;
-        case ccl::reduction::prod:
-            return do_collective_op<
-                kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::prod>,
-                group_id,
-                class_id,
-                algorithm>(
-                communication_device, ctx, community, native_context, std::forward<Args>(args)...);
-            break;
-        case ccl::reduction::min:
-            return do_collective_op<
-                kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::min>,
-                group_id,
-                class_id,
-                algorithm>(
-                communication_device, ctx, community, native_context, std::forward<Args>(args)...);
-            break;
-        case ccl::reduction::max:
-            return do_collective_op<
-                kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::max>,
-                group_id,
-                class_id,
-                algorithm>(
-                communication_device, ctx, community, native_context, std::forward<Args>(args)...);
-            break;
-        // TODO: make support of custom reduction in *.cl
-        //  case ccl::reduction::custom:
-        //      return do_collective_op<kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::custom>,
-        //                              group_id, class_id, algorithm>(
-        //          communication_device,
-        //          ctx,
-        //          community,
-        //          native_context,
-        //          std::forward<Args>(args)...);
-        //      break;
-        default:
-            throw std::runtime_error(std::string(__PRETTY_FUNCTION__) +
-                                     "Obtained reduction by user is incorrect!");
-    }
-}
diff --git a/src/common/comm/l0/communicator/device_group/device_ring_communicator.cpp b/src/common/comm/l0/communicator/device_group/device_ring_communicator.cpp
index 4cff85f0a..95ed26cfe 100644
--- a/src/common/comm/l0/communicator/device_group/device_ring_communicator.cpp
+++ b/src/common/comm/l0/communicator/device_group/device_ring_communicator.cpp
@@ -87,8 +87,49 @@ ccl::event device_group_ring_communicator::allreduce_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::allreduce_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
+    using namespace native;
+
+    static constexpr ccl::group_split_type group_id = base_t::topology_type();
+    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
+
+    if (!is_ready()) {
+        throw ccl::exception(std::string(
+            "Device communicator for group_id: " + ::to_string(group_id) +
+            " is not ready yet. Not all communicators in the group have been created. Please create them before use"));
+    }
+
+    size_t ring_index = 0;
+
+    int comm_rank = rank();
+    LOG_DEBUG("communicator for device idx: ",
+              get_device_path(),
+              ", rank idx: ",
+              comm_rank,
+              ", ring_index: ",
+              ring_index);
+
+    //TODO make const!
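+    // note: dtype and reduction now travel at run time inside coll_param_gpu
+    // (constructed below) instead of being template parameters of the entry,
+    // so a single l0_allreduce_typed_entry instantiation covers all combinations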
+ ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf), + count * ccl::get_datatype_size(dtype), + 0, + ccl_buffer_type::INDIRECT); + ccl_buffer recv_entry_buffer( + &recv_buf, count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT); + + using community_t = typename device_community_container<class_id>::element_type; + community_t community = device_community_impl.get_topology(ring_index); + + const coll_param_gpu params(ccl_coll_allreduce, dtype, reduction); + + return do_collective_op<group_id, class_id, native::l0_allreduce_typed_entry>( + communication_device, + ctx, + community, + this->get_native_context(), + send_entry_buffer, + recv_entry_buffer, + count, + params, + stream); } /* alltoall */ diff --git a/src/common/comm/l0/communicator/device_group/device_ring_communicator_impl.hpp b/src/common/comm/l0/communicator/device_group/device_ring_communicator_impl.hpp index ac8116162..b6dfaea7a 100644 --- a/src/common/comm/l0/communicator/device_group/device_ring_communicator_impl.hpp +++ b/src/common/comm/l0/communicator/device_group/device_ring_communicator_impl.hpp @@ -88,50 +88,14 @@ ccl::event device_group_ring_communicator::allreduce_impl( const ccl::stream::impl_value_t& stream, const ccl::allreduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { - using namespace native; - - static constexpr ccl::group_split_type group_id = base_t::topology_type(); - static constexpr ccl::device_topology_type class_id = base_t::topology_class(); - - if (!is_ready()) { - throw ccl::exception(std::string( - "Device communicator for group_id: " + ::to_string(group_id) + - " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); - } - - size_t ring_index = 0; - - int comm_rank = rank(); - LOG_DEBUG("communicator for device idx: ", - get_device_path(), - ", rank idx: , ring_index: ", - comm_rank, - ring_index); - - //TODO make const! 
-    ccl_buffer send_entry_buffer(const_cast<buffer_type**>(&send_buf),
-                                 count * sizeof(buffer_type),
-                                 0,
-                                 ccl_buffer_type::INDIRECT);
-    ccl_buffer recv_entry_buffer(
-        &recv_buf, count * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT);
-
-    using community_t = typename device_community_container<class_id>::element_type;
-    community_t community = device_community_impl.get_topology(ring_index);
-
-    return do_collective_op_reductions<buffer_type,
-                                       group_id,
-                                       class_id,
-                                       native::l0_allreduce_typed_entry>(reduction,
-                                                                         communication_device,
-                                                                         ctx,
-                                                                         community,
-                                                                         this->get_native_context(),
-                                                                         send_entry_buffer,
-                                                                         recv_entry_buffer,
-                                                                         count,
-                                                                         reduction,
-                                                                         stream);
+    return allreduce_impl(static_cast<const void*>(send_buf),
+                          static_cast<void*>(recv_buf),
+                          count,
+                          ccl::native_type_info<buffer_type>::dtype,
+                          reduction,
+                          stream,
+                          attr,
+                          deps);
 }
 
 template <class buffer_type>
diff --git a/src/common/comm/l0/communicator/process_group/process_a2a_communicator.cpp b/src/common/comm/l0/communicator/process_group/process_a2a_communicator.cpp
index 43bd9fcc0..4f37872e9 100644
--- a/src/common/comm/l0/communicator/process_group/process_a2a_communicator.cpp
+++ b/src/common/comm/l0/communicator/process_group/process_a2a_communicator.cpp
@@ -79,8 +79,127 @@ ccl::event process_a2a_communicator::allreduce_impl(const void* send_buf,
                                                     const ccl::stream::impl_value_t& stream,
                                                     const ccl::allreduce_attr& attr,
                                                     const ccl::vector_class<ccl::event>& deps) {
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
+    using namespace native;
+
+    static constexpr ccl::group_split_type group_id = base_t::topology_type();
+    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
+
+    if (!is_ready()) {
+        throw ccl::exception(std::string(
+            "Device communicator for group_id: " + ::to_string(group_id) +
+            " is not ready yet. Not all communicators in the group have been created. Please create them before use"));
+    }
+
+    int comm_rank = rank();
+    LOG_DEBUG("communicator for device idx: ", get_device_path(), ", rank idx: ", comm_rank);
+
+    //TODO make const!
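+    // the schedule source below is resolved in priority order: IPC-source real
+    // device, IPC-source virtual device, in-process real GPU, then virtual GPU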
+ ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf), + count * ccl::get_datatype_size(dtype), + 0, + ccl_buffer_type::INDIRECT); + ccl_buffer recv_entry_buffer( + &recv_buf, count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT); + + using community_t = typename device_community_container<class_id>::element_type; + community_t community = device_community_impl.get_topology(); + + const auto& in_process_gpu_storage = community->get_devices<ccl_gpu_comm>(); + const auto& virtual_process_gpu_storage = community->get_devices<ccl_virtual_gpu_comm>(); + + auto& ipc_gpu_storage = community->get_devices<ccl_ipc_gpu_comm>(); + (void)ipc_gpu_storage; + auto& in_process_ipc_source_real_gpu_storage = + community->get_devices<ccl_ipc_source_gpu_comm<ccl_gpu_comm>>(); + auto& in_process_ipc_source_virtual_gpu_storage = + community->get_devices<ccl_ipc_source_gpu_comm<ccl_virtual_gpu_comm>>(); + + allied_process_group_scheduler::thread_schedule_ptr schedule; + //source for collective operation is ipc sources, real gpu or virtual gpu + auto ipc_src_real_it = in_process_ipc_source_real_gpu_storage.find(comm_rank); + if (ipc_src_real_it != in_process_ipc_source_real_gpu_storage.end()) { + LOG_DEBUG("Invoke: ", ipc_src_real_it->second->to_string()); + /* + using gpu_allreduce_entry = l0_allreduce_typed_entry<ccl_ipc_source_gpu_comm<ccl_gpu_comm>, + group_id>; + + schedule = + ctx->scheduler_impl->submit_entry_ipc<gpu_allreduce_entry, ccl_sched_add_back>(process_id, + thread_id, + *device_community_impl, + ipc_src_real_it->second, + send_entry_buffer, + recv_entry_buffer, + count, + dtype, + reduction); + */ + } + else { + auto ipc_src_virt_it = in_process_ipc_source_virtual_gpu_storage.find(comm_rank); + if (ipc_src_virt_it != in_process_ipc_source_virtual_gpu_storage.end()) { + LOG_DEBUG("Invoke: ", ipc_src_virt_it->second->to_string()); + /* + using gpu_allreduce_entry = l0_allreduce_typed_entry<ccl_ipc_source_gpu_comm<ccl_virtual_gpu_comm>, + group_id>; + + schedule = + ctx->scheduler_impl->submit_entry_ipc<gpu_allreduce_entry, ccl_sched_add_back>(process_id, + thread_id, + *device_community_impl, + ipc_src_virt_it->second, + send_entry_buffer, + recv_entry_buffer, + count, + dtype, + reduction); + */ + } + else { + auto real_device_it = in_process_gpu_storage.find(comm_rank); + if (real_device_it != in_process_gpu_storage.end()) { + LOG_DEBUG("Invoke: ", real_device_it->second->to_string()); + /* + using gpu_allreduce_entry = l0_allreduce_typed_entry<ccl_gpu_comm, group_id>; + + schedule = + ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(process_id, + thread_id, + *device_community_impl, + real_device_it->second,send_entry_buffer, + recv_entry_buffer, + count, + dtype, + reduction); + */ + } + else { + auto virtual_device_it = virtual_process_gpu_storage.find(comm_rank); + if (virtual_device_it != virtual_process_gpu_storage.end()) { + LOG_DEBUG("Invoke: ", virtual_device_it->second->to_string()); + /* + using gpu_allreduce_entry = l0_allreduce_typed_entry<ccl_virtual_gpu_comm, group_id>; + + schedule = + ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(process_id, + thread_id, + *device_community_impl, + virtual_device_it->second,send_entry_buffer, + recv_entry_buffer, + count, + dtype, + reduction); + */ + } + } + } + } + + //if sched is not ready - send NULL + if (schedule) { + LOG_DEBUG("Device group finalized"); + } + return std::unique_ptr<ccl::event_impl>(new ccl::gpu_shared_event_impl(std::move(schedule))); } /* alltoall 
*/ diff --git a/src/common/comm/l0/communicator/process_group/process_a2a_communicator_impl.hpp b/src/common/comm/l0/communicator/process_group/process_a2a_communicator_impl.hpp index 72dc9a11c..3f06af5e6 100644 --- a/src/common/comm/l0/communicator/process_group/process_a2a_communicator_impl.hpp +++ b/src/common/comm/l0/communicator/process_group/process_a2a_communicator_impl.hpp @@ -83,126 +83,14 @@ ccl::event process_a2a_communicator::allreduce_impl(const buffer_type* send_buf, const ccl::stream::impl_value_t& stream, const ccl::allreduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { - using namespace native; - - static constexpr ccl::group_split_type group_id = base_t::topology_type(); - static constexpr ccl::device_topology_type class_id = base_t::topology_class(); - - if (!is_ready()) { - throw ccl::exception(std::string( - "Device communicator for group_id: " + ::to_string(group_id) + - " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); - } - - int comm_rank = rank(); - LOG_DEBUG("communicator for device idx: ", get_device_path(), ", rank idx: ", comm_rank); - - //TODO make const! - ccl_buffer send_entry_buffer(const_cast<buffer_type**>(&send_buf), - count * sizeof(buffer_type), - 0, - ccl_buffer_type::INDIRECT); - ccl_buffer recv_entry_buffer( - &recv_buf, count * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT); - - using community_t = typename device_community_container<class_id>::element_type; - community_t community = device_community_impl.get_topology(); - - const auto& in_process_gpu_storage = community->get_devices<ccl_gpu_comm>(); - const auto& virtual_process_gpu_storage = community->get_devices<ccl_virtual_gpu_comm>(); - - auto& ipc_gpu_storage = community->get_devices<ccl_ipc_gpu_comm>(); - (void)ipc_gpu_storage; - auto& in_process_ipc_source_real_gpu_storage = - community->get_devices<ccl_ipc_source_gpu_comm<ccl_gpu_comm>>(); - auto& in_process_ipc_source_virtual_gpu_storage = - community->get_devices<ccl_ipc_source_gpu_comm<ccl_virtual_gpu_comm>>(); - - allied_process_group_scheduler::thread_schedule_ptr schedule; - //source for collective operation is ipc sources, real gpu or virtual gpu - auto ipc_src_real_it = in_process_ipc_source_real_gpu_storage.find(comm_rank); - if (ipc_src_real_it != in_process_ipc_source_real_gpu_storage.end()) { - LOG_DEBUG("Invoke: ", ipc_src_real_it->second->to_string()); - /* - using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, - ccl_ipc_source_gpu_comm<ccl_gpu_comm>, - group_id>; - - schedule = - ctx->scheduler_impl->submit_entry_ipc<gpu_allreduce_entry, ccl_sched_add_back>(process_id, - thread_id, - *device_community_impl, - ipc_src_real_it->second, - send_entry_buffer, - recv_entry_buffer, - count, - reduction); -*/ - } - else { - auto ipc_src_virt_it = in_process_ipc_source_virtual_gpu_storage.find(comm_rank); - if (ipc_src_virt_it != in_process_ipc_source_virtual_gpu_storage.end()) { - LOG_DEBUG("Invoke: ", ipc_src_virt_it->second->to_string()); - /* - using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, - ccl_ipc_source_gpu_comm<ccl_virtual_gpu_comm>, - group_id>; - - schedule = - ctx->scheduler_impl->submit_entry_ipc<gpu_allreduce_entry, ccl_sched_add_back>(process_id, - thread_id, - *device_community_impl, - ipc_src_virt_it->second, - send_entry_buffer, - recv_entry_buffer, - count, - reduction); -*/ - } - else { - auto real_device_it = in_process_gpu_storage.find(comm_rank); - if (real_device_it != in_process_gpu_storage.end()) { - 
LOG_DEBUG("Invoke: ", real_device_it->second->to_string()); - /* - using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, ccl_gpu_comm, group_id>; - - schedule = - ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(process_id, - thread_id, - *device_community_impl, - real_device_it->second,send_entry_buffer, - recv_entry_buffer, - count, - reduction); -*/ - } - else { - auto virtual_device_it = virtual_process_gpu_storage.find(comm_rank); - if (virtual_device_it != virtual_process_gpu_storage.end()) { - LOG_DEBUG("Invoke: ", virtual_device_it->second->to_string()); - /* - using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, ccl_virtual_gpu_comm, group_id>; - - - schedule = - ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(process_id, - thread_id, - *device_community_impl, - virtual_device_it->second,send_entry_buffer, - recv_entry_buffer, - count, - reduction); - */ - } - } - } - } - - //if sched is not ready - send NULL - if (schedule) { - LOG_DEBUG("Device group finalized"); - } - return std::unique_ptr<ccl::event_impl>(new ccl::gpu_shared_event_impl(std::move(schedule))); + return allreduce_impl(static_cast<const void*>(send_buf), + static_cast<void*>(recv_buf), + count, + ccl::native_type_info<buffer_type>::dtype, + reduction, + stream, + attr, + deps); } template <class buffer_type> diff --git a/src/common/comm/l0/communicator/process_group/process_communicator_utils.hpp b/src/common/comm/l0/communicator/process_group/process_communicator_utils.hpp index ac49c3984..0984c5911 100644 --- a/src/common/comm/l0/communicator/process_group/process_communicator_utils.hpp +++ b/src/common/comm/l0/communicator/process_group/process_communicator_utils.hpp @@ -17,10 +17,9 @@ #include "common/comm/l0/devices/devices_declaration.hpp" #include "common/comm/l0/device_community.hpp" -template <class kernel_params, - ccl::group_split_type group_id, +template <ccl::group_split_type group_id, ccl::device_topology_type class_id, - template <class, class, ccl::group_split_type> + template <class, ccl::group_split_type> class algorithm> struct communication_process_device_expander { template <class device_t, class... Args> @@ -33,7 +32,7 @@ struct communication_process_device_expander { if (comm_device) { LOG_DEBUG("Invoke: ", comm_device->to_string()); - using gpu_entry = algorithm<kernel_params, device_t, group_id>; + using gpu_entry = algorithm<device_t, group_id>; schedule = ctx->scheduler_impl ->submit_entry<gpu_entry, ccl_sched_add_back, group_id, class_id>( @@ -48,13 +47,14 @@ struct communication_process_device_expander { std::shared_ptr<ccl_gpu_sched> schedule; }; -template <class kernel_params, - ccl::group_split_type group_id, +template <ccl::group_split_type group_id, ccl::device_topology_type class_id, - template <class, class, ccl::group_split_type> + template <class, ccl::group_split_type> class algorithm, class... Args> std::unique_ptr<ccl::event_impl> do_collective_op( + // TODO: can we avoid using device_variant here? Because it creates an instantiation of entry for each device which + // makes it slow to compile native::device_variant_t<native::ccl_gpu_comm, native::ccl_virtual_gpu_comm, native::ccl_ipc_source_gpu_comm<native::ccl_gpu_comm>, @@ -70,7 +70,7 @@ std::unique_ptr<ccl::event_impl> do_collective_op( size_t thread_id, native::ccl_driver_context_ptr native_context, Args&&... 
args) { - communication_process_device_expander<kernel_params, group_id, class_id, algorithm> expander; + communication_process_device_expander<group_id, class_id, algorithm> expander; ccl_tuple_for_each_args(communication_device, expander, ctx, @@ -85,97 +85,3 @@ std::unique_ptr<ccl::event_impl> do_collective_op( return std::unique_ptr<ccl::event_impl>( new ccl::gpu_shared_event_impl(std::move(expander.schedule))); } - -template <class buffer_type, - ccl::group_split_type group_id, - ccl::device_topology_type class_id, - template <class, class, ccl::group_split_type> - class algorithm, - class... Args> -std::unique_ptr<ccl::event_impl> do_collective_op_reductions( - ccl::reduction reduction, - native::device_variant_t<native::ccl_gpu_comm, - native::ccl_virtual_gpu_comm, - native::ccl_ipc_source_gpu_comm<native::ccl_gpu_comm>, - native::ccl_ipc_source_gpu_comm<native::ccl_virtual_gpu_comm>, - native::ccl_numa_proxy<native::ccl_gpu_comm>, - native::ccl_numa_proxy<native::ccl_virtual_gpu_comm>, - native::ccl_scaleout_proxy<native::ccl_gpu_comm>, - native::ccl_scaleout_proxy<native::ccl_virtual_gpu_comm>>& - communication_device, - std::shared_ptr<native::process_group_context>& ctx, - typename native::device_community_container<class_id>::element_type community, - size_t process_id, - size_t thread_id, - native::ccl_driver_context_ptr native_context, - Args&&... args) { - switch (reduction) { - case ccl::reduction::sum: - return do_collective_op< - kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::sum>, - group_id, - class_id, - algorithm>(communication_device, - ctx, - community, - process_id, - thread_id, - native_context, - std::forward<Args>(args)...); - break; - case ccl::reduction::prod: - return do_collective_op< - kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::prod>, - group_id, - class_id, - algorithm>(communication_device, - ctx, - community, - process_id, - thread_id, - native_context, - std::forward<Args>(args)...); - break; - case ccl::reduction::min: - return do_collective_op< - kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::min>, - group_id, - class_id, - algorithm>(communication_device, - ctx, - community, - process_id, - thread_id, - native_context, - std::forward<Args>(args)...); - break; - case ccl::reduction::max: - return do_collective_op< - kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::max>, - group_id, - class_id, - algorithm>(communication_device, - ctx, - community, - process_id, - thread_id, - native_context, - std::forward<Args>(args)...); - break; - // TODO: make support of custom reduction in *.cl - // case ccl::reduction::custom: - // return do_collective_op<kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::custom>, - // group_id, class_id, algorithm>( - // communication_device, - // ctx, - // community, - // process_id, - // thread_id, - // native_context, - // std::forward<Args>(args)...); - // break; - default: - throw std::runtime_error(std::string(__PRETTY_FUNCTION__) + - "Obtained reduction by user is incorrect!"); - } -} diff --git a/src/common/comm/l0/communicator/process_group/process_ring_communicator.cpp b/src/common/comm/l0/communicator/process_group/process_ring_communicator.cpp index 5f96958cd..4c79f883e 100644 --- a/src/common/comm/l0/communicator/process_group/process_ring_communicator.cpp +++ b/src/common/comm/l0/communicator/process_group/process_ring_communicator.cpp @@ -62,8 +62,52 @@ ccl::event process_ring_communicator::allgatherv_impl(const void* send_buf, 
const ccl::stream::impl_value_t& stream, const ccl::allgatherv_attr& attr, const ccl::vector_class<ccl::event>& deps) { - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented"); - return {}; + using namespace native; + + static constexpr ccl::group_split_type group_id = base_t::topology_type(); + static constexpr ccl::device_topology_type class_id = base_t::topology_class(); + + if (!is_ready()) { + throw ccl::exception(std::string( + "Device communicator for group_id: " + ::to_string(group_id) + + " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); + } + + int comm_rank = rank(); + size_t ring_index = 0; + LOG_DEBUG("communicator for device idx: ", + get_device_path(), + ", rank idx: ", + comm_rank, + ", ring_index :", + ring_index); + + //TODO make const! + ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf), + send_count * ccl::get_datatype_size(dtype), + 0, + ccl_buffer_type::INDIRECT); + ccl_buffer recv_entry_buffer( + &recv_buf, send_count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT); + + using community_t = typename device_community_container<class_id>::element_type; + community_t community = device_community_impl.get_topology(ring_index); + + const coll_param_gpu params(ccl_coll_allgatherv, dtype); + + return do_collective_op<group_id, class_id, l0_allgatherv_typed_entry>( + communication_device, + ctx, + community, + process_id, + thread_id, + this->get_native_context(), + send_entry_buffer, + send_count, + recv_entry_buffer, + recv_counts.data(), + params, + stream); } ccl::event process_ring_communicator::allgatherv_impl(const void* send_buf, size_t send_count, @@ -87,8 +131,52 @@ ccl::event process_ring_communicator::allreduce_impl(const void* send_buf, const ccl::stream::impl_value_t& stream, const ccl::allreduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented"); - return {}; + using namespace native; + + static constexpr ccl::group_split_type group_id = base_t::topology_type(); + static constexpr ccl::device_topology_type class_id = base_t::topology_class(); + + if (!is_ready()) { + throw ccl::exception(std::string( + "Device communicator for group_id: " + ::to_string(group_id) + + " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); + } + + int comm_rank = rank(); + size_t ring_index = 0; + LOG_DEBUG("communicator for device idx: ", + get_device_path(), + ", rank idx: ", + comm_rank, + ", ring_index: ", + ring_index); + + //TODO make const! 
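// (editor's note) ccl_buffer_type::INDIRECT, used by every rewritten collective in
// this patch, stores the address of the user's pointer (hence const_cast<void**>
// on &send_buf) rather than the pointer value itself, so the buffer is resolved
// only when the schedule entry actually executes; the size argument is in bytes,
// which is why each element count is multiplied by ccl::get_datatype_size(dtype).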
+ ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf), + count * ccl::get_datatype_size(dtype), + 0, + ccl_buffer_type::INDIRECT); + ccl_buffer recv_entry_buffer( + &recv_buf, count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT); + + using community_t = typename device_community_container<class_id>::element_type; + community_t community = device_community_impl.get_topology(ring_index); + + // TODO: we can get dtype value from buffer_type template, no need to introduce a new parameter + const coll_param_gpu params(ccl_coll_allreduce, dtype, reduction); + + return do_collective_op<group_id, class_id, l0_allreduce_typed_entry>( + communication_device, + ctx, + community, + process_id, + thread_id, + this->get_native_context(), + send_entry_buffer, + recv_entry_buffer, + count, + params, + stream); } /* alltoall */ @@ -122,8 +210,55 @@ ccl::event process_ring_communicator::alltoallv_impl(const void* send_buf, const ccl::stream::impl_value_t& stream, const ccl::alltoallv_attr& attr, const ccl::vector_class<ccl::event>& deps) { - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented"); - return {}; + using namespace native; + static constexpr ccl::group_split_type group_id = base_t::topology_type(); + static constexpr ccl::device_topology_type class_id = base_t::topology_class(); + + if (!is_ready()) { + throw ccl::exception(std::string( + "Device communicator for group_id: " + ::to_string(group_id) + + " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); + } + + int comm_rank = rank(); + size_t ring_index = 0; + LOG_DEBUG("communicator for device idx: ", + get_device_path(), + ", rank idx: ", + comm_rank, + ", ring_index :", + ring_index); + size_t total_send_counts = std::accumulate(std::begin(send_counts), std::end(send_counts), 0); + //TODO make const! 
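// (editor's note) std::accumulate deduces its accumulator type from the initial
// value, so the literal 0 just above sums send_counts as int (and likewise for
// recv_counts below). A size_t seed would keep the arithmetic in size_t for very
// large counts - a hedged sketch of the safer form, not what the patch does:
//
//     size_t total_send_counts =
//         std::accumulate(std::begin(send_counts), std::end(send_counts), size_t{0});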
+ ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf), + total_send_counts * ccl::get_datatype_size(dtype), + 0, + ccl_buffer_type::INDIRECT); + + size_t total_recv_counts = std::accumulate(std::begin(recv_counts), std::end(recv_counts), 0); + ccl_buffer recv_entry_buffer( + &recv_buf, total_recv_counts * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT); + + using community_t = typename device_community_container<class_id>::element_type; + community_t community = device_community_impl.get_topology(ring_index); + + const coll_param_gpu params(ccl_coll_alltoallv, dtype); + + return do_collective_op<group_id, class_id, l0_alltoallv_typed_entry>( + communication_device, + ctx, + community, + process_id, + thread_id, + this->get_native_context(), + send_entry_buffer, + send_counts.data(), + total_send_counts, + recv_entry_buffer, + recv_counts.data(), + total_recv_counts, + params, + stream); } ccl::event process_ring_communicator::alltoallv_impl(const ccl::vector_class<void*>& send_buf, const ccl::vector_class<size_t>& send_counts, @@ -146,8 +281,46 @@ ccl::event process_ring_communicator::broadcast_impl(void* buf, const ccl::stream::impl_value_t& stream, const ccl::broadcast_attr& attr, const ccl::vector_class<ccl::event>& deps) { - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented"); - return {}; + using namespace native; + + static constexpr ccl::group_split_type group_id = base_t::topology_type(); + static constexpr ccl::device_topology_type class_id = base_t::topology_class(); + + if (!is_ready()) { + throw ccl::exception(std::string( + "Device communicator for group_id: " + ::to_string(group_id) + + " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); + } + + int comm_rank = rank(); + size_t ring_index = 0; + LOG_DEBUG("communicator for device idx: ", + get_device_path(), + ", rank idx: ", + comm_rank, + ", ring_index :", + ring_index); + + //TODO make const! + ccl_buffer entry_buffer( + &buf, count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT); + + using community_t = typename device_community_container<class_id>::element_type; + community_t community = device_community_impl.get_topology(ring_index); + + const coll_param_gpu params(ccl_coll_bcast, dtype); + + return do_collective_op<group_id, class_id, l0_bcast_typed_entry>(communication_device, + ctx, + community, + process_id, + thread_id, + this->get_native_context(), + entry_buffer, + count, + root, + params, + stream); } /* reduce */ @@ -160,8 +333,52 @@ ccl::event process_ring_communicator::reduce_impl(const void* send_buf, const ccl::stream::impl_value_t& stream, const ccl::reduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented"); - return {}; + using namespace native; + + static constexpr ccl::group_split_type group_id = base_t::topology_type(); + static constexpr ccl::device_topology_type class_id = base_t::topology_class(); + + if (!is_ready()) { + throw ccl::exception(std::string( + "Device communicator for group_id: " + ::to_string(group_id) + + " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); + } + + int comm_rank = rank(); + size_t ring_index = 0; + LOG_DEBUG("communicator for device idx: ", + get_device_path(), + ", rank idx: ", + comm_rank, + ", ring_index :", + ring_index); + + //TODO make const! 
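// (editor's note) The coll_param_gpu objects built throughout these hunks carry
// the collective id, the datatype and, for reducing collectives, the reduction as
// plain runtime values; the l0_*_typed_entry templates select their kernels from
// them. This replaces the old kernel_params/kernel_reduction_params_traits
// template arguments, which forced a separate entry instantiation for every
// <buffer type, reduction> combination.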
+ ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf), + count * ccl::get_datatype_size(dtype), + 0, + ccl_buffer_type::INDIRECT); + ccl_buffer recv_entry_buffer( + &recv_buf, count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT); + + using community_t = typename device_community_container<class_id>::element_type; + community_t community = device_community_impl.get_topology(ring_index); + + const coll_param_gpu params(ccl_coll_reduce, dtype, reduction); + + return do_collective_op<group_id, class_id, l0_reduce_typed_entry>(communication_device, + ctx, + community, + process_id, + thread_id, + this->get_native_context(), + send_entry_buffer, + recv_entry_buffer, + count, + reduction, + root, + params, + stream); } /* reduce_scatter */ @@ -174,8 +391,51 @@ ccl::event process_ring_communicator::reduce_scatter_impl( const ccl::stream::impl_value_t& stream, const ccl::reduce_scatter_attr& attr, const ccl::vector_class<ccl::event>& deps) { - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented"); - return {}; + using namespace native; + + static constexpr ccl::group_split_type group_id = base_t::topology_type(); + static constexpr ccl::device_topology_type class_id = base_t::topology_class(); + + if (!is_ready()) { + throw ccl::exception(std::string( + "Device communicator for group_id: " + ::to_string(group_id) + + " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); + } + + int comm_rank = rank(); + size_t ring_index = 0; + LOG_DEBUG("communicator for device idx: ", + get_device_path(), + ", rank idx: ", + comm_rank, + ", ring_index :", + ring_index); + + //TODO make const! + ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf), + recv_count * ccl::get_datatype_size(dtype), + 0, + ccl_buffer_type::INDIRECT); + ccl_buffer recv_entry_buffer( + &recv_buf, recv_count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT); + + using community_t = typename device_community_container<class_id>::element_type; + community_t community = device_community_impl.get_topology(ring_index); + + const coll_param_gpu params(ccl_coll_reduce_scatter, dtype, reduction); + + return do_collective_op<group_id, class_id, l0_reduce_scatter_typed_entry>( + communication_device, + ctx, + community, + process_id, + thread_id, + this->get_native_context(), + send_entry_buffer, + recv_entry_buffer, + recv_count, + params, + stream); } /* sparse_allreduce */ diff --git a/src/common/comm/l0/communicator/process_group/process_ring_communicator.hpp b/src/common/comm/l0/communicator/process_group/process_ring_communicator.hpp index ecb1c7377..efbe5c801 100644 --- a/src/common/comm/l0/communicator/process_group/process_ring_communicator.hpp +++ b/src/common/comm/l0/communicator/process_group/process_ring_communicator.hpp @@ -15,6 +15,7 @@ */ #pragma once #include "common/comm/l0/communicator/typed_base_communicator.hpp" +#include "common/comm/usm_visitor/usm_visitors.hpp" namespace native { struct process_group_context; diff --git a/src/common/comm/l0/communicator/process_group/process_ring_communicator_impl.hpp b/src/common/comm/l0/communicator/process_group/process_ring_communicator_impl.hpp index b9a59d522..889cb32a3 100644 --- a/src/common/comm/l0/communicator/process_group/process_ring_communicator_impl.hpp +++ b/src/common/comm/l0/communicator/process_group/process_ring_communicator_impl.hpp @@ -33,8 +33,14 @@ ccl::event process_ring_communicator::allgatherv_impl(const buffer_type* send_bu const
ccl::stream::impl_value_t& stream, const ccl::allgatherv_attr& attr, const ccl::vector_class<ccl::event>& deps) { - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented"); - return {}; + return allgatherv_impl(static_cast<const void*>(send_buf), + send_count, + static_cast<void*>(recv_buf), + recv_counts, + ccl::native_type_info<buffer_type>::dtype, + stream, + attr, + deps); } template <class buffer_type> @@ -84,166 +90,14 @@ ccl::event process_ring_communicator::allreduce_impl(const buffer_type* send_buf const ccl::stream::impl_value_t& stream, const ccl::allreduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { - using namespace native; - - static constexpr ccl::group_split_type group_id = base_t::topology_type(); - static constexpr ccl::device_topology_type class_id = base_t::topology_class(); - - if (!is_ready()) { - throw ccl::exception(std::string( - "Device communicator for group_id: " + ::to_string(group_id) + - " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); - } - - int comm_rank = rank(); - size_t ring_index = 0; - LOG_DEBUG("communicator for device idx: ", - get_device_path(), - ", rank idx: ", - comm_rank, - ", ring_index: ", - ring_index); - - //TODO make const! - ccl_buffer send_entry_buffer(const_cast<buffer_type**>(&send_buf), - count * sizeof(buffer_type), - 0, - ccl_buffer_type::INDIRECT); - ccl_buffer recv_entry_buffer( - &recv_buf, count * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT); - - using community_t = typename device_community_container<class_id>::element_type; - community_t community = device_community_impl.get_topology(ring_index); - - return do_collective_op_reductions<buffer_type, group_id, class_id, l0_allreduce_typed_entry>( - reduction, - communication_device, - ctx, - community, - process_id, - thread_id, - this->get_native_context(), - send_entry_buffer, - recv_entry_buffer, - count, - reduction, - stream); - - /* - - const auto& in_process_gpu_storage = community->get_devices<ccl_gpu_comm>(); - const auto& virtual_process_gpu_storage = community->get_devices<ccl_virtual_gpu_comm>(); - - auto& ipc_gpu_storage = community->get_devices<ccl_ipc_gpu_comm>(); - (void)ipc_gpu_storage; - auto& in_process_ipc_source_real_gpu_storage = - community->get_devices<ccl_ipc_source_gpu_comm<ccl_gpu_comm>>(); - auto& in_process_ipc_source_virtual_gpu_storage = - community->get_devices<ccl_ipc_source_gpu_comm<ccl_virtual_gpu_comm>>(); - - allied_process_group_scheduler::thread_schedule_ptr schedule; - //source for collective operation is ipc sources, real gpu or virtual gpu - auto ipc_src_real_it = in_process_ipc_source_real_gpu_storage.find(comm_rank); - if (ipc_src_real_it != in_process_ipc_source_real_gpu_storage.end()) { - LOG_DEBUG("Invoke: ", ipc_src_real_it->second->to_string()); - - using gpu_allreduce_entry = - l0_allreduce_typed_entry<buffer_type, ccl_ipc_source_gpu_comm<ccl_gpu_comm>, group_id>; - - schedule = - ctx->scheduler_impl - ->submit_entry_ipc<gpu_allreduce_entry, ccl_sched_add_back, group_id, class_id>( - process_id, - thread_id, - *community, - ipc_src_real_it->second, - this->get_native_context(), - send_entry_buffer, - recv_entry_buffer, - count, - reduction, - stream); - } - else { - auto ipc_src_virt_it = in_process_ipc_source_virtual_gpu_storage.find(comm_rank); - if (ipc_src_virt_it != in_process_ipc_source_virtual_gpu_storage.end()) { - LOG_DEBUG("Invoke: ", ipc_src_virt_it->second->to_string()); - - using gpu_allreduce_entry = - 
l0_allreduce_typed_entry<buffer_type, - ccl_ipc_source_gpu_comm<ccl_virtual_gpu_comm>, - group_id>; - - schedule = - ctx->scheduler_impl - ->submit_entry_ipc<gpu_allreduce_entry, ccl_sched_add_back, group_id, class_id>( - process_id, - thread_id, - *community, - ipc_src_virt_it->second, - this->get_native_context(), - send_entry_buffer, - recv_entry_buffer, - count, - reduction, - stream); - } - else { - auto real_device_it = in_process_gpu_storage.find(comm_rank); - if (real_device_it != in_process_gpu_storage.end()) { - LOG_DEBUG("Invoke: ", real_device_it->second->to_string()); - - using gpu_allreduce_entry = - l0_allreduce_typed_entry<buffer_type, ccl_gpu_comm, group_id>; - - schedule = - ctx->scheduler_impl->submit_entry_ipc<gpu_allreduce_entry, - ccl_sched_add_back, - group_id, - class_id>(process_id, - thread_id, - *community, - real_device_it->second, - this->get_native_context(), - send_entry_buffer, - recv_entry_buffer, - count, - reduction, - stream); - } - else { - auto virtual_device_it = virtual_process_gpu_storage.find(comm_rank); - if (virtual_device_it != virtual_process_gpu_storage.end()) { - LOG_DEBUG("Invoke: ", virtual_device_it->second->to_string()); - using gpu_allreduce_entry = - l0_allreduce_typed_entry<buffer_type, ccl_virtual_gpu_comm, group_id>; - - schedule = - ctx->scheduler_impl->submit_entry_ipc<gpu_allreduce_entry, - ccl_sched_add_back, - group_id, - class_id>(process_id, - thread_id, - *community, - virtual_device_it->second, - this->get_native_context(), - send_entry_buffer, - recv_entry_buffer, - count, - reduction, - stream); - } - } - } - } - - //if sched is not ready - send NULL - if (schedule) { - LOG_DEBUG("Device group finalized"); - } - return std::unique_ptr<ccl::event_impl>( - new ccl::gpu_shared_event_impl(std::move(schedule))); - */ + return allreduce_impl(static_cast<const void*>(send_buf), + static_cast<void*>(recv_buf), + count, + ccl::native_type_info<buffer_type>::dtype, + reduction, + stream, + attr, + deps); } template <class buffer_type> @@ -313,9 +167,16 @@ ccl::event process_ring_communicator::alltoallv_impl(const buffer_type* send_buf const ccl::stream::impl_value_t& stream, const ccl::alltoallv_attr& attr, const ccl::vector_class<ccl::event>& deps) { - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented"); - return {}; + return alltoallv_impl(static_cast<const void*>(send_buf), + send_counts, + static_cast<void*>(recv_buf), + recv_counts, + ccl::native_type_info<buffer_type>::dtype, + stream, + attr, + deps); } + template <class buffer_type> ccl::event process_ring_communicator::alltoallv_impl( const ccl::vector_class<buffer_type*>& send_buf, @@ -363,8 +224,13 @@ ccl::event process_ring_communicator::broadcast_impl(buffer_type* buf, const ccl::stream::impl_value_t& stream, const ccl::broadcast_attr& attr, const ccl::vector_class<ccl::event>& deps) { - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented"); - return {}; + return broadcast_impl(static_cast<void*>(buf), + count, + ccl::native_type_info<buffer_type>::dtype, + root, + stream, + attr, + deps); } template <class buffer_type> @@ -388,8 +254,15 @@ ccl::event process_ring_communicator::reduce_impl(const buffer_type* send_buf, const ccl::stream::impl_value_t& stream, const ccl::reduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented"); - return {}; + return reduce_impl(static_cast<const void*>(send_buf), + static_cast<void*>(recv_buf), 
+ count, + ccl::native_type_info<buffer_type>::dtype, + reduction, + root, + stream, + attr, + deps); } template <class buffer_type> @@ -414,8 +287,14 @@ ccl::event process_ring_communicator::reduce_scatter_impl( const ccl::stream::impl_value_t& stream, const ccl::reduce_scatter_attr& attr, const ccl::vector_class<ccl::event>& deps) { - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented"); - return {}; + return reduce_scatter_impl(static_cast<const void*>(send_buf), + static_cast<void*>(recv_buf), + recv_count, + ccl::native_type_info<buffer_type>::dtype, + reduction, + stream, + attr, + deps); } template <class buffer_type> ccl::event process_ring_communicator::reduce_scatter_impl( diff --git a/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator.cpp b/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator.cpp index 416d23ecc..52b714ecd 100644 --- a/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator.cpp +++ b/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator.cpp @@ -97,8 +97,78 @@ ccl::event thread_device_group_a2a_communicator::allreduce_impl( const ccl::stream::impl_value_t& stream, const ccl::allreduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented"); - return {}; + using namespace native; + + static constexpr ccl::group_split_type group_id = base_t::topology_type(); + static constexpr ccl::device_topology_type class_id = base_t::topology_class(); + + if (!is_ready()) { + throw ccl::exception(std::string( + "Device communicator for group_id: " + ::to_string(group_id) + + " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); + } + + int comm_rank = rank(); + LOG_DEBUG("communicator for device idx: ", get_device_path(), ", rank idx: ", comm_rank); + + //TODO make const! 
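// (editor's sketch) The typed overloads rewritten above are now thin shims over a
// single void*-based implementation, keyed by ccl::native_type_info<buffer_type>::dtype.
// A standalone, compilable reduction of that pattern follows; datatype, type_info
// and allreduce here are invented stand-ins, not oneCCL symbols:
#include <cstddef>

enum class datatype { int32, float32 };

template <class T> struct type_info; // maps a C++ element type to its runtime tag
template <> struct type_info<int> { static constexpr datatype dtype = datatype::int32; };
template <> struct type_info<float> { static constexpr datatype dtype = datatype::float32; };

// the type-erased worker: compiled once, branches on the tag internally
inline void allreduce(const void*, void*, std::size_t, datatype) { /* runtime dispatch */ }

// per-type shim: the template now contributes only the tag, not a full code path
template <class T>
void allreduce(const T* send, T* recv, std::size_t count) {
    allreduce(static_cast<const void*>(send), static_cast<void*>(recv),
              count, type_info<T>::dtype);
}
// end of sketch; only the void* worker carries real code, cutting template bloat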
+ ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf), + count * ccl::get_datatype_size(dtype), + 0, + ccl_buffer_type::INDIRECT); + ccl_buffer recv_entry_buffer( + &recv_buf, count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT); + + using community_t = typename device_community_container<class_id>::element_type; + community_t community = device_community_impl.get_topology(); + + const auto& in_process_gpu_storage = community->get_devices<ccl_gpu_comm>(); + const auto& virtual_process_gpu_storage = community->get_devices<ccl_virtual_gpu_comm>(); + + auto& ipc_gpu_storage = community->get_devices<ccl_ipc_gpu_comm>(); + (void)ipc_gpu_storage; + + thread_group_scheduler::thread_schedule_ptr schedule; + //source for collective operation is real gpu or virtual gpu + auto real_device_it = in_process_gpu_storage.find(comm_rank); + if (real_device_it != in_process_gpu_storage.end()) { + LOG_DEBUG("Invoke: ", real_device_it->second->to_string()); + /* + using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, ccl_gpu_comm, group_id>; + + schedule = + ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(thread_id, + *ctx->get_thread_topology<thread_device_group_ring_communicator::topology_class()>(thread_id), + real_device_it->second,send_entry_buffer, + recv_entry_buffer, + count, + reduction); + */ + } + else { + auto virtual_device_it = virtual_process_gpu_storage.find(comm_rank); + if (virtual_device_it != virtual_process_gpu_storage.end()) { + LOG_DEBUG("Invoke: ", virtual_device_it->second->to_string()); + /* + using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, ccl_virtual_gpu_comm, group_id>; + + + schedule = + ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(thread_id, + *ctx->get_thread_topology<thread_device_group_ring_communicator::topology_class()>(thread_id), + virtual_device_it->second, send_entry_buffer, + recv_entry_buffer, + count, + reduction); + */ + } + } + + //if sched is not ready - send NULL + if (schedule) { + LOG_DEBUG("Device group finalized"); + } + return std::unique_ptr<ccl::event_impl>(new ccl::gpu_shared_event_impl(std::move(schedule))); } /* alltoall */ diff --git a/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator_impl.hpp b/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator_impl.hpp index ba6f63ef5..54bf28168 100644 --- a/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator_impl.hpp +++ b/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator_impl.hpp @@ -100,78 +100,14 @@ ccl::event thread_device_group_a2a_communicator::allreduce_impl( const ccl::stream::impl_value_t& stream, const ccl::allreduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { - using namespace native; - - static constexpr ccl::group_split_type group_id = base_t::topology_type(); - static constexpr ccl::device_topology_type class_id = base_t::topology_class(); - - if (!is_ready()) { - throw ccl::exception(std::string( - "Device communicator for group_id: " + ::to_string(group_id) + - " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); - } - - int comm_rank = rank(); - LOG_DEBUG("communicator for device idx: ", get_device_path(), ", rank idx: ", comm_rank); - - //TODO make const! 
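// (editor's note) The typed body deleted below is the same real/virtual device
// lookup ladder that thread_a2a_communicator.cpp gains above, with the scheduler
// submissions still commented out in both; the patch relocates the logic from the
// header template into the type-erased .cpp path rather than removing behaviour.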
- ccl_buffer send_entry_buffer(const_cast<buffer_type**>(&send_buf), - count * sizeof(buffer_type), - 0, - ccl_buffer_type::INDIRECT); - ccl_buffer recv_entry_buffer( - &recv_buf, count * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT); - - using community_t = typename device_community_container<class_id>::element_type; - community_t community = device_community_impl.get_topology(); - - const auto& in_process_gpu_storage = community->get_devices<ccl_gpu_comm>(); - const auto& virtual_process_gpu_storage = community->get_devices<ccl_virtual_gpu_comm>(); - - auto& ipc_gpu_storage = community->get_devices<ccl_ipc_gpu_comm>(); - (void)ipc_gpu_storage; - - thread_group_scheduler::thread_schedule_ptr schedule; - //source for collective operation is real gpu or virtual gpu - auto real_device_it = in_process_gpu_storage.find(comm_rank); - if (real_device_it != in_process_gpu_storage.end()) { - LOG_DEBUG("Invoke: ", real_device_it->second->to_string()); - /* - using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, ccl_gpu_comm, group_id>; - - schedule = - ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(thread_id, - *ctx->get_thread_topology<thread_device_group_ring_communicator::topology_class()>(thread_id), - real_device_it->second,send_entry_buffer, - recv_entry_buffer, - count, - reduction); - */ - } - else { - auto virtual_device_it = virtual_process_gpu_storage.find(comm_rank); - if (virtual_device_it != virtual_process_gpu_storage.end()) { - LOG_DEBUG("Invoke: ", virtual_device_it->second->to_string()); - /* - using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, ccl_virtual_gpu_comm, group_id>; - - - schedule = - ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(thread_id, - *ctx->get_thread_topology<thread_device_group_ring_communicator::topology_class()>(thread_id), - virtual_device_it->second, send_entry_buffer, - recv_entry_buffer, - count, - reduction); - */ - } - } - - //if sched is not ready - send NULL - if (schedule) { - LOG_DEBUG("Device group finalized"); - } - return std::unique_ptr<ccl::event_impl>(new ccl::gpu_shared_event_impl(std::move(schedule))); + return allreduce_impl(static_cast<const void*>(send_buf), + static_cast<void*>(recv_buf), + count, + ccl::native_type_info<buffer_type>::dtype, + reduction, + stream, + attr, + deps); } /* alltoall */ diff --git a/src/common/comm/l0/communicator/thread_group/thread_communicator_utils.hpp b/src/common/comm/l0/communicator/thread_group/thread_communicator_utils.hpp index b7ebb7da7..1e8123435 100644 --- a/src/common/comm/l0/communicator/thread_group/thread_communicator_utils.hpp +++ b/src/common/comm/l0/communicator/thread_group/thread_communicator_utils.hpp @@ -17,10 +17,9 @@ #include "common/comm/l0/devices/devices_declaration.hpp" #include "common/comm/l0/device_community.hpp" -template <class kernel_params, - ccl::group_split_type group_id, +template <ccl::group_split_type group_id, ccl::device_topology_type class_id, - template <class, class, ccl::group_split_type> + template <class, ccl::group_split_type> class algorithm> struct communication_thread_device_expander { template <class device_t, class... 
Args> @@ -32,7 +31,7 @@ struct communication_thread_device_expander { if (comm_device) { LOG_DEBUG("Invoke: ", comm_device->to_string()); - using gpu_entry = algorithm<kernel_params, device_t, group_id>; + using gpu_entry = algorithm<device_t, group_id>; schedule = ctx->scheduler_impl ->submit_entry<gpu_entry, ccl_sched_add_back, group_id, class_id>( @@ -43,10 +42,9 @@ struct communication_thread_device_expander { std::shared_ptr<ccl_gpu_sched> schedule; }; -template <class kernel_params, - ccl::group_split_type group_id, +template <ccl::group_split_type group_id, ccl::device_topology_type class_id, - template <class, class, ccl::group_split_type> + template <class, ccl::group_split_type> class algorithm, class... Args> std::unique_ptr<ccl::event_impl> do_collective_op( @@ -57,7 +55,7 @@ std::unique_ptr<ccl::event_impl> do_collective_op( size_t thread_id, native::ccl_driver_context_ptr native_context, Args&&... args) { - communication_thread_device_expander<kernel_params, group_id, class_id, algorithm> expander; + communication_thread_device_expander<group_id, class_id, algorithm> expander; ccl_tuple_for_each_args(communication_device, expander, ctx, @@ -71,84 +69,3 @@ std::unique_ptr<ccl::event_impl> do_collective_op( return std::unique_ptr<ccl::event_impl>( new ccl::gpu_shared_event_impl(std::move(expander.schedule))); } - -template <class buffer_type, - ccl::group_split_type group_id, - ccl::device_topology_type class_id, - template <class, class, ccl::group_split_type> - class algorithm, - class... Args> -std::unique_ptr<ccl::event_impl> do_collective_op_reductions( - ccl::reduction reduction, - native::device_variant_t<native::ccl_gpu_comm, native::ccl_virtual_gpu_comm>& - communication_device, - std::shared_ptr<native::thread_group_context>& ctx, - typename native::device_community_container<class_id>::element_type community, - size_t thread_id, - native::ccl_driver_context_ptr native_context, - Args&&... 
args) { - switch (reduction) { - case ccl::reduction::sum: - return do_collective_op< - kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::sum>, - group_id, - class_id, - algorithm>(communication_device, - ctx, - community, - thread_id, - native_context, - std::forward<Args>(args)...); - break; - case ccl::reduction::prod: - return do_collective_op< - kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::prod>, - group_id, - class_id, - algorithm>(communication_device, - ctx, - community, - thread_id, - native_context, - std::forward<Args>(args)...); - break; - case ccl::reduction::min: - return do_collective_op< - kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::min>, - group_id, - class_id, - algorithm>(communication_device, - ctx, - community, - thread_id, - native_context, - std::forward<Args>(args)...); - break; - case ccl::reduction::max: - return do_collective_op< - kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::max>, - group_id, - class_id, - algorithm>(communication_device, - ctx, - community, - thread_id, - native_context, - std::forward<Args>(args)...); - break; - // TODO: make support of custom reduction in *.cl - // case ccl::reduction::custom: - // return do_collective_op<kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::custom>, - // group_id, class_id, algorithm>( - // communication_device, - // ctx, - // community, - // thread_id, - // native_context, - // std::forward<Args>(args)...); - // break; - default: - throw std::runtime_error(std::string(__PRETTY_FUNCTION__) + - "Obtained reduction by user is incorrect!"); - } -} diff --git a/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.cpp b/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.cpp index a379ac937..a64625156 100644 --- a/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.cpp +++ b/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.cpp @@ -71,10 +71,51 @@ ccl::event thread_device_group_ring_communicator::allgatherv_impl( const ccl::stream::impl_value_t& stream, const ccl::allgatherv_attr& attr, const ccl::vector_class<ccl::event>& deps) { - ccl::event req; - allgather_visitor_t::visit( - req, dtype, send_buf, send_count, recv_buf, recv_counts, stream, attr, deps); - return req; + using namespace native; + + static constexpr ccl::group_split_type group_id = base_t::topology_type(); + static constexpr ccl::device_topology_type class_id = base_t::topology_class(); + + if (!is_ready()) { + throw ccl::exception(std::string( + "Device communicator for group_id: " + ::to_string(group_id) + + " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); + } + + int comm_rank = rank(); + size_t ring_index = 0; + LOG_DEBUG("communicator for device idx: ", + get_device_path(), + ", rank idx: ", + comm_rank, + ", ring_index :", + ring_index); + + //TODO make const! 
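// (editor's sketch) The do_collective_op_reductions helper deleted just above is
// the classic "runtime value -> template instantiation" fan-out: every reduction
// case stamps out a full entry instantiation, multiplied again by buffer and
// device types. A minimal standalone analogue, with all names invented:
#include <stdexcept>

enum class reduction_op { sum, prod, min, max };

template <reduction_op R> int combine(int a, int b); // one instantiation per op
template <> inline int combine<reduction_op::sum>(int a, int b) { return a + b; }
template <> inline int combine<reduction_op::prod>(int a, int b) { return a * b; }
template <> inline int combine<reduction_op::min>(int a, int b) { return b < a ? b : a; }
template <> inline int combine<reduction_op::max>(int a, int b) { return a < b ? b : a; }

// the switch is the only bridge from the runtime value to the compile-time code
inline int dispatch(reduction_op r, int a, int b) {
    switch (r) {
        case reduction_op::sum: return combine<reduction_op::sum>(a, b);
        case reduction_op::prod: return combine<reduction_op::prod>(a, b);
        case reduction_op::min: return combine<reduction_op::min>(a, b);
        case reduction_op::max: return combine<reduction_op::max>(a, b);
        default: throw std::runtime_error("unknown reduction");
    }
}
// the patch can delete this fan-out because coll_param_gpu now carries the
// reduction to kernel selection at run time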
+ ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf), + send_count * ccl::get_datatype_size(dtype), + 0, + ccl_buffer_type::INDIRECT); + ccl_buffer recv_entry_buffer( + &recv_buf, send_count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT); + + using community_t = typename device_community_container<class_id>::element_type; + community_t community = device_community_impl.get_topology(ring_index); + + coll_param_gpu params(ccl_coll_allgatherv, dtype); + + return do_collective_op<group_id, class_id, l0_allgatherv_typed_entry>( + communication_device, + ctx, + community, + thread_id, + this->get_native_context(), + send_entry_buffer, + send_count, + recv_entry_buffer, + recv_counts.data(), + params, + stream); } ccl::event thread_device_group_ring_communicator::allgatherv_impl( const void* send_buf, @@ -100,10 +141,50 @@ ccl::event thread_device_group_ring_communicator::allreduce_impl( const ccl::stream::impl_value_t& stream, const ccl::allreduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { - ccl::event req; - allreduce_visitor_t::visit( - req, dtype, send_buf, recv_buf, count, reduction, stream, attr, deps); - return req; + using namespace native; + + static constexpr ccl::group_split_type group_id = base_t::topology_type(); + static constexpr ccl::device_topology_type class_id = base_t::topology_class(); + + if (!is_ready()) { + throw ccl::exception(std::string( + "Device communicator for group_id: " + ::to_string(group_id) + + " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); + } + + int comm_rank = rank(); + size_t ring_index = 0; + LOG_DEBUG("communicator for device idx: ", + get_device_path(), + ", rank idx: ", + comm_rank, + ", ring_index :", + ring_index); + + //TODO make const! 
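// (editor's note) Every rewritten collective hard-codes ring_index = 0 before
// calling device_community_impl.get_topology(ring_index); the community container
// can hold several rings per topology, but only the first is ever addressed here,
// which appears to be why the index is a local constant rather than a parameter.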
+ ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf), + count * ccl::get_datatype_size(dtype), + 0, + ccl_buffer_type::INDIRECT); + ccl_buffer recv_entry_buffer( + &recv_buf, count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT); + + using community_t = typename device_community_container<class_id>::element_type; + community_t community = device_community_impl.get_topology(ring_index); + + const coll_param_gpu params(ccl_coll_allreduce, dtype, reduction); + + return do_collective_op<group_id, class_id, l0_allreduce_typed_entry>( + communication_device, + ctx, + community, + thread_id, + this->get_native_context(), + send_entry_buffer, + recv_entry_buffer, + count, + params, + stream); } /* alltoall */ @@ -115,9 +196,8 @@ ccl::event thread_device_group_ring_communicator::alltoall_impl( const ccl::stream::impl_value_t& stream, const ccl::alltoall_attr& attr, const ccl::vector_class<ccl::event>& deps) { - ccl::event req; - alltoall_visitor_t::visit(req, dtype, send_buf, recv_buf, count, stream, attr, deps); - return req; + throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented"); + return {}; } ccl::event thread_device_group_ring_communicator::alltoall_impl( const ccl::vector_class<void*>& send_buf, @@ -141,10 +221,55 @@ ccl::event thread_device_group_ring_communicator::alltoallv_impl( const ccl::stream::impl_value_t& stream, const ccl::alltoallv_attr& attr, const ccl::vector_class<ccl::event>& deps) { - ccl::event req; - alltoallv_visitor_t::visit( - req, dtype, send_buf, send_counts, recv_buf, recv_counts, stream, attr, deps); - return req; + using namespace native; + static constexpr ccl::group_split_type group_id = base_t::topology_type(); + static constexpr ccl::device_topology_type class_id = base_t::topology_class(); + + if (!is_ready()) { + throw ccl::exception(std::string( + "Device communicator for group_id: " + ::to_string(group_id) + + " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); + } + + int comm_rank = rank(); + size_t ring_index = 0; + LOG_DEBUG("communicator for device idx: ", + get_device_path(), + ", rank idx: ", + comm_rank, + ", ring_index :", + ring_index); + + size_t total_send_counts = std::accumulate(std::begin(send_counts), std::end(send_counts), 0); + //TODO make const! 
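// (editor's note) A small asymmetry introduced just above: alltoall_impl now
// throws "is not implemented" (its visitor-based path was removed), while
// alltoallv gains a native implementation, so uniform-count alltoall traffic has
// to be expressed as alltoallv with equal per-rank counts.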
+ ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf), + total_send_counts * ccl::get_datatype_size(dtype), + 0, + ccl_buffer_type::INDIRECT); + + size_t total_recv_counts = std::accumulate(std::begin(recv_counts), std::end(recv_counts), 0); + ccl_buffer recv_entry_buffer( + &recv_buf, total_recv_counts * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT); + + using community_t = typename device_community_container<class_id>::element_type; + community_t community = device_community_impl.get_topology(ring_index); + + coll_param_gpu params(ccl_coll_alltoallv, dtype); + + return do_collective_op<group_id, class_id, l0_alltoallv_typed_entry>( + communication_device, + ctx, + community, + thread_id, + this->get_native_context(), + send_entry_buffer, + send_counts.data(), + total_send_counts, + recv_entry_buffer, + recv_counts.data(), + total_recv_counts, + params, + stream); } ccl::event thread_device_group_ring_communicator::alltoallv_impl( const ccl::vector_class<void*>& send_buf, @@ -169,9 +294,45 @@ ccl::event thread_device_group_ring_communicator::broadcast_impl( const ccl::stream::impl_value_t& stream, const ccl::broadcast_attr& attr, const ccl::vector_class<ccl::event>& deps) { - ccl::event req; - broadcast_visitor_t::visit(req, dtype, buf, count, root, stream, attr, deps); - return req; + using namespace native; + + static constexpr ccl::group_split_type group_id = base_t::topology_type(); + static constexpr ccl::device_topology_type class_id = base_t::topology_class(); + + if (!is_ready()) { + throw ccl::exception(std::string( + "Device communicator for group_id: " + ::to_string(group_id) + + " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); + } + + int comm_rank = rank(); + size_t ring_index = 0; + LOG_DEBUG("communicator for device idx: ", + get_device_path(), + ", rank idx: ", + comm_rank, + ", ring_index :", + ring_index); + + //TODO make const! + ccl_buffer entry_buffer( + &buf, count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT); + + using community_t = typename device_community_container<class_id>::element_type; + community_t community = device_community_impl.get_topology(ring_index); + + coll_param_gpu params(ccl_coll_bcast, dtype); + + return do_collective_op<group_id, class_id, l0_bcast_typed_entry>(communication_device, + ctx, + community, + thread_id, + this->get_native_context(), + entry_buffer, + count, + root, + params, + stream); } /* reduce */ @@ -185,10 +346,51 @@ ccl::event thread_device_group_ring_communicator::reduce_impl( const ccl::stream::impl_value_t& stream, const ccl::reduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { - ccl::event req; - reduce_visitor_t::visit( - req, dtype, send_buf, recv_buf, count, reduction, root, stream, attr, deps); - return req; + using namespace native; + + static constexpr ccl::group_split_type group_id = base_t::topology_type(); + static constexpr ccl::device_topology_type class_id = base_t::topology_class(); + + if (!is_ready()) { + throw ccl::exception(std::string( + "Device communicator for group_id: " + ::to_string(group_id) + + " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); + } + + int comm_rank = rank(); + size_t ring_index = 0; + LOG_DEBUG("communicator for device idx: ", + get_device_path(), + ", rank idx: ", + comm_rank, + ", ring_index :", + ring_index); + + //TODO make const! 
+ ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf), + count * ccl::get_datatype_size(dtype), + 0, + ccl_buffer_type::INDIRECT); + ccl_buffer recv_entry_buffer( + &recv_buf, count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT); + + using community_t = typename device_community_container<class_id>::element_type; + community_t community = device_community_impl.get_topology(ring_index); + + coll_param_gpu params(ccl_coll_reduce, dtype, reduction); + + return do_collective_op<group_id, class_id, l0_reduce_typed_entry>(communication_device, + ctx, + community, + thread_id, + this->get_native_context(), + send_entry_buffer, + recv_entry_buffer, + count, + reduction, + root, + params, + stream); } /* reduce_scatter */ @@ -201,10 +403,50 @@ ccl::event thread_device_group_ring_communicator::reduce_scatter_impl( const ccl::stream::impl_value_t& stream, const ccl::reduce_scatter_attr& attr, const ccl::vector_class<ccl::event>& deps) { - ccl::event req; - reduce_scatter_visitor_t::visit( - req, dtype, send_buf, recv_buf, recv_count, reduction, stream, attr, deps); - return req; + using namespace native; + + static constexpr ccl::group_split_type group_id = base_t::topology_type(); + static constexpr ccl::device_topology_type class_id = base_t::topology_class(); + + if (!is_ready()) { + throw ccl::exception(std::string( + "Device communicator for group_id: " + ::to_string(group_id) + + " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); + } + + int comm_rank = rank(); + size_t ring_index = 0; + LOG_DEBUG("communicator for device idx: ", + get_device_path(), + ", rank idx: ", + comm_rank, + ", ring_index :", + ring_index); + + //TODO make const! + ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf), + recv_count * ccl::get_datatype_size(dtype), + 0, + ccl_buffer_type::INDIRECT); + ccl_buffer recv_entry_buffer( + &recv_buf, recv_count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT); + + using community_t = typename device_community_container<class_id>::element_type; + community_t community = device_community_impl.get_topology(ring_index); + + coll_param_gpu params(ccl_coll_reduce_scatter, dtype, reduction); + + return do_collective_op<group_id, class_id, l0_reduce_scatter_typed_entry>( + communication_device, + ctx, + community, + thread_id, + this->get_native_context(), + send_entry_buffer, + recv_entry_buffer, + recv_count, + params, + stream); } /* sparse_allreduce */ diff --git a/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.hpp b/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.hpp index ac8cdc425..04b93440a 100644 --- a/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.hpp +++ b/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.hpp @@ -26,14 +26,7 @@ class thread_device_group_ring_communicator : public typed_base_communicator<thread_device_group_ring_communicator, ccl::group_split_type::process, ccl::device_topology_type::ring, - ccl::gpu_communicator_traits>, - public allgather_usm_visitor<thread_device_group_ring_communicator>, - public allreduce_usm_visitor<thread_device_group_ring_communicator>, - public alltoall_usm_visitor<thread_device_group_ring_communicator>, - public alltoallv_usm_visitor<thread_device_group_ring_communicator>, - public broadcast_usm_visitor<thread_device_group_ring_communicator>, - public reduce_usm_visitor<thread_device_group_ring_communicator>, - public 
reduce_scatter_usm_visitor<thread_device_group_ring_communicator> { + ccl::gpu_communicator_traits> { public: using base_t = typed_base_communicator<thread_device_group_ring_communicator, ccl::group_split_type::process, @@ -46,15 +39,6 @@ class thread_device_group_ring_communicator native::ccl_numa_proxy<native::ccl_gpu_comm>, native::ccl_numa_proxy<native::ccl_virtual_gpu_comm>*/>; - using allgather_visitor_t = allgather_usm_visitor<thread_device_group_ring_communicator>; - using allreduce_visitor_t = allreduce_usm_visitor<thread_device_group_ring_communicator>; - using alltoall_visitor_t = alltoall_usm_visitor<thread_device_group_ring_communicator>; - using alltoallv_visitor_t = alltoallv_usm_visitor<thread_device_group_ring_communicator>; - using broadcast_visitor_t = broadcast_usm_visitor<thread_device_group_ring_communicator>; - using reduce_visitor_t = reduce_usm_visitor<thread_device_group_ring_communicator>; - using reduce_scatter_visitor_t = - reduce_scatter_usm_visitor<thread_device_group_ring_communicator>; - thread_device_group_ring_communicator(ccl::unified_device_type&& device, ccl::unified_context_type&& ctx, size_t thread_idx, diff --git a/src/common/comm/l0/communicator/thread_group/thread_ring_communicator_impl.hpp b/src/common/comm/l0/communicator/thread_group/thread_ring_communicator_impl.hpp index 880ef5b23..c2b646d4f 100644 --- a/src/common/comm/l0/communicator/thread_group/thread_ring_communicator_impl.hpp +++ b/src/common/comm/l0/communicator/thread_group/thread_ring_communicator_impl.hpp @@ -21,6 +21,7 @@ #include "common/comm/l0/devices/devices_declaration.hpp" #include "common/comm/l0/device_community.hpp" #include "common/comm/l0/context/thread_group_ctx.hpp" +// TODO: try to move to cpp file as we now only reference l0_entries from there #include "common/comm/l0/scheduler/thread_group_scheduler.hpp" #include "common/event/impls/gpu_event.hpp" #include "common/comm/l0/communicator/thread_group/thread_communicator_utils.hpp" @@ -75,50 +76,14 @@ ccl::event thread_device_group_ring_communicator::allgatherv_impl( const ccl::stream::impl_value_t& stream, const ccl::allgatherv_attr& attr, const ccl::vector_class<ccl::event>& deps) { - using namespace native; - - static constexpr ccl::group_split_type group_id = base_t::topology_type(); - static constexpr ccl::device_topology_type class_id = base_t::topology_class(); - - if (!is_ready()) { - throw ccl::exception(std::string( - "Device communicator for group_id: " + ::to_string(group_id) + - " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); - } - - int comm_rank = rank(); - size_t ring_index = 0; - LOG_DEBUG("communicator for device idx: ", - get_device_path(), - ", rank idx: ", - comm_rank, - ", ring_index :", - ring_index); - - //TODO make const! 
- ccl_buffer send_entry_buffer(const_cast<buffer_type**>(&send_buf), - send_count * sizeof(buffer_type), - 0, - ccl_buffer_type::INDIRECT); - ccl_buffer recv_entry_buffer( - &recv_buf, send_count * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT); - - using community_t = typename device_community_container<class_id>::element_type; - community_t community = device_community_impl.get_topology(ring_index); - - return do_collective_op<kernel_params_default<buffer_type>, - group_id, - class_id, - l0_allgatherv_typed_entry>(communication_device, - ctx, - community, - thread_id, - this->get_native_context(), - send_entry_buffer, - send_count, - recv_entry_buffer, - recv_counts.data(), - stream); + return allgatherv_impl(static_cast<const void*>(send_buf), + send_count, + static_cast<void*>(recv_buf), + recv_counts, + ccl::native_type_info<buffer_type>::dtype, + stream, + attr, + deps); } /* allreduce */ @@ -131,49 +96,14 @@ ccl::event thread_device_group_ring_communicator::allreduce_impl( const ccl::stream::impl_value_t& stream, const ccl::allreduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { - using namespace native; - - static constexpr ccl::group_split_type group_id = base_t::topology_type(); - static constexpr ccl::device_topology_type class_id = base_t::topology_class(); - - if (!is_ready()) { - throw ccl::exception(std::string( - "Device communicator for group_id: " + ::to_string(group_id) + - " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); - } - - int comm_rank = rank(); - size_t ring_index = 0; - LOG_DEBUG("communicator for device idx: ", - get_device_path(), - ", rank idx: ", - comm_rank, - ", ring_index :", - ring_index); - - //TODO make const! - ccl_buffer send_entry_buffer(const_cast<buffer_type**>(&send_buf), - count * sizeof(buffer_type), - 0, - ccl_buffer_type::INDIRECT); - ccl_buffer recv_entry_buffer( - &recv_buf, count * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT); - - using community_t = typename device_community_container<class_id>::element_type; - community_t community = device_community_impl.get_topology(ring_index); - - return do_collective_op_reductions<buffer_type, group_id, class_id, l0_allreduce_typed_entry>( - reduction, - communication_device, - ctx, - community, - thread_id, - this->get_native_context(), - send_entry_buffer, - recv_entry_buffer, - count, - reduction, - stream); + return allreduce_impl(static_cast<const void*>(send_buf), + static_cast<void*>(recv_buf), + count, + ccl::native_type_info<buffer_type>::dtype, + reduction, + stream, + attr, + deps); } template <class buffer_type> @@ -285,49 +215,14 @@ ccl::event thread_device_group_ring_communicator::alltoallv_impl( const ccl::stream::impl_value_t& stream, const ccl::alltoallv_attr& attr, const ccl::vector_class<ccl::event>& deps) { - using namespace native; - static constexpr ccl::group_split_type group_id = base_t::topology_type(); - static constexpr ccl::device_topology_type class_id = base_t::topology_class(); - - if (!is_ready()) { - throw ccl::exception(std::string( - "Device communicator for group_id: " + ::to_string(group_id) + - " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); - } - - int comm_rank = rank(); - size_t ring_index = 0; - LOG_DEBUG("communicator for device idx: ", - get_device_path(), - ", rank idx: ", - comm_rank, - ", ring_index :", - ring_index); - size_t SIZE = 512; - //TODO make const! 
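// (editor's note) Worth flagging: the hard-coded SIZE = 512 just above sized both
// entry buffers of the deleted typed alltoallv below, regardless of the actual
// counts; the type-erased replacement earlier in this patch instead computes
// total_send_counts/total_recv_counts with std::accumulate and sizes the buffers
// from those.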
- ccl_buffer send_entry_buffer(const_cast<buffer_type**>(&send_buf), - SIZE * sizeof(buffer_type), - 0, - ccl_buffer_type::INDIRECT); - ccl_buffer recv_entry_buffer( - &recv_buf, SIZE * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT); - - using community_t = typename device_community_container<class_id>::element_type; - community_t community = device_community_impl.get_topology(ring_index); - - return do_collective_op<kernel_params_default<buffer_type>, - group_id, - class_id, - l0_alltoallv_typed_entry>(communication_device, - ctx, - community, - thread_id, - this->get_native_context(), - send_entry_buffer, - send_counts.data(), - recv_entry_buffer, - recv_counts.data(), - stream); + return alltoallv_impl(static_cast<const void*>(send_buf), + send_counts, + static_cast<void*>(recv_buf), + recv_counts, + ccl::native_type_info<buffer_type>::dtype, + stream, + attr, + deps); } /* bcast */ @@ -339,44 +234,13 @@ ccl::event thread_device_group_ring_communicator::broadcast_impl( const ccl::stream::impl_value_t& stream, const ccl::broadcast_attr& attr, const ccl::vector_class<ccl::event>& deps) { - using namespace native; - - static constexpr ccl::group_split_type group_id = base_t::topology_type(); - static constexpr ccl::device_topology_type class_id = base_t::topology_class(); - - if (!is_ready()) { - throw ccl::exception(std::string( - "Device communicator for group_id: " + ::to_string(group_id) + - " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); - } - - int comm_rank = rank(); - size_t ring_index = 0; - LOG_DEBUG("communicator for device idx: ", - get_device_path(), - ", rank idx: ", - comm_rank, - ", ring_index :", - ring_index); - - //TODO make const! - ccl_buffer entry_buffer(&buf, count * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT); - - using community_t = typename device_community_container<class_id>::element_type; - community_t community = device_community_impl.get_topology(ring_index); - - return do_collective_op<kernel_params_default<buffer_type>, - group_id, - class_id, - l0_bcast_typed_entry>(communication_device, - ctx, - community, - thread_id, - this->get_native_context(), - entry_buffer, - count, - root, - stream); + return broadcast_impl(static_cast<void*>(buf), + count, + ccl::native_type_info<buffer_type>::dtype, + root, + stream, + attr, + deps); } template <class buffer_type> @@ -402,50 +266,15 @@ ccl::event thread_device_group_ring_communicator::reduce_impl( const ccl::stream::impl_value_t& stream, const ccl::reduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { - using namespace native; - - static constexpr ccl::group_split_type group_id = base_t::topology_type(); - static constexpr ccl::device_topology_type class_id = base_t::topology_class(); - - if (!is_ready()) { - throw ccl::exception(std::string( - "Device communicator for group_id: " + ::to_string(group_id) + - " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); - } - - int comm_rank = rank(); - size_t ring_index = 0; - LOG_DEBUG("communicator for device idx: ", - get_device_path(), - ", rank idx: ", - comm_rank, - ", ring_index :", - ring_index); - - //TODO make const! 
- ccl_buffer send_entry_buffer(const_cast<buffer_type**>(&send_buf), - count * sizeof(buffer_type), - 0, - ccl_buffer_type::INDIRECT); - ccl_buffer recv_entry_buffer( - &recv_buf, count * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT); - - using community_t = typename device_community_container<class_id>::element_type; - community_t community = device_community_impl.get_topology(ring_index); - - return do_collective_op_reductions<buffer_type, group_id, class_id, l0_reduce_typed_entry>( - reduction, - communication_device, - ctx, - community, - thread_id, - this->get_native_context(), - send_entry_buffer, - recv_entry_buffer, - count, - reduction, - root, - stream); + return reduce_impl(static_cast<const void*>(send_buf), + static_cast<void*>(recv_buf), + count, + ccl::native_type_info<buffer_type>::dtype, + reduction, + root, + stream, + attr, + deps); } template <class buffer_type> @@ -472,51 +301,14 @@ ccl::event thread_device_group_ring_communicator::reduce_scatter_impl( const ccl::stream::impl_value_t& stream, const ccl::reduce_scatter_attr& attr, const ccl::vector_class<ccl::event>& deps) { - using namespace native; - - static constexpr ccl::group_split_type group_id = base_t::topology_type(); - static constexpr ccl::device_topology_type class_id = base_t::topology_class(); - - if (!is_ready()) { - throw ccl::exception(std::string( - "Device communicator for group_id: " + ::to_string(group_id) + - " is not ready yet. Not all сommunicators are created in group. Please create them before usage")); - } - - int comm_rank = rank(); - size_t ring_index = 0; - LOG_DEBUG("communicator for device idx: ", - get_device_path(), - ", rank idx: ", - comm_rank, - ", ring_index :", - ring_index); - - //TODO make const! - ccl_buffer send_entry_buffer(const_cast<buffer_type**>(&send_buf), - recv_count * sizeof(buffer_type), - 0, - ccl_buffer_type::INDIRECT); - ccl_buffer recv_entry_buffer( - &recv_buf, recv_count * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT); - - using community_t = typename device_community_container<class_id>::element_type; - community_t community = device_community_impl.get_topology(ring_index); - - return do_collective_op_reductions<buffer_type, - group_id, - class_id, - l0_reduce_scatter_typed_entry>(reduction, - communication_device, - ctx, - community, - thread_id, - this->get_native_context(), - send_entry_buffer, - recv_entry_buffer, - recv_count, - reduction, - stream); + return reduce_scatter_impl(static_cast<const void*>(send_buf), + static_cast<void*>(recv_buf), + recv_count, + ccl::native_type_info<buffer_type>::dtype, + reduction, + stream, + attr, + deps); } template <class buffer_type> diff --git a/src/common/comm/l0/context/base_ctx_actor.hpp b/src/common/comm/l0/context/base_ctx_actor.hpp index f8c773fb7..d7425de3c 100644 --- a/src/common/comm/l0/context/base_ctx_actor.hpp +++ b/src/common/comm/l0/context/base_ctx_actor.hpp @@ -36,6 +36,7 @@ struct actor { actor(key_t actor_id, Function&& f, Args&&... args) : function(std::bind(std::forward<Function>(f), std::forward<Args>(args)..., + this, std::placeholders::_1)), stop(false), processing(&actor<message_type>::run, this), @@ -61,6 +62,17 @@ struct actor { } } +protected: + template <class Derived, class Function, class... Args> + actor(Derived* child, key_t actor_id, Function&& f, Args&&... 
args) + : function(std::bind(std::forward<Function>(f), + std::forward<Args>(args)..., + child, + std::placeholders::_1)), + stop(false), + processing(&actor<message_type>::run, this), + id(actor_id) {} + private: core_t function; storage_t messages; @@ -72,8 +84,8 @@ struct actor { key_t id; virtual void run() { + storage_t to_do_list; while (!stop.load()) { - storage_t to_do_list; { std::unique_lock<std::mutex> lk(mutex); condition.wait(lk, [this]() { @@ -105,7 +117,7 @@ struct subscribed_actor : public actor<message_type> { template <class Function, class... Args> subscribed_actor(key_t actor_id, Function&& f, Args&&... args) - : base_t(actor_id, std::forward<Function>(f), std::forward<Args>(args)..., this) {} + : base_t(this, actor_id, std::forward<Function>(f), std::forward<Args>(args)...) {} virtual ~subscribed_actor() {} diff --git a/src/common/comm/l0/context/base_scaling_ctx.hpp b/src/common/comm/l0/context/base_scaling_ctx.hpp index 97394a071..d855c8219 100644 --- a/src/common/comm/l0/context/base_scaling_ctx.hpp +++ b/src/common/comm/l0/context/base_scaling_ctx.hpp @@ -34,6 +34,9 @@ namespace observer { template <class device_t, class actor_t> using device_thread_map = std::map<device_t*, std::unique_ptr<actor_t>>; +template <class actor_t, class... devices_types> +using multiple_device_thread_map_t = std::tuple<device_thread_map<devices_types, actor_t>...>; + template <class device_t> using proxy_observer_ptr = typename std::add_pointer<device_t>::type; diff --git a/src/common/comm/l0/context/device_group_ctx.cpp b/src/common/comm/l0/context/device_group_ctx.cpp index 9cf32b832..b6746911f 100644 --- a/src/common/comm/l0/context/device_group_ctx.cpp +++ b/src/common/comm/l0/context/device_group_ctx.cpp @@ -16,7 +16,7 @@ #include <sstream> #include "common/comm/l0/devices/devices_declaration.hpp" -#include "common/comm/l0/context/scaling_ctx/numa_ctx_impl.hpp" +#include "common/comm/l0/context/scale/numa/numa_ctx_impl.hpp" #include "common/comm/l0/context/device_group_ctx.hpp" #include "common/comm/l0/context/device_storage.hpp" #include "common/comm/l0/topology/ring/device_group_ring_creator.hpp" diff --git a/src/common/comm/l0/context/device_group_ctx.hpp b/src/common/comm/l0/context/device_group_ctx.hpp index 9c814ee2c..f0fa9ddc0 100644 --- a/src/common/comm/l0/context/device_group_ctx.hpp +++ b/src/common/comm/l0/context/device_group_ctx.hpp @@ -22,7 +22,7 @@ #include "oneapi/ccl/types.hpp" #include "supported_topologies.hpp" #include "common/comm/l0/gpu_comm_attr.hpp" -#include "common/comm/l0/context/scaling_ctx/numa_ctx.hpp" +#include "common/comm/l0/context/scale/numa/numa_ctx.hpp" #include "common/comm/l0/device_community_holder_impl.hpp" class device_group_router; diff --git a/src/common/comm/l0/context/process_group_ctx.cpp b/src/common/comm/l0/context/process_group_ctx.cpp index b178c1071..7c7d00c5d 100644 --- a/src/common/comm/l0/context/process_group_ctx.cpp +++ b/src/common/comm/l0/context/process_group_ctx.cpp @@ -35,10 +35,10 @@ #include "common/comm/l0/scheduler/allied_process_group_scheduler.hpp" #include "common/comm/host_communicator/host_communicator.hpp" -#include "common/comm/l0/context/scaling_ctx/numa_ctx_impl.hpp" -#include "common/comm/l0/context/scaling_ctx/scale_up_ctx_impl.hpp" -#include "common/comm/l0/context/scaling_ctx/scale_out_ctx_impl.hpp" -#include "common/comm/l0/context/scaling_ctx/ipc_ctx_impl.hpp" +#include "common/comm/l0/context/scale/numa/numa_ctx_impl.hpp" +#include "common/comm/l0/context/scale/scale_up/scale_up_ctx_impl.hpp" 
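// ---------------------------------------------------------------------------
// [editor's sketch, not part of the patch] The actor change above moves the
// owner pointer (`this`, or the derived `child`) in front of the message
// placeholder, so every worker callback is invoked as f(args..., actor*,
// message&). A reduced single-threaded model of that binding; mini_actor is a
// hypothetical name, and the real actor queues messages and drains them on its
// own thread in run():

#include <functional>

template <class Message>
struct mini_actor {
    using core_t = std::function<void(Message&)>;

    template <class Function, class... Args>
    mini_actor(Function&& f, Args&&... args)
            // bind the fixed arguments and the owner first, leave one slot
            // (_1) for the message -- the same shape as the patched actor ctor
            : function(std::bind(std::forward<Function>(f),
                                 std::forward<Args>(args)...,
                                 this,
                                 std::placeholders::_1)) {}

    void deliver(Message m) {
        function(m); // run() would pop this from a queue instead
    }

    core_t function;
};

// usage: mini_actor<int> a([](int id, mini_actor<int>* self, int& msg) { /*...*/ }, 42);
//        a.deliver(7); // invokes the lambda as (42, &a, 7)
// ---------------------------------------------------------------------------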
+#include "common/comm/l0/context/scale/scale_out/scale_out_ctx_impl.hpp" +#include "common/comm/l0/context/scale/ipc/ipc_ctx_impl.hpp" namespace native { @@ -151,19 +151,15 @@ bool process_group_context::sync_barrier(const ccl::device_indices_type& thread_ detail::adjacency_matrix p2p_dependency_graph = ally_process_topology.build_p2p_capability_matrix(ss, node_mask); ss << "\nMatrix\n" << p2p_dependency_graph << std::endl; - /* TODO -S- enaled it later - if (!ally_process_topology.build_all(ss, - comm_addr, - thread_group_ctx->get_thread_group_device_indices(), - p2p_dependency_graph)) { - LOG_ERROR(ss.str(), "\nCannot build ipc ring! Abort. Build Log:\n", ss.str()); + + if (!ally_process_topology.build_all( + ss, thread_group_ctx->get_thread_group_device_indices(), p2p_dependency_graph)) { + LOG_ERROR( + ss.str(), "\nCannot build cluster global ring! Abort. Build Log:\n", ss.str()); abort(); } -*/ - if (!ally_process_topology.build_all( - ss, thread_group_ctx->get_thread_group_device_indices(), p2p_dependency_graph)) - LOG_DEBUG("Build IPC ring succesfully. Log:\n", ss.str()); + LOG_DEBUG("Build cluster global ring successfully. Log:\n", ss.str()); } { @@ -183,6 +179,9 @@ bool process_group_context::sync_barrier(const ccl::device_indices_type& thread_ LOG_INFO("initialize IPC context"); get_ipc_ctx().initialize_ctx(ccl_communicator); + LOG_INFO("initialize SCALE-OUT context"); + get_scaleout_ctx().initialize_ctx(ccl_communicator); + // dump topology std::stringstream out; dump_process_topologies(out); diff --git a/src/common/comm/l0/context/process_group_ctx.hpp b/src/common/comm/l0/context/process_group_ctx.hpp index cbe563040..15791f7b2 100644 --- a/src/common/comm/l0/context/process_group_ctx.hpp +++ b/src/common/comm/l0/context/process_group_ctx.hpp @@ -15,12 +15,12 @@ */ #pragma once #include "common/comm/l0/context/thread_group_ctx.hpp" -#include "common/comm/l0/context/scaling_ctx/ipc_ctx.hpp" -#include "common/comm/l0/context/scaling_ctx/numa_ctx.hpp" -#include "common/comm/l0/context/scaling_ctx/scale_up_ctx.hpp" -#include "common/comm/l0/context/scaling_ctx/scale_out_ctx.hpp" +#include "common/comm/l0/context/scale/ipc/ipc_ctx.hpp" +#include "common/comm/l0/context/scale/numa/numa_ctx.hpp" +#include "common/comm/l0/context/scale/scale_up/scale_up_ctx.hpp" +#include "common/comm/l0/context/scale/scale_out/scale_out_ctx.hpp" -#include "common/comm/l0/context/scaling_ctx/scaling_context_dispatcher.hpp" +#include "common/comm/l0/context/scale/scaling_context_dispatcher.hpp" #include "common/comm/l0/topology/topology_declarations.hpp" namespace ccl { class host_communicator; diff --git a/src/common/comm/l0/context/scale/base/base_session.cpp b/src/common/comm/l0/context/scale/base/base_session.cpp new file mode 100644 index 000000000..7e26961bc --- /dev/null +++ b/src/common/comm/l0/context/scale/base/base_session.cpp @@ -0,0 +1,101 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#include "oneapi/ccl/native_device_api/l0/base_impl.hpp" +#include "oneapi/ccl/native_device_api/l0/primitives.hpp" +#include "oneapi/ccl/native_device_api/l0/primitives_impl.hpp" + +#include "common/comm/l0/context/scale/base/base_session.hpp" + +namespace native { +namespace observer { + +void context_descr::init_host_dev_fields() { + host_mem_producer = nullptr; + host_mem_producer_counter = nullptr; + host_consumed_bytes = 0; + host_expected_bytes = 0; + + dev_mem_consumer = nullptr; + dev_mem_consumer_counter = nullptr; + device_produced_bytes = 0; +} + +void context_descr::init(size_t staged_buffer_elem_count, + size_t observer_domain_index, + size_t observer_domain_count, + std::shared_ptr<ccl_context>& context, + ccl_device& device) { + // set all fields by 0 + init_host_dev_fields(); + + /* HOST */ + // create staged mem in host context (Host memory allocation descriptor) + ze_host_mem_alloc_desc_t host_descr = ccl_context::get_default_host_alloc_desc(); + host_descr.flags = ZE_HOST_MEM_ALLOC_FLAG_BIAS_UNCACHED; + + // host mem buf + host_mem_producer = context->template alloc_memory<uint8_t>( + staged_buffer_elem_count * ccl::get_datatype_size(kernel_params.get_datatype()), + /*TODO use page size*/ ccl::get_datatype_size(kernel_params.get_datatype()), + host_descr); + + // create staged mem counter in host context (host mem buf counter) + host_mem_producer_counter = context->template alloc_memory<counter_t>( + 1, /*TODO use page size*/ sizeof(counter_t), host_descr); + + host_expected_bytes = + staged_buffer_elem_count * ccl::get_datatype_size(kernel_params.get_datatype()); + + /* DEVICE */ + ze_device_mem_alloc_desc_t mem_descr = ccl_device::get_default_mem_alloc_desc(); + + // create total aggregated memory in device context + mem_descr.flags = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_UNCACHED; + dev_mem_consumer = device.template alloc_memory_ptr<uint8_t>( + (staged_buffer_elem_count * observer_domain_count) * + ccl::get_datatype_size(kernel_params.get_datatype()), + ccl::get_datatype_size(kernel_params.get_datatype()), + context, + mem_descr); + + // create offset in device context + mem_descr.flags = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED; + producer_aggregated_memory_offset = + device.template alloc_memory_ptr<counter_t>(1, sizeof(counter_t), context, mem_descr); + + // create aggregated counter in device context + dev_mem_consumer_counter = + device.template alloc_memory_ptr<counter_t>(1, sizeof(counter_t), context, mem_descr); + + /* COUNTERS */ + reset_counters(observer_domain_index, observer_domain_count); +} + +void context_descr::reset_counters(size_t observer_domain_index, size_t observer_domain_count) { + counter_t filled_counter_value = 0; + + host_mem_producer_counter->enqueue_write_sync(&filled_counter_value, 1); + + filled_counter_value = observer_domain_index * host_mem_producer->count(); + + producer_aggregated_memory_offset->enqueue_write_sync(&filled_counter_value, 1); + + filled_counter_value = 0; + dev_mem_consumer_counter->enqueue_write_sync(&filled_counter_value, 1); +} + +} // namespace observer +} // namespace native diff --git a/src/common/comm/l0/context/scale/base/base_session.hpp b/src/common/comm/l0/context/scale/base/base_session.hpp new file mode 100644 index 000000000..fea9590b7 --- /dev/null +++ b/src/common/comm/l0/context/scale/base/base_session.hpp @@ -0,0 +1,164 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+#pragma once
+#include <functional>
+#include <string>
+#include <vector>
+
+#include "oneapi/ccl.hpp"
+#include "oneapi/ccl/native_device_api/l0/device.hpp"
+#include "oneapi/ccl/native_device_api/l0/context.hpp"
+
+#include "coll/algorithms/algorithms_enum.hpp"
+#include "common/comm/l0/modules/supported_modules.hpp"
+#include "coll/coll_param.hpp"
+
+namespace native {
+namespace observer {
+using counter_t = uint64_t;
+
+struct producer_description {
+    size_t rank;
+    size_t comm_size;
+    counter_t staged_buffer_elem_count;
+
+    std::shared_ptr<ccl_context> context;
+    ccl_device& device;
+    ccl_device::device_cmd_list immediate_list; //TODO make persistent
+};
+
+struct context_descr {
+    context_descr(const coll_param_gpu& kernel_params) : kernel_params(kernel_params) {}
+
+    using host_mem_ptr_t = ccl_context::host_memory_ptr<uint8_t>;
+    using host_mem_ptr_cntr_t = ccl_context::host_memory_ptr<counter_t>;
+    using dev_mem_ptr_t = ccl_device::device_memory_ptr<uint8_t>;
+    using dev_mem_ptr_cntr_t = ccl_device::device_memory_ptr<counter_t>;
+
+    // produced by kernel
+    host_mem_ptr_t host_mem_producer;
+    host_mem_ptr_cntr_t host_mem_producer_counter;
+    size_t host_consumed_bytes;
+    size_t host_expected_bytes;
+
+    // consumed by kernel
+    dev_mem_ptr_t dev_mem_consumer;
+    dev_mem_ptr_cntr_t dev_mem_consumer_counter;
+    size_t device_produced_bytes;
+
+    // (TODO consider using 'recv_buff' from collective entry)
+    // to reduce copy iterations
+    // TODO: rename
+    dev_mem_ptr_cntr_t producer_aggregated_memory_offset;
+
+    void init_host_dev_fields();
+
+    void init(size_t staged_buffer_elem_count,
+              size_t observer_domain_index,
+              size_t observer_domain_count,
+              std::shared_ptr<ccl_context>& context,
+              ccl_device& device);
+
+    void reset_counters(size_t observer_domain_index, size_t observer_domain_count);
+
+private:
+    // TODO: can we guarantee that this object is not destroyed before invoke_params and
+    // use const& here?
+    coll_param_gpu kernel_params;
+};
+
+template <ccl_coll_type coll_type>
+struct invoke_params {
+    static constexpr ccl_coll_type get_coll_type() {
+        return coll_type;
+    }
+
+    invoke_params(producer_description&& in_producer_params, const coll_param_gpu& kernel_params)
+            : in_params(std::move(in_producer_params)),
+              kernel_params(kernel_params),
+              out_params(kernel_params),
+              valid(false) {}
+
+    void set_out_params(const context_descr& src) {
+        out_params = src;
+        valid = true;
+    }
+
+    bool is_valid() const {
+        return valid;
+    }
+
+    const producer_description& get_producer_params() const {
+        return in_params;
+    }
+
+    producer_description& get_producer_params() {
+        return in_params;
+    }
+
+    const coll_param_gpu& get_kernel_params() const {
+        return kernel_params;
+    }
+
+    const context_descr& get_ctx_params() const {
+        if (!is_valid()) {
+            throw std::runtime_error("observer invocation params are not ready");
+        }
+        return out_params;
+    }
+
+private:
+    producer_description in_params;
+    // TODO: can we guarantee that this object is not destroyed before l0 entry and
+    // use const& here?
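+    // [editor's note, not in the original patch] kernel_params is held by value
+    // here (and in context_descr above): the TODO asks whether a const& would do;
+    // copying is the conservative choice while the relative lifetimes of
+    // invoke_params and the L0 entry remain unverified.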
+ coll_param_gpu kernel_params; + context_descr out_params; + bool valid; +}; + +struct session_key { + using hash_core_t = size_t; + + friend std::ostream& operator<<(std::ostream& out, const session_key& key) { + out << key.to_string(); + return out; + } + + template <class T> + session_key(const T* src) : hash(std::hash<const T*>{}(src)) {} + + bool operator<(const session_key& other) const noexcept { + return hash < other.hash; + } + + std::string to_string() const { + return std::to_string(hash); + } + +private: + hash_core_t hash; +}; + +struct session_notification { + session_notification(void* addr, size_t size_bytes) + : host_src_ptr(addr), + src_size_bytes(size_bytes) {} + void* host_src_ptr; + size_t src_size_bytes; +}; + +} // namespace observer +} // namespace native diff --git a/src/common/comm/l0/context/scale/base/base_session_table.hpp b/src/common/comm/l0/context/scale/base/base_session_table.hpp new file mode 100644 index 000000000..574127381 --- /dev/null +++ b/src/common/comm/l0/context/scale/base/base_session_table.hpp @@ -0,0 +1,76 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#pragma once +#include <atomic> +#include <map> +#include <memory> + +#include "common/comm/l0/context/scale/base/base_session.hpp" +#include "common/comm/l0/modules/supported_modules.hpp" + +namespace native { +namespace observer { + +// session owner, not thread-safe +template <class session_interface> +struct session_table { + using session_key_t = session_key; + using session_interface_t = session_interface; + using session_interface_ptr_t = std::shared_ptr<session_interface_t>; + + template <template <ccl::device_topology_type, class...> class specific_session, + ccl::device_topology_type class_id, + class invoke_params_type> + session_interface_ptr_t create_session(const session_key_t& key, + invoke_params_type& params, + size_t observer_domain_index, + size_t observer_domain_count) { + using specific_session_impl = specific_session<class_id, invoke_params_type>; + + static_assert(std::is_base_of<session_interface_t, specific_session_impl>::value, + "Relationship IS-A `specific_session` for `session_interface_t` failed"); + + auto sess = std::make_shared<specific_session_impl>(params.get_producer_params(), + params.get_kernel_params(), + observer_domain_index, + observer_domain_count, + key); + + params.set_out_params(sess->get_ctx_descr()); + sessions.emplace(key, sess); + + return sess; + } + + size_t get_unique_tag() { + static std::atomic<size_t> tag_counter{ 1 }; + return tag_counter.fetch_add(1); + } + + std::string to_string() const { + std::stringstream ss; + ss << "sessions count: " << sessions.size() << std::endl; + for (const auto& val : sessions) { + ss << "[" << val.first << ", " << reinterpret_cast<void*>(val.second.get()) << "]\n" + << val.second->to_string() << std::endl; + } + return ss.str(); + } + + std::map<session_key_t, session_interface_ptr_t> sessions{}; +}; +} // namespace observer +} //namespace native diff --git 
a/src/common/comm/l0/context/scaling_ctx/ipc_ctx.hpp b/src/common/comm/l0/context/scale/ipc/ipc_ctx.hpp
similarity index 90%
rename from src/common/comm/l0/context/scaling_ctx/ipc_ctx.hpp
rename to src/common/comm/l0/context/scale/ipc/ipc_ctx.hpp
index 3307840a9..fcee3dee6 100644
--- a/src/common/comm/l0/context/scaling_ctx/ipc_ctx.hpp
+++ b/src/common/comm/l0/context/scale/ipc/ipc_ctx.hpp
@@ -22,8 +22,8 @@
 #include <thread>
 #include <vector>
 #include "common/comm/l0/context/base_scaling_ctx.hpp"
-#include "common/comm/l0/context/scaling_ctx/ipc_session_key.hpp"
-#include "common/comm/l0/context/scaling_ctx/ipc_ctx_session.hpp"
+#include "common/comm/l0/context/scale/ipc/ipc_session_key.hpp"
+#include "common/comm/l0/context/scale/ipc/ipc_ctx_session.hpp"
 namespace ccl {
 class host_communicator;
@@ -163,14 +163,20 @@ class ipc_ctx : public observer::base_scaling_ctx<ipc_ctx<Impl, types...>,
             abort();
         }
+        // TODO: WA: destroy all sessions created earlier
+        // (only one session is ever active at a time).
+        // Without this WA we hang in kernels when sessions are reused,
+        // because other sessions may accidentally get the same key.
+        // This works when the GPU cache is enabled, but is invalid without the cache
+        table->sessions.clear();
+
        std::shared_ptr<session> sess;
-        LOG_DEBUG("session_key: ",
-                  session_key.to_string(),
-                  ", current sessions count: ",
-                  table->sessions.size());
        auto session_it = table->sessions.find(session_key);
        if (session_it == table->sessions.end()) {
-            //create new session
+            LOG_DEBUG("create new session, session_key: ",
+                      session_key.to_string(),
+                      ", current sessions count: ",
+                      table->sessions.size());
            const auto& comm_addr =
                observer_ptr->template get_comm_data<ccl::group_split_type::cluster,
                                                     ccl::device_topology_type::ring>();
@@ -184,6 +190,10 @@ class ipc_ctx : public observer::base_scaling_ctx<ipc_ctx<Impl, types...>,
        else {
            //renew existing
            sess = session_it->second;
+            LOG_DEBUG("session reuse: session_key: ",
+                      session_key.to_string(),
+                      ", current sessions count: ",
+                      table->sessions.size());
        }
 
        append_session_for_processing(session_key, sess);
diff --git a/src/common/comm/l0/context/scaling_ctx/ipc_ctx_impl.hpp b/src/common/comm/l0/context/scale/ipc/ipc_ctx_impl.hpp
similarity index 96%
rename from src/common/comm/l0/context/scaling_ctx/ipc_ctx_impl.hpp
rename to src/common/comm/l0/context/scale/ipc/ipc_ctx_impl.hpp
index 673b1b975..35fdafe5d 100644
--- a/src/common/comm/l0/context/scaling_ctx/ipc_ctx_impl.hpp
+++ b/src/common/comm/l0/context/scale/ipc/ipc_ctx_impl.hpp
@@ -14,10 +14,10 @@ limitations under the License.
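// ---------------------------------------------------------------------------
// [editor's sketch, not part of the patch] Why the sessions.clear() workaround
// above can be needed: observer::session_key hashes a raw source pointer
// (std::hash<const T*>), so when an allocation address is reused, a logically
// new operation can produce the key of a stale session. A minimal repro of the
// collision under that assumption; op and key_of are hypothetical names:

#include <cstddef>
#include <functional>

struct op {}; // stand-in for the object whose address seeds the session key

std::size_t key_of(const op* src) {
    return std::hash<const op*>{}(src); // same scheme as observer::session_key
}

bool keys_can_collide() {
    op* first = new op{};
    std::size_t stale_key = key_of(first);
    delete first;
    op* second = new op{}; // the allocator may hand back the same address...
    bool collided = (key_of(second) == stale_key); // ...yielding the stale key
    delete second;
    return collided;
}
// ---------------------------------------------------------------------------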
*/ #pragma once -#include "common/comm/l0/context/scaling_ctx/ipc_ctx.hpp" +#include "common/comm/l0/context/scale/ipc/ipc_ctx.hpp" #include "common/utils/tuple.hpp" -#include "common/comm/l0/context/scaling_ctx/ipc_ctx_session.hpp" +#include "common/comm/l0/context/scale/ipc/ipc_ctx_session.hpp" #include "common/log/log.hpp" #include "common/comm/host_communicator/host_communicator.hpp" #include "common/comm/l0/devices/communication_structs/ipc_client.hpp" @@ -238,16 +238,19 @@ void ipc_ctx<TEMPLATE_DEF_ARG>::listener(ccl_ipc_gpu_comm* listener_device) { { std::unique_lock<std::mutex> lk(delivery_mutex); delivery_condition.wait(lk, [this]() { - return !processing_queue.empty(); + return !processing_queue.empty() || stop.load(); }); sessions_to_execute.splice(sessions_to_execute.end(), processing_queue); } - LOG_DEBUG("Sessions for processing: ", sessions_to_execute.size()); + LOG_DEBUG("Sessions for processing: ", + sessions_to_execute.size(), + " stop flag status: ", + stop.load()); for (auto sess_it = sessions_to_execute.begin(); sess_it != sessions_to_execute.end() and !stop.load();) { - shared_session_ptr sess = *sess_it; + shared_session_ptr_t sess = *sess_it; // try restore IPC handles LOG_DEBUG("process session: ", sess->to_string()); diff --git a/src/common/comm/l0/context/scaling_ctx/ipc_ctx_session.cpp b/src/common/comm/l0/context/scale/ipc/ipc_ctx_session.cpp similarity index 97% rename from src/common/comm/l0/context/scaling_ctx/ipc_ctx_session.cpp rename to src/common/comm/l0/context/scale/ipc/ipc_ctx_session.cpp index caf56e411..baa3b0539 100644 --- a/src/common/comm/l0/context/scaling_ctx/ipc_ctx_session.cpp +++ b/src/common/comm/l0/context/scale/ipc/ipc_ctx_session.cpp @@ -14,8 +14,8 @@ limitations under the License. */ #include <sstream> -#include "common/comm/l0/context/scaling_ctx/ipc_ctx_session.hpp" -#include "common/comm/l0/context/scaling_ctx/ipc_ctx_utils.hpp" +#include "common/comm/l0/context/scale/ipc/ipc_ctx_session.hpp" +#include "common/comm/l0/context/scale/ipc/ipc_ctx_utils.hpp" #include "common/log/log.hpp" #include "common/comm/host_communicator/host_communicator.hpp" @@ -118,6 +118,7 @@ bool session::process(const ccl_ipc_gpu_comm* indexed_ipc_dst_devices, LOG_ERROR("Cannot recover IPC handle by index: ", num_handles, ", error:\n", ex.what()); throw; } + num_handles++; } // handles received diff --git a/src/common/comm/l0/context/scaling_ctx/ipc_ctx_session.hpp b/src/common/comm/l0/context/scale/ipc/ipc_ctx_session.hpp similarity index 68% rename from src/common/comm/l0/context/scaling_ctx/ipc_ctx_session.hpp rename to src/common/comm/l0/context/scale/ipc/ipc_ctx_session.hpp index cf0f98445..73e015f8b 100644 --- a/src/common/comm/l0/context/scaling_ctx/ipc_ctx_session.hpp +++ b/src/common/comm/l0/context/scale/ipc/ipc_ctx_session.hpp @@ -17,8 +17,9 @@ #include <atomic> #include <map> #include <memory> -#include "common/comm/l0/context/scaling_ctx/ipc_ctx_utils.hpp" -#include "common/comm/l0/context/scaling_ctx/ipc_session_key.hpp" +#include "coll/coll_param.hpp" +#include "common/comm/l0/context/scale/ipc/ipc_ctx_utils.hpp" +#include "common/comm/l0/context/scale/ipc/ipc_session_key.hpp" #include "common/comm/l0/modules/supported_modules.hpp" namespace ccl { @@ -77,16 +78,18 @@ class session { std::atomic<bool> finished; }; -using shared_session_ptr = std::shared_ptr<session>; +using shared_session_ptr_t = std::shared_ptr<session>; /* High level session * Contains collective communication data */ -template <ccl_coll_type coll_type, class kernel_params, 
ccl::device_topology_type class_id> +template <ccl_coll_type coll_type, ccl::device_topology_type class_id> struct typed_ipc_session : public session { typed_ipc_session(origin_ipc_memory_container&& ipc_src_memory_handles, - size_t source_ipc_device_rank) - : session(std::move(ipc_src_memory_handles), source_ipc_device_rank) {} + size_t source_ipc_device_rank, + const coll_param_gpu& kernel_params) + : session(std::move(ipc_src_memory_handles), source_ipc_device_rank), + kernel_params(kernel_params) {} void visit(const ccl_ipc_gpu_comm* source, native::supported_device_modules<ipc_dst_device_coll_module>& ipc_modules) override { @@ -99,9 +102,8 @@ struct typed_ipc_session : public session { assert(module_ptr); // get appropriate kernel - auto& kernel = module_ptr->template get_class<typename module_t::main_class>() - .template get<kernel_params>(); - using kernel_t = typename std::decay<decltype(kernel)>::type; + auto& kernel = + module_ptr->template get_class<typename module_t::main_class>().get(kernel_params); // get recovered ipc handles auto data_it = data_to_recover.ipc_memory_storage.find(source); @@ -110,23 +112,11 @@ struct typed_ipc_session : public session { } // bind data - const recovered_handles_storage::restored_ipc_memory_container& ipc_handles = - data_it->second; - typename kernel_t::tmp_recv_buf_arg_type tmp_recv_buf = - reinterpret_cast<typename kernel_t::tmp_recv_buf_arg_type>( - ipc_handles.at(0).get().pointer); - kernel.template set_arg<typename kernel_t::tmp_recv_buf_arg>(tmp_recv_buf); - - typename kernel_t::income_data_flag_arg_type inc = - reinterpret_cast<typename kernel_t::income_data_flag_arg_type>( - ipc_handles.at(1).get().pointer); - kernel.template set_arg<typename kernel_t::income_data_flag_arg>(inc); - - typename kernel_t::ready_to_recv_flag_arg_type ready = - reinterpret_cast<typename kernel_t::ready_to_recv_flag_arg_type>( - ipc_handles.at(2).get().pointer); - kernel.template set_arg<typename kernel_t::ready_to_recv_flag_arg>(ready); + const auto& ipc_handles = data_it->second; + kernel.bind_data(ipc_handles); } + + coll_param_gpu kernel_params; }; // session owner @@ -140,11 +130,10 @@ struct session_table { const std::string& peer_addr, ipc_invoke_params_type&& params, size_t source_device_rank) { - using specific_session = typed_ipc_session<ipc_invoke_params_type::get_coll_type(), - typename ipc_invoke_params_type::kernel_params_t, - class_id>; - auto sess = - std::make_shared<specific_session>(std::move(params.handles), source_device_rank); + using specific_session = + typed_ipc_session<ipc_invoke_params_type::get_coll_type(), class_id>; + auto sess = std::make_shared<specific_session>( + std::move(params.handles), source_device_rank, params.get_kernel_params()); sessions.emplace(key, sess); start_session(sess, client, peer_addr); @@ -152,7 +141,7 @@ struct session_table { } std::string to_string() const; - std::map<session_key_t, shared_session_ptr> sessions{}; + std::map<session_key_t, shared_session_ptr_t> sessions{}; static size_t get_unique_tag(); @@ -162,5 +151,4 @@ struct session_table { const std::string& peer_addr); }; -using shared_session_table_ptr = std::shared_ptr<session_table>; } // namespace native diff --git a/src/common/comm/l0/context/scaling_ctx/ipc_ctx_utils.cpp b/src/common/comm/l0/context/scale/ipc/ipc_ctx_utils.cpp similarity index 96% rename from src/common/comm/l0/context/scaling_ctx/ipc_ctx_utils.cpp rename to src/common/comm/l0/context/scale/ipc/ipc_ctx_utils.cpp index 2699e6b21..0233e7faf 100644 --- 
a/src/common/comm/l0/context/scaling_ctx/ipc_ctx_utils.cpp +++ b/src/common/comm/l0/context/scale/ipc/ipc_ctx_utils.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "common/log/log.hpp" -#include "common/comm/l0/context/scaling_ctx/ipc_ctx_utils.hpp" +#include "common/comm/l0/context/scale/ipc/ipc_ctx_utils.hpp" #include "common/comm/l0/devices/devices_declaration.hpp" namespace native { diff --git a/src/common/comm/l0/context/scaling_ctx/ipc_ctx_utils.hpp b/src/common/comm/l0/context/scale/ipc/ipc_ctx_utils.hpp similarity index 100% rename from src/common/comm/l0/context/scaling_ctx/ipc_ctx_utils.hpp rename to src/common/comm/l0/context/scale/ipc/ipc_ctx_utils.hpp diff --git a/src/common/comm/l0/context/scaling_ctx/ipc_session_key.cpp b/src/common/comm/l0/context/scale/ipc/ipc_session_key.cpp similarity index 92% rename from src/common/comm/l0/context/scaling_ctx/ipc_session_key.cpp rename to src/common/comm/l0/context/scale/ipc/ipc_session_key.cpp index d05cd3dd0..6acabdfa8 100644 --- a/src/common/comm/l0/context/scaling_ctx/ipc_session_key.cpp +++ b/src/common/comm/l0/context/scale/ipc/ipc_session_key.cpp @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "common/comm/l0/context/scaling_ctx/ipc_session_key.hpp" +#include "common/comm/l0/context/scale/ipc/ipc_session_key.hpp" namespace native { diff --git a/src/common/comm/l0/context/scaling_ctx/ipc_session_key.hpp b/src/common/comm/l0/context/scale/ipc/ipc_session_key.hpp similarity index 78% rename from src/common/comm/l0/context/scaling_ctx/ipc_session_key.hpp rename to src/common/comm/l0/context/scale/ipc/ipc_session_key.hpp index 831558bcf..42e36548e 100644 --- a/src/common/comm/l0/context/scaling_ctx/ipc_session_key.hpp +++ b/src/common/comm/l0/context/scale/ipc/ipc_session_key.hpp @@ -20,21 +20,29 @@ #include "oneapi/ccl/native_device_api/l0/device.hpp" #include "coll/algorithms/algorithms_enum.hpp" +#include "coll/coll_param.hpp" namespace native { -template <ccl_coll_type type, class kernel_params> +template <ccl_coll_type type> struct ipc_invoke_params { - using kernel_params_t = kernel_params; - - ipc_invoke_params(std::vector<ccl_device::device_ipc_memory_handle>&& h) - : handles(std::move(h)) {} + ipc_invoke_params(std::vector<ccl_device::device_ipc_memory_handle>&& h, + const coll_param_gpu& params) + : handles(std::move(h)), + params{ params } {} static constexpr ccl_coll_type get_coll_type() { return type; } + const coll_param_gpu& get_kernel_params() const { + return params; + } + std::vector<ccl_device::device_ipc_memory_handle> handles; + // TODO: can we guarantee that this object is not destroyed before l0 entry and + // use const& here? 
+ coll_param_gpu params; }; struct ipc_session_key { diff --git a/src/common/comm/l0/context/scaling_ctx/numa_ctx.hpp b/src/common/comm/l0/context/scale/numa/numa_ctx.hpp similarity index 74% rename from src/common/comm/l0/context/scaling_ctx/numa_ctx.hpp rename to src/common/comm/l0/context/scale/numa/numa_ctx.hpp index 745f04a6c..7a1b30ae0 100644 --- a/src/common/comm/l0/context/scaling_ctx/numa_ctx.hpp +++ b/src/common/comm/l0/context/scale/numa/numa_ctx.hpp @@ -15,8 +15,9 @@ */ #pragma once #include "common/comm/l0/context/base_scaling_ctx.hpp" -#include "common/comm/l0/context/scaling_ctx/observer_session_key.hpp" -#include "common/comm/l0/context/scaling_ctx/observer_ctx_session.hpp" +#include "common/comm/l0/context/scale/base/base_session.hpp" +#include "common/comm/l0/context/scale/base/base_session_table.hpp" +#include "common/comm/l0/context/scale/numa/numa_session.hpp" namespace native { @@ -26,10 +27,12 @@ class ccl_virtual_gpu_comm; template <class device> class ccl_numa_proxy; +#define NUMA_CTX_DEVICE_PROXY_TYPES(observer_type) \ + observer_type<ccl_gpu_comm>, observer_type<ccl_virtual_gpu_comm> + template <class Impl, ccl::device_topology_type... types> class numa_ctx : public observer::base_scaling_ctx<numa_ctx<Impl, types...>, - ccl_numa_proxy<ccl_gpu_comm>, - ccl_numa_proxy<ccl_virtual_gpu_comm>> { + NUMA_CTX_DEVICE_PROXY_TYPES(ccl_numa_proxy)> { public: static_assert(sizeof...(types), "types must be not 0"); using context_impl = Impl; @@ -38,11 +41,14 @@ class numa_ctx : public observer::base_scaling_ctx<numa_ctx<Impl, types...>, using observer_t = ccl_numa_proxy<device_t>; using scaling_ctx_base_t = observer::base_scaling_ctx<numa_ctx<Impl, types...>, - observer_t<ccl_gpu_comm>, - observer_t<ccl_virtual_gpu_comm>>; + NUMA_CTX_DEVICE_PROXY_TYPES(observer_t)>; + + using session_t = observer::numa_session_iface; //TODO: numa_session + using session_ptr_t = std::shared_ptr<session_t>; + using base_session_table_t = observer::session_table<session_t>; + using base_session_table_ptr_t = std::shared_ptr<base_session_table_t>; - using numa_actor = observer::subscribed_actor<std::shared_ptr<observer::session>, - observer::session_notification>; + using numa_actor = observer::subscribed_actor<session_ptr_t, observer::session_notification>; using observable_scale_up_topologies = typename scaling_ctx_base_t::template observable_topologies<types...>; @@ -56,25 +62,25 @@ class numa_ctx : public observer::base_scaling_ctx<numa_ctx<Impl, types...>, // session data template <class NUMA_source_device_t, ccl_coll_type coll_type> struct device_session_data { - std::map<NUMA_source_device_t*, std::shared_ptr<observer::session_table>> source_sessions; + std::map<NUMA_source_device_t*, base_session_table_ptr_t> source_sessions; }; //TODO make table PER thread!!! - template <ccl_coll_type coll_type> - using session_table_t = - std::tuple<device_session_data<observer_t<ccl_gpu_comm>, coll_type>, - device_session_data<observer_t<ccl_virtual_gpu_comm>, coll_type>>; + template <ccl_coll_type coll_type, class... devices_types> + using session_table_t = std::tuple<device_session_data<devices_types, coll_type>...>; template <ccl_coll_type... 
coll_type>
-    using session_table_typed_storage_t = std::tuple<session_table_t<coll_type>...>;
+    using session_table_typed_storage_t =
+        std::tuple<session_table_t<coll_type, NUMA_CTX_DEVICE_PROXY_TYPES(observer_t)>...>;
 
    struct session_table_initializer {
        template <ccl_coll_type coll_type, class device_t>
-        void operator()(session_table_t<coll_type>& table, observer_t<device_t>* observer_ptr) {
+        void operator()(session_table_t<coll_type, NUMA_CTX_DEVICE_PROXY_TYPES(observer_t)>& table,
+                        observer_t<device_t>* observer_ptr) {
            auto& sessions_table =
                ccl_tuple_get<device_session_data<observer_t<device_t>, coll_type>>(table);
            sessions_table.source_sessions.emplace(
-                observer_ptr, std::make_shared<observer::session_table>(observer::session_table{}));
+                observer_ptr, std::make_shared<base_session_table_t>(base_session_table_t{}));
        }
    };
@@ -112,6 +118,11 @@ class numa_ctx : public observer::base_scaling_ctx<numa_ctx<Impl, types...>,
        //Try to find existing session owner for coll type
        auto& sessions_table = ccl_tuple_get<device_session_data<observer_t<device_t>, coll_type>>(
            std::get<coll_type>(collective_sessions));
+
+        // In general, sessions_table.source_sessions.find(observer_ptr) may be reached from multiple threads,
+        // but writes happen only during the wire-up phase, when observers are inserted during topology construction.
+        // Multithreaded access here follows the "multiple readers - no writers" model
+        // and is safe without mutex protection
        auto session_table_it = sessions_table.source_sessions.find(observer_ptr);
        if (session_table_it == sessions_table.source_sessions.end()) {
            std::stringstream ss;
@@ -128,13 +139,13 @@ class numa_ctx : public observer::base_scaling_ctx<numa_ctx<Impl, types...>,
            abort();
        }
 
-        std::shared_ptr<observer::session_table> table = session_table_it->second;
+        base_session_table_ptr_t table = session_table_it->second;
        if (!table) {
            LOG_ERROR("session_key: ", sess_key.to_string(), ", session table is empty.
Abort"); abort(); } - std::shared_ptr<observer::session> sess; + session_ptr_t sess; LOG_DEBUG("session_key: ", sess_key.to_string(), ", current sessions count: ", @@ -142,7 +153,7 @@ class numa_ctx : public observer::base_scaling_ctx<numa_ctx<Impl, types...>, auto session_it = table->sessions.find(sess_key); if (session_it == table->sessions.end()) { //create new session - sess = table->create_session<class_id>( + sess = table->template create_session<observer::numa_session, class_id>( sess_key, param, registered_index, registered_devices_count); } else { @@ -175,10 +186,9 @@ class numa_ctx : public observer::base_scaling_ctx<numa_ctx<Impl, types...>, template <ccl::device_topology_type topology_type, class device_t> void register_observer_impl(size_t rank_addr, observer_t<device_t>* observer_ptr); - using devices_tuple_thread_map = - std::tuple<observer::device_thread_map<observer_t<ccl_gpu_comm>, numa_actor>, - observer::device_thread_map<observer_t<ccl_virtual_gpu_comm>, numa_actor>>; - devices_tuple_thread_map numa_workers; + using specific_device_tuple_thread_map_t = + observer::multiple_device_thread_map_t<numa_actor, NUMA_CTX_DEVICE_PROXY_TYPES(observer_t)>; + specific_device_tuple_thread_map_t numa_workers; template <class device_t> void worker(observer_t<device_t>* device, diff --git a/src/common/comm/l0/context/scaling_ctx/numa_ctx_impl.hpp b/src/common/comm/l0/context/scale/numa/numa_ctx_impl.hpp similarity index 94% rename from src/common/comm/l0/context/scaling_ctx/numa_ctx_impl.hpp rename to src/common/comm/l0/context/scale/numa/numa_ctx_impl.hpp index d3c6fa68b..2e3d3021e 100644 --- a/src/common/comm/l0/context/scaling_ctx/numa_ctx_impl.hpp +++ b/src/common/comm/l0/context/scale/numa/numa_ctx_impl.hpp @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "common/comm/l0/context/scaling_ctx/numa_ctx.hpp" +#include "common/comm/l0/context/scale/numa/numa_ctx.hpp" #include "common/utils/tuple.hpp" #include "common/log/log.hpp" @@ -107,10 +107,10 @@ void numa_ctx<TEMPLATE_DEF_ARG>::worker(observer_t<device_t>* listener_device, size_t partial_chunk_size = 0; // get own device partial chunk data - if ((*sess_it)->produce_data(&partial_chunk, partial_chunk_size)) { + (*sess_it)->produce_data(&partial_chunk, partial_chunk_size); + if (partial_chunk_size > 0) { // notify other actor for data_ready - observer::detail::actor_publisher<std::shared_ptr<observer::session>, - observer::session_notification> + observer::detail::actor_publisher<session_ptr_t, observer::session_notification> visitor; ccl_tuple_for_each_args(numa_workers, visitor, @@ -130,8 +130,9 @@ void numa_ctx<TEMPLATE_DEF_ARG>::worker(observer_t<device_t>* listener_device, actor_index % total_actors_count, (*sess_it)->get_send_tag(), messages); for (auto mess_it = messages.begin(); mess_it != messages.end(); ++mess_it) { - session_finished = (*sess_it)->consume_data( + (*sess_it)->consume_data( 0 /*TODO !!!! 
*/, mess_it->host_src_ptr, mess_it->src_size_bytes); + session_finished = (*sess_it)->is_consumed(); assert(not(session_finished && std::next(mess_it, 1) != messages.end()) && "Session are filled too early"); } diff --git a/src/common/comm/l0/context/scale/numa/numa_session.hpp b/src/common/comm/l0/context/scale/numa/numa_session.hpp new file mode 100644 index 000000000..d7f6f799a --- /dev/null +++ b/src/common/comm/l0/context/scale/numa/numa_session.hpp @@ -0,0 +1,187 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#pragma once + +#include "common/comm/l0/context/scale/base/base_session.hpp" + +namespace native { +namespace observer { + +class numa_session_iface { +public: + numa_session_iface(session_key key) : sess_key(key) {} + virtual ~numa_session_iface() = default; + + size_t get_send_tag() const { + return send_tag; + } + + const session_key& get_session_key() const { + return sess_key; + } + + std::string to_string() const { + std::stringstream ss; + ss << "session key identifier: " << get_session_key(); + return ss.str(); + } + + virtual void prepare(size_t observer_domain_index, + size_t observer_domain_count, + void* type_erased_param) = 0; + + virtual void produce_data(void** out_chunk, size_t& out_chunk_size) = 0; + virtual void consume_data(size_t observer_domain_index, + void* in_chunk, + size_t in_chunk_size) = 0; + virtual bool is_consumed() noexcept = 0; + virtual bool is_produced() noexcept = 0; + +private: + size_t send_tag{}; + session_key sess_key; +}; + +/* High level session + * Contains collective communication data + */ +template <ccl::device_topology_type class_id, class session_invoke_params> +struct numa_session : public numa_session_iface { + using invoke_params_t = session_invoke_params; + using session_key_t = session_key; + + numa_session(producer_description& in_param, + const coll_param_gpu& kernel_params, + size_t observer_domain_index, + size_t observer_domain_count, + const session_key_t& key) + : numa_session_iface(key), + kernel_params(kernel_params), + ctx_descr(kernel_params), + copy_immediate_list(std::move(in_param.immediate_list)) { + ctx_descr.init(in_param.staged_buffer_elem_count, + observer_domain_index, + observer_domain_count, + in_param.context, + in_param.device); + } + + context_descr& get_ctx_descr() { + return ctx_descr; + } + + const coll_param_gpu& get_kernel_params() const { + return kernel_params; + } + + void prepare(size_t observer_domain_index, + size_t observer_domain_count, + void* type_erased_param) override { + auto* out_param = static_cast<invoke_params_t*>(type_erased_param); + ctx_descr.reset_counters(observer_domain_index, observer_domain_count); + + out_param->set_out_params(ctx_descr); + } + + void produce_data(void** out_chunk, size_t& out_chunk_size) override { + size_t old_consumed = get_ctx_descr().host_consumed_bytes; + uint64_t total_produced = *get_ctx_descr().host_mem_producer_counter->get(); + + size_t to_consume = total_produced - old_consumed; + if (to_consume) { + 
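+            // [editor's note, not in the original patch] Protocol recap: the GPU side
+            // advances host_mem_producer_counter as it publishes bytes into
+            // host_mem_producer, while this host thread keeps host_consumed_bytes and
+            // hands out only the freshly produced delta. Because the counter is written
+            // by another agent, the counter read above must not be reordered with the
+            // staged-data access below; that is presumably what the seq_cst fence
+            // answers the "why?" with (an acquire fence would likely suffice).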
//fence + LOG_TRACE(to_string(), + " - bytes produced: ", + total_produced, + ", previously bytes consumed: ", + old_consumed); + std::atomic_thread_fence(std::memory_order::memory_order_seq_cst); // TODO: why? + + // do not read data here! + *out_chunk = + static_cast<void*>(get_ctx_descr().host_mem_producer->get() + old_consumed); + + // update host_consumed_bytes + get_ctx_descr().host_consumed_bytes += to_consume; + } + + // TODO: set logging here + out_chunk_size = to_consume; + } + + void consume_data(size_t observer_domain_index, void* in_chunk, size_t in_chunk_size) override { + /* TODO create event + * ze_event_handle_t mem_event {}; + */ + + auto device_consumer_ready_bytes = get_ctx_descr().dev_mem_consumer_counter->get(); + auto device_produced_bytes = get_ctx_descr().device_produced_bytes; + + // TODO: set logging here + + // copy buffer from host to device + ze_result_t res = zeCommandListAppendMemoryCopy( + copy_immediate_list.get(), + static_cast<void*>(get_ctx_descr().dev_mem_consumer->get() + device_produced_bytes), + in_chunk, + in_chunk_size, + /*mem_event*/ nullptr, + 0, + nullptr); + if (res != ZE_RESULT_SUCCESS) { + throw std::runtime_error( + std::string( + "cannot append copy NUMA host to device memory for partial result, error: ") + + native::to_string(res)); + } + device_produced_bytes += in_chunk_size; + get_ctx_descr().device_produced_bytes = device_produced_bytes; + + // TODO: set logging here + // copy size from host to device + res = zeCommandListAppendMemoryCopy(copy_immediate_list.get(), + device_consumer_ready_bytes, + &device_produced_bytes, + sizeof(device_produced_bytes), + nullptr, + 0, + /*&mem_event*/ nullptr); + if (res != ZE_RESULT_SUCCESS) { + throw std::runtime_error( + std::string( + "cannot append copy NUMA host to device memory for ready bytes, error: ") + + native::to_string(res)); + } + } + + bool is_consumed() noexcept override { + return (get_ctx_descr().device_produced_bytes * + ccl::get_datatype_size(get_kernel_params().get_datatype())) == + get_ctx_descr().host_consumed_bytes; + } + + bool is_produced() noexcept override { + return get_ctx_descr().host_expected_bytes == get_ctx_descr().host_consumed_bytes; + } + +private: + coll_param_gpu kernel_params; + context_descr ctx_descr; + ccl_device::device_cmd_list copy_immediate_list; +}; + +} // namespace observer +} // namespace native diff --git a/src/common/comm/l0/context/scale/scale_out/scale_out_ctx.hpp b/src/common/comm/l0/context/scale/scale_out/scale_out_ctx.hpp new file mode 100644 index 000000000..7d1302c33 --- /dev/null +++ b/src/common/comm/l0/context/scale/scale_out/scale_out_ctx.hpp @@ -0,0 +1,214 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#pragma once +#include "common/comm/l0/context/base_scaling_ctx.hpp" +#include "common/comm/l0/context/scale/base/base_session.hpp" +#include "common/comm/l0/context/scale/scale_out/scale_out_session.hpp" +#include "common/comm/l0/context/scale/base/base_session_table.hpp" + +namespace ccl { +class host_communicator; +} + +namespace native { + +class ccl_gpu_comm; +class ccl_virtual_gpu_comm; + +template <class device> +class ccl_scaleout_proxy; + +template <class device> +class ccl_gpu_scaleup_proxy; + +template <class device> +class ccl_numa_proxy; + +#define SCALE_OUT_CTX_DEVICE_PROXY_TYPES(observer_type) \ + observer_type<ccl_gpu_comm>, observer_type<ccl_virtual_gpu_comm>, \ + observer_type<ccl_numa_proxy<ccl_gpu_comm>>, \ + observer_type<ccl_numa_proxy<ccl_virtual_gpu_comm>>, \ + observer_type<ccl_gpu_scaleup_proxy<ccl_gpu_comm>>, \ + observer_type<ccl_gpu_scaleup_proxy<ccl_virtual_gpu_comm>>, \ + observer_type<ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_gpu_comm>>>, \ + observer_type<ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_virtual_gpu_comm>>> + +template <class Impl, ccl::device_topology_type... types> +class scale_out_ctx + : public observer::base_scaling_ctx<scale_out_ctx<Impl, types...>, + SCALE_OUT_CTX_DEVICE_PROXY_TYPES(ccl_scaleout_proxy)> { +public: + using context_impl = Impl; + + template <class device_t> + using observer_t = ccl_scaleout_proxy<device_t>; + + using scaling_ctx_base_t = + observer::base_scaling_ctx<scale_out_ctx<Impl, types...>, + SCALE_OUT_CTX_DEVICE_PROXY_TYPES(observer_t)>; + + using session_t = observer::scale_out_session_iface; + using session_ptr_t = std::shared_ptr<session_t>; + using spec_session_table_t = observer::session_table<session_t>; + using spec_session_table_ptr_t = std::shared_ptr<spec_session_table_t>; + + using scaleout_actor = observer::actor<session_ptr_t>; + + using observable_scale_up_topologies = + typename scaling_ctx_base_t::template observable_topologies<types...>; + using indexed_observable_topologies = + typename scaling_ctx_base_t::template indexed_observable_topologies<types...>; + + observable_scale_up_topologies observables; + indexed_observable_topologies indexed_observables; + + // session data + template <class scaleout_source_device_t, ccl_coll_type coll_type> + struct device_session_data { + std::map<scaleout_source_device_t*, spec_session_table_ptr_t> source_sessions; + }; + + //TODO make table PER thread!!! + template <ccl_coll_type coll_type, class... devices_types> + using session_table_t = std::tuple<device_session_data<devices_types, coll_type>...>; + + template <ccl_coll_type... 
coll_type> + using session_table_typed_storage_t = + std::tuple<session_table_t<coll_type, SCALE_OUT_CTX_DEVICE_PROXY_TYPES(observer_t)>...>; + + struct session_table_initializer { + template <ccl_coll_type coll_type, class device_t> + void operator()( + session_table_t<coll_type, SCALE_OUT_CTX_DEVICE_PROXY_TYPES(observer_t)>& table, + observer_t<device_t>* observer_ptr) { + auto& sessions_table = + ccl_tuple_get<device_session_data<observer_t<device_t>, coll_type>>(table); + sessions_table.source_sessions.emplace( + observer_ptr, std::make_shared<spec_session_table_t>(spec_session_table_t{})); + } + }; + + session_table_typed_storage_t<CCL_COLL_LIST> collective_sessions; + + void initialize_ctx(std::shared_ptr<ccl::host_communicator> communicator); + + //observer subject interface implementations + template <class device_t, ccl::device_topology_type topology_type> + void attach_ctx_observer(size_t rank_addr, + observer_t<device_t>* observer_ptr, + std::integral_constant<ccl::device_topology_type, topology_type> val) { + register_observer_impl<topology_type>(rank_addr, observer_ptr); + } + + template <class device_t, ccl::device_topology_type class_id, class invoke_params_t> + void invoke_ctx_observer(observer_t<device_t>* observer_ptr, + std::integral_constant<ccl::device_topology_type, class_id> val, + const observer::session_key& sess_key, + invoke_params_t& param) { + // sanity - check registered proxy + observer::container_t<observer_t<device_t>>& container = + scaling_ctx_base_t::template get_types_container<observer_t<device_t>, class_id>( + observables); + + auto it = container.find(observer_ptr); + if (it == container.end()) { + throw std::runtime_error(std::string("ScaleOut Observer is not registered: ") + + observer_ptr->to_string() + + " total count: " + std::to_string(container.size())); + } + size_t registered_index = std::distance(container.begin(), it); + + static constexpr ccl_coll_type coll_type = invoke_params_t::get_coll_type(); + //Try to find existing session owner for coll type + auto& sessions_table = ccl_tuple_get<device_session_data<observer_t<device_t>, coll_type>>( + std::get<coll_type>(collective_sessions)); + auto session_table_it = sessions_table.source_sessions.find(observer_ptr); + if (session_table_it == sessions_table.source_sessions.end()) { + std::stringstream ss; + ss << "sessions count: " << sessions_table.source_sessions.size() << std::endl; + for (const auto& val : sessions_table.source_sessions) { + ss << val.first->to_string() << ", " << val.second->to_string() << std::endl; + } + LOG_ERROR("session_key: ", + sess_key.to_string(), + ", cannot find source session for device: ", + observer_ptr->to_string(), + ". Available keys: ", + ss.str()); + abort(); + } + + auto table = session_table_it->second; + if (!table) { + LOG_ERROR("session_key: ", sess_key.to_string(), ", session table is empty. 
Abort"); + abort(); + } + + session_ptr_t sess; + LOG_DEBUG("session_key: ", + sess_key.to_string(), + ", current sessions count: ", + table->sessions.size()); + auto session_it = table->sessions.find(sess_key); + if (session_it == table->sessions.end()) { + //create new session + sess = table->template create_session<observer::scale_out_session, class_id>( + sess_key, param, registered_index, registered_devices_count); + } + else { + //renew existing + sess = session_it->second; + sess->prepare( + registered_index, registered_devices_count, reinterpret_cast<void*>(¶m)); + + //param.reset_counters(registered_index, container.size()); + } + + // notify actor-owner + const auto& thread_map = + ccl_tuple_get<observer::device_thread_map<observer_t<device_t>, scaleout_actor>>( + scaleout_workers); + auto actor_it = thread_map.find(observer_ptr); + if (actor_it == thread_map.end()) { + LOG_ERROR("Unregistered observer: ", + observer_ptr->to_string(), + ", thread_map size: ", + thread_map.size(), + " . Abort"); + abort(); + } + + actor_it->second->start_job(sess); + } + +private: + template <ccl::device_topology_type class_id, class device_t> + void register_observer_impl(size_t rank_addr, observer_t<device_t>* observer_ptr); + + using specific_devices_tuple_thread_map = + observer::multiple_device_thread_map_t<scaleout_actor, + SCALE_OUT_CTX_DEVICE_PROXY_TYPES(observer_t)>; + specific_devices_tuple_thread_map scaleout_workers; + + template <class device_t> + void worker(observer_t<device_t>* device, + scaleout_actor* actor_ptr, + typename scaleout_actor::storage_t& todo_list); + size_t registered_devices_count{}; + + std::shared_ptr<ccl::host_communicator> process_communicator; +}; +} // namespace native diff --git a/src/common/comm/l0/context/scale/scale_out/scale_out_ctx_impl.hpp b/src/common/comm/l0/context/scale/scale_out/scale_out_ctx_impl.hpp new file mode 100644 index 000000000..39aed531f --- /dev/null +++ b/src/common/comm/l0/context/scale/scale_out/scale_out_ctx_impl.hpp @@ -0,0 +1,134 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#pragma once +#include "common/comm/l0/context/scale/scale_out/scale_out_ctx.hpp" +#include "common/log/log.hpp" +#include "common/comm/host_communicator/host_communicator.hpp" + +namespace native { + +#define TEMPLATE_DECL_ARG class Impl, ccl::device_topology_type... types +#define TEMPLATE_DEF_ARG Impl, types... 
+
+template <TEMPLATE_DECL_ARG>
+void scale_out_ctx<TEMPLATE_DEF_ARG>::initialize_ctx(
+    std::shared_ptr<ccl::host_communicator> communicator) {
+    process_communicator = communicator;
+
+    LOG_INFO("SCALE-OUT context initialized for mpi rank: (",
+             std::to_string(communicator->rank()),
+             "/",
+             std::to_string(communicator->size()),
+             ")");
+}
+
+// observer_ptr interface implementations
+template <TEMPLATE_DECL_ARG>
+template <ccl::device_topology_type class_id, class device_t>
+void scale_out_ctx<TEMPLATE_DEF_ARG>::register_observer_impl(size_t rank_addr,
+                                                             observer_t<device_t>* observer_ptr) {
+    LOG_INFO("scaleout device rank addr: ",
+             std::to_string(rank_addr),
+             ", device: ",
+             observer_ptr->to_string());
+    observer::container_t<observer_t<device_t>>& container =
+        scaling_ctx_base_t::template get_types_container<observer_t<device_t>, class_id>(
+            observables);
+    auto cont_it = container.find(observer_ptr);
+    if (cont_it == container.end()) {
+        container.insert(observer_ptr);
+        // remember total count
+        registered_devices_count++;
+
+        // prepare session tables
+        session_table_initializer init;
+        ccl_tuple_for_each_args(collective_sessions, init, observer_ptr);
+
+        if (rank_addr == std::numeric_limits<size_t>::max()) {
+            return; //nothing more to do
+        }
+    }
+
+    //reassign with index
+    assert(rank_addr != std::numeric_limits<size_t>::max() &&
+           "Reassignment requires an assigned rank address");
+
+    observer::indexed_container_t<observer_t<device_t>>& indexed_container =
+        scaling_ctx_base_t::template get_types_container<observer_t<device_t>, class_id>(
+            indexed_observables);
+
+    auto indexed_it = indexed_container.find(rank_addr);
+    if (indexed_it != indexed_container.end()) {
+        // collect troubleshooting info
+        std::stringstream ss;
+        for (const auto& indexed_dev : indexed_container) {
+            ss << "rank: " << indexed_dev.first << ", dev: " << indexed_dev.second->to_string()
+               << "\n";
+        }
+        throw std::runtime_error(std::string(__PRETTY_FUNCTION__) +
+                                 "- Cannot reassign rank: " + std::to_string(rank_addr) +
+                                 " for SCALEOUT device:\n" + observer_ptr->to_string() +
+                                 "\nBecause it is already registered:\n" + ss.str());
+    }
+
+    indexed_container.emplace(rank_addr, observer_ptr);
+
+    // start SCALEOUT worker
+    auto& thread_map =
+        ccl_tuple_get<observer::device_thread_map<observer_t<device_t>, scaleout_actor>>(
+            scaleout_workers);
+    {
+        std::unique_ptr<scaleout_actor> new_actor{ new scaleout_actor(
+            rank_addr, &scale_out_ctx<TEMPLATE_DEF_ARG>::worker<device_t>, this, observer_ptr) };
+        thread_map[observer_ptr] = std::move(new_actor);
+    }
+}
+
+template <TEMPLATE_DECL_ARG>
+template <class device_t>
+void scale_out_ctx<TEMPLATE_DEF_ARG>::worker(observer_t<device_t>* listener_device,
+                                             scaleout_actor* actor_ptr,
+                                             typename scaleout_actor::storage_t& todo_list) {
+    LOG_DEBUG("Start SCALEOUT context worker, Listener device: ",
+              listener_device->to_string(),
+              ",\nactor_id: ",
+              actor_ptr->get_id(),
+              ",\ntodo list size: ",
+              todo_list.size());
+
+    // invoke CPU collective on data chunk
+    for (auto sess_it = todo_list.begin(); sess_it != todo_list.end();) {
+        session_ptr_t sess = *sess_it;
+
+        sess->produce_data(process_communicator);
+        ++sess_it;
+    }
+
+    // check CPU collective completion
+    for (auto sess_it = todo_list.begin(); sess_it != todo_list.end();) {
+        (*sess_it)->consume_data(0 /*TODO !!!!
*/, process_communicator); + if ((*sess_it)->is_consumed()) { + sess_it = todo_list.erase(sess_it); + } + else { + ++sess_it; + } + } +} + +#undef TEMPLATE_DECL_ARG +#undef TEMPLATE_DEF_ARG +} // namespace native diff --git a/src/common/comm/l0/context/scale/scale_out/scale_out_session.cpp b/src/common/comm/l0/context/scale/scale_out/scale_out_session.cpp new file mode 100644 index 000000000..5f27b6845 --- /dev/null +++ b/src/common/comm/l0/context/scale/scale_out/scale_out_session.cpp @@ -0,0 +1,58 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#include <sstream> + +#include "common/comm/l0/context/scale/scale_out/scale_out_session.hpp" +#include "common/log/log.hpp" +#include "common/comm/host_communicator/host_communicator.hpp" + +namespace native { +namespace observer { + +std::string scale_out_session_iface::to_string() const { + std::stringstream ss; + ss << "sess: " << reinterpret_cast<const void*>(this); + return ss.str(); +} + +size_t scale_out_session_iface::get_send_tag() const { + return send_tag; +} + +void ccl_worker_adapter::submit_coll_work(std::shared_ptr<ccl::host_communicator>& comm, + const session_notification& in, + session_notification_handle& out, + const coll_param_gpu& kernel_params) { + // allreduce + if (kernel_params.get_coll_type() == ccl_coll_allreduce) { + out.output_buffer.resize(in.src_size_bytes); + ccl::stream::impl_value_t empty_stream{}; + + // notice: not thread-safe + out.op_handle = comm->allreduce_impl(in.host_src_ptr, + out.output_buffer.data(), + in.src_size_bytes, + kernel_params.get_datatype(), + kernel_params.get_reduction(), + empty_stream, + ccl::default_allreduce_attr, + {}); + out.op_handle_ready = true; + } +} + +} // namespace observer +} // namespace native diff --git a/src/common/comm/l0/context/scale/scale_out/scale_out_session.hpp b/src/common/comm/l0/context/scale/scale_out/scale_out_session.hpp new file mode 100644 index 000000000..43716108c --- /dev/null +++ b/src/common/comm/l0/context/scale/scale_out/scale_out_session.hpp @@ -0,0 +1,171 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/
+#pragma once
+
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+
+#include "oneapi/ccl/event.hpp"
+#include "common/comm/l0/context/scale/base/base_session.hpp"
+#include "common/comm/l0/context/scale/numa/numa_session.hpp"
+
+namespace ccl {
+class host_communicator;
+}
+
+namespace native {
+namespace observer {
+
+class scale_out_session_iface {
+public:
+    scale_out_session_iface() = default;
+    virtual ~scale_out_session_iface() = default;
+
+    size_t get_send_tag() const;
+    std::string to_string() const;
+
+    virtual void prepare(size_t observer_domain_index,
+                         size_t observer_domain_count,
+                         void* type_erased_param) = 0;
+    virtual void produce_data(std::shared_ptr<ccl::host_communicator>& comm) = 0;
+    virtual void consume_data(size_t observer_domain_index,
+                              std::shared_ptr<ccl::host_communicator>& comm) = 0;
+    virtual bool is_consumed() noexcept = 0;
+    virtual bool is_produced() noexcept = 0;
+
+private:
+    size_t send_tag{};
+};
+
+struct session_notification_handle {
+    using notification_handle_t = ccl::event;
+    //using notification_handle_ptr_t = std::unique_ptr<notification_handle_t>;
+
+    // TODO: use a custom allocator instead of vector
+    std::vector<uint8_t> output_buffer;
+    notification_handle_t op_handle;
+    // TODO: the notification_handle_t interface does not distinguish canceled
+    // from finished operations, so this special flag denotes the extended state
+    // of op_handle. Use an event_impl pointer instead! Fix host_communicator to
+    // return event_impl!
+    bool op_handle_ready;
+};
+
+struct ccl_worker_adapter {
+    static void submit_coll_work(std::shared_ptr<ccl::host_communicator>& comm,
+                                 const session_notification& in,
+                                 session_notification_handle& out,
+                                 const coll_param_gpu& kernel_params);
+};
+
+template <ccl::device_topology_type class_id, class session_invoke_params>
+struct scale_out_session : public scale_out_session_iface {
+    using base_t = scale_out_session_iface;
+    using invoke_params_t = session_invoke_params;
+    using session_key_t = session_key;
+
+    scale_out_session(producer_description& in_param,
+                      const coll_param_gpu& in_kernel_params,
+                      size_t observer_domain_index,
+                      size_t observer_domain_count,
+                      const session_key_t& key)
+            : base_t(),
+              proxy_session(in_param,
+                            in_kernel_params,
+                            observer_domain_index,
+                            observer_domain_count,
+                            key) {
+        // TODO: use `session_invoke_params` information to calculate the
+        // `pending_notifications` reserve based on chunk size
+        pending_notifications.reserve(16);
+    }
+
+    context_descr& get_ctx_descr() {
+        return proxy_session.get_ctx_descr();
+    }
+
+    void prepare(size_t observer_domain_index,
+                 size_t observer_domain_count,
+                 void* type_erased_param) override {
+        proxy_session.prepare(observer_domain_index, observer_domain_count, type_erased_param);
+
+        auto* out_param = static_cast<invoke_params_t*>(type_erased_param);
+
+        // allocate cpu gw staging slots
+        pending_notifications.clear();
+
+        (void)out_param;
+    }
+
+    void produce_data(std::shared_ptr<ccl::host_communicator>& comm) override {
+        void* partial_chunk = nullptr;
+        size_t partial_chunk_size = 0;
+
+        // get own device partial chunk data
+        proxy_session.produce_data(&partial_chunk, partial_chunk_size);
+        if (partial_chunk_size > 0) {
+            // notify scaleout actors in other processes about my partial result
+            session_notification notif(partial_chunk, partial_chunk_size);
+            session_notification_handle handle;
+
+            ccl_worker_adapter::submit_coll_work(
+                comm, notif, handle,
+                proxy_session.get_kernel_params());
+
+            pending_notifications.push_back(std::move(handle));
+        }
+    }
+
+    void consume_data(size_t observer_domain_index,
+                      std::shared_ptr<ccl::host_communicator>& comm) override {
+        for (auto it = pending_notifications.begin(); it != pending_notifications.end(); ++it) {
+            if (it->op_handle_ready) { // notice: not thread-safe
+
+                if (it->op_handle.test()) {
+                    proxy_session.consume_data(
+                        observer_domain_index,
+                        it->output_buffer.data(),
+                        it->output_buffer.size() *
+                            ccl::get_datatype_size(
+                                proxy_session.get_kernel_params().get_datatype()));
+
+                    // notice: not thread-safe
+                    it->op_handle_ready = false;
+                }
+                else {
+                    // TODO: collectives on the CPU side are processed sequentially;
+                    // if the first handle is not ready yet, skip the following handles
+                    break;
+                }
+            }
+        }
+    }
+
+    bool is_consumed() noexcept override {
+        return proxy_session.is_consumed();
+    }
+
+    bool is_produced() noexcept override {
+        return proxy_session.is_produced();
+    }
+
+private:
+    void notify_data();
+    numa_session<class_id, invoke_params_t> proxy_session;
+    std::vector<session_notification_handle> pending_notifications;
+};
+} // namespace observer
+} // namespace native
diff --git a/src/common/comm/l0/context/scaling_ctx/scale_up_ctx.hpp b/src/common/comm/l0/context/scale/scale_up/scale_up_ctx.hpp
similarity index 100%
rename from src/common/comm/l0/context/scaling_ctx/scale_up_ctx.hpp
rename to src/common/comm/l0/context/scale/scale_up/scale_up_ctx.hpp
diff --git a/src/common/comm/l0/context/scaling_ctx/scale_up_ctx_impl.hpp b/src/common/comm/l0/context/scale/scale_up/scale_up_ctx_impl.hpp
similarity index 97%
rename from src/common/comm/l0/context/scaling_ctx/scale_up_ctx_impl.hpp
rename to src/common/comm/l0/context/scale/scale_up/scale_up_ctx_impl.hpp
index ba0486216..096aa722e 100644
--- a/src/common/comm/l0/context/scaling_ctx/scale_up_ctx_impl.hpp
+++ b/src/common/comm/l0/context/scale/scale_up/scale_up_ctx_impl.hpp
@@ -14,7 +14,7 @@
 limitations under the License.
 */
 #pragma once
-#include "common/comm/l0/context/scaling_ctx/scale_up_ctx.hpp"
+#include "common/comm/l0/context/scale/scale_up/scale_up_ctx.hpp"
 
 namespace native {
 #define TEMPLATE_DECL_ARG class Impl, ccl::device_topology_type... types
diff --git a/src/common/comm/l0/context/scaling_ctx/scaling_context_dispatcher.hpp b/src/common/comm/l0/context/scale/scaling_context_dispatcher.hpp
similarity index 100%
rename from src/common/comm/l0/context/scaling_ctx/scaling_context_dispatcher.hpp
rename to src/common/comm/l0/context/scale/scaling_context_dispatcher.hpp
diff --git a/src/common/comm/l0/context/scaling_ctx/observer_ctx_session.cpp b/src/common/comm/l0/context/scaling_ctx/observer_ctx_session.cpp
deleted file mode 100644
index a3b035f7b..000000000
--- a/src/common/comm/l0/context/scaling_ctx/observer_ctx_session.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
-*/ -#include <sstream> - -#include "common/comm/l0/context/scaling_ctx/observer_ctx_session.hpp" -#include "common/log/log.hpp" - -namespace native { -namespace observer { - -session::session() - : host_producer_memory(nullptr), - host_producer_ready_bytes(nullptr), - host_consumed_bytes(0), - host_expected_bytes(0), - - device_consumer_total_memory(nullptr), - device_consumer_ready_bytes(nullptr), - device_produced_bytes(0), - copy_immediate_list() {} - -std::string session::to_string() const { - std::stringstream ss; - ss << "sess: " << reinterpret_cast<const void*>(this); - return ss.str(); -} - -size_t session::get_send_tag() const { - return send_tag; -} - -size_t session::produce_data(void** out_chunk, size_t& out_chunk_size) { - //read ready flag - size_t old_consumed = host_consumed_bytes; - int total_produced = *host_producer_ready_bytes; - - size_t to_consume = total_produced - old_consumed; - if (to_consume) { - //fence - LOG_TRACE(to_string(), - " - bytes produced: ", - total_produced, - ", previously bytes consumed: ", - old_consumed); - std::atomic_thread_fence(std::memory_order::memory_order_seq_cst); - - // do not read data here! - *out_chunk = (static_cast<uint8_t*>(host_producer_memory) + old_consumed); - - //check finalize - host_consumed_bytes = to_consume; - } - - out_chunk_size = to_consume; - return to_consume; -} - -bool session::consume_data(size_t observer_domain_index, void* in_chunk, size_t in_chunk_size) { - /* TODO create event - * ze_event_handle_t mem_event {}; - */ - - ze_result_t res = zeCommandListAppendMemoryCopy( - copy_immediate_list, - (static_cast<uint8_t*>(device_consumer_total_memory) + device_produced_bytes), - in_chunk, - in_chunk_size, - /*mem_event*/ nullptr, - 0, - nullptr); - if (res != ZE_RESULT_SUCCESS) { - throw std::runtime_error( - std::string( - "cannot append copy NUMA host to device memory for partial result, error: ") + - native::to_string(res)); - } - device_produced_bytes += in_chunk_size; - - res = zeCommandListAppendMemoryCopy(copy_immediate_list, - device_consumer_ready_bytes, - &device_produced_bytes, - sizeof(device_produced_bytes), - nullptr, - 1, - /*&mem_event*/ nullptr); - if (res != ZE_RESULT_SUCCESS) { - throw std::runtime_error( - std::string("cannot append copy NUMA host to device memory for ready bytes, error: ") + - native::to_string(res)); - } - return device_produced_bytes == host_expected_bytes; -} - -size_t session_table::get_unique_tag() { - static std::atomic<size_t> tag_counter{ 1 }; - return tag_counter.fetch_add(1); -} - -std::string session_table::to_string() const { - std::stringstream ss; - ss << "sessions count: " << sessions.size() << std::endl; - for (const auto& val : sessions) { - ss << "[" << val.first << ", " << reinterpret_cast<void*>(val.second.get()) << "]\n" - << val.second->to_string() << std::endl; - } - return ss.str(); -} -} // namespace observer -} // namespace native diff --git a/src/common/comm/l0/context/scaling_ctx/observer_ctx_session.hpp b/src/common/comm/l0/context/scaling_ctx/observer_ctx_session.hpp deleted file mode 100644 index 077186e65..000000000 --- a/src/common/comm/l0/context/scaling_ctx/observer_ctx_session.hpp +++ /dev/null @@ -1,133 +0,0 @@ -/* - Copyright 2016-2020 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ -#pragma once -#include <atomic> -#include <map> -#include <memory> -#include "common/comm/l0/context/scaling_ctx/observer_session_key.hpp" -#include "common/comm/l0/modules/supported_modules.hpp" - -namespace native { -namespace observer { - -/* Low levels session - * contains raw data for net operations - */ -class session { -public: - session(); - virtual ~session() = default; - - virtual void prepare(size_t observer_domain_index, - size_t observer_domain_count, - void* type_erased_param) = 0; - - size_t get_send_tag() const; - std::string to_string() const; - - size_t produce_data(void** out_chunk, size_t& out_chunk_size); - bool consume_data(size_t observer_domain_index, void* in_chunk, size_t in_chunk_size); - -private: - size_t send_tag{}; - - // low level data - void* host_producer_memory; - counter_type* host_producer_ready_bytes; - size_t host_consumed_bytes; - size_t host_expected_bytes; - - void* device_consumer_total_memory; - counter_type* device_consumer_ready_bytes; - size_t device_produced_bytes; - - ze_command_list_handle_t copy_immediate_list; -}; - -struct session_notification { - session_notification(void* addr, size_t size_bytes) - : host_src_ptr(addr), - src_size_bytes(size_bytes) {} - void* host_src_ptr; - size_t src_size_bytes; -}; - -using shared_session_ptr = std::shared_ptr<session>; - -/* High level session - * Contains collective communication data - */ -template <ccl_coll_type coll_type, class kernel_params, ccl::device_topology_type class_id> -struct typed_session : public session { - typed_session(producer_description& in_param, - size_t observer_domain_index, - size_t observer_domain_count) { - params.init(in_param.staged_buffer_elem_count, - observer_domain_index, - observer_domain_count, - in_param.context, - in_param.device); - } - - const context_description<coll_type, typename kernel_params::native_type>& - get_context_description() const { - return params; - } - - void prepare(size_t observer_domain_index, - size_t observer_domain_count, - void* type_erased_param) override { - auto* out_param = static_cast<invoke_params<coll_type, kernel_params>*>(type_erased_param); - params.reset_staged_counters(observer_domain_index, observer_domain_count); - - out_param->set_out_params(params); - } - -private: - context_description<coll_type, typename kernel_params::native_type> params; -}; - -// session owner -// TODO not thread-safe -struct session_table { - using session_key_t = session_key; - - template <ccl::device_topology_type class_id, class invoke_params_type> - std::shared_ptr<session> create_session(const session_key_t& key, - invoke_params_type& params, - size_t observer_domain_index, - size_t observer_domain_count) { - using specific_session = typed_session<invoke_params_type::get_coll_type(), - typename invoke_params_type::kernel_params_t, - class_id>; - auto sess = std::make_shared<specific_session>( - params.get_producer_params(), observer_domain_index, observer_domain_count); - - params.set_out_params(sess->get_context_description()); - sessions.emplace(key, sess); - - return sess; - } - - std::string to_string() const; - 
std::map<session_key_t, shared_session_ptr> sessions{}; - - static size_t get_unique_tag(); -}; - -using shared_session_table_ptr = std::shared_ptr<session_table>; -} // namespace observer -} // namespace native diff --git a/src/common/comm/l0/context/scaling_ctx/observer_session_key.hpp b/src/common/comm/l0/context/scaling_ctx/observer_session_key.hpp deleted file mode 100644 index ed0b366ce..000000000 --- a/src/common/comm/l0/context/scaling_ctx/observer_session_key.hpp +++ /dev/null @@ -1,170 +0,0 @@ -/* - Copyright 2016-2020 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ -#pragma once -#include <functional> -#include <string> -#include <vector> - -#include "oneapi/ccl/native_device_api/l0/device.hpp" -#include "oneapi/ccl/native_device_api/l0/context.hpp" -#include "coll/algorithms/algorithms_enum.hpp" - -namespace native { -namespace observer { -using counter_type = uint64_t; -struct producer_description { - size_t world_rank; - size_t world_size; - counter_type staged_buffer_elem_count; - - std::shared_ptr<ccl_context> context; - ccl_device& device; - ccl_device::device_cmd_list immediate_list; //TODO make persisten -}; - -//TODO looks like these structure is specific for allreduce only -template <ccl_coll_type type, class native_data_type> -struct context_description { - // produced by kernel - ccl_context::host_memory_ptr<native_data_type> numa_staged_memory; - ccl_context::host_memory_ptr<counter_type> staged_memory_size_counter; - - // consumed by kernel - // (TODO consider using 'recv_buff' from collective entry) - // to reduce copy iterations - ccl_device::device_memory_ptr<counter_type> producer_aggregated_memory_offset; - - ccl_device::device_memory_ptr<native_data_type> total_producers_aggregated_memory; - ccl_device::device_memory_ptr<counter_type> total_producers_aggregated_size_counter; - - void init(size_t staged_buffer_elem_count, - size_t observer_domain_index, - size_t observer_domain_count, - std::shared_ptr<ccl_context>& context, - ccl_device& device) { - // create staged mem in host context - ze_host_mem_alloc_desc_t host_descr = ccl_context::get_default_host_alloc_desc(); - host_descr.flags = ZE_HOST_MEM_ALLOC_FLAG_BIAS_UNCACHED; - - numa_staged_memory = context->template alloc_memory<native_data_type>( - staged_buffer_elem_count, - /*TODO use page size*/ sizeof(native_data_type), - host_descr); - - // create staged mem counter in host context - staged_memory_size_counter = context->template alloc_memory<counter_type>( - 1, /*TODO use page size*/ sizeof(counter_type), host_descr); - - ze_device_mem_alloc_desc_t mem_descr = ccl_device::get_default_mem_alloc_desc(); - - // create total aggregated memory in device context - mem_descr.flags = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_UNCACHED; - total_producers_aggregated_memory = device.template alloc_memory_ptr<native_data_type>( - staged_buffer_elem_count * observer_domain_count, - sizeof(native_data_type), - context, - mem_descr); - - // create offset in device context - mem_descr.flags = 
ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED; - producer_aggregated_memory_offset = device.template alloc_memory_ptr<counter_type>( - 1, sizeof(counter_type), context, mem_descr); - - // create aggregated counter in device context - total_producers_aggregated_size_counter = device.template alloc_memory_ptr<counter_type>( - 1, sizeof(counter_type), context, mem_descr); - - // init values - reset_staged_counters(observer_domain_index, observer_domain_count); - } - - void reset_staged_counters(size_t observer_domain_index, size_t observer_domain_count) { - counter_type filled_counter_value = 0; - staged_memory_size_counter->enqueue_write_sync(&filled_counter_value, 1); - - filled_counter_value = observer_domain_index * numa_staged_memory->count(); - ; - producer_aggregated_memory_offset->enqueue_write_sync(&filled_counter_value, 1); - - filled_counter_value = 0; - total_producers_aggregated_size_counter->enqueue_write_sync(&filled_counter_value, 1); - } -}; - -template <ccl_coll_type type, class kernel_params> -struct invoke_params { - using kernel_params_t = kernel_params; - - static constexpr ccl_coll_type get_coll_type() { - return type; - } - - invoke_params(producer_description&& in) - : in_params(std::move(in)), - out_params(), - valid(false) {} - - void set_out_params( - const context_description<type, typename kernel_params_t::native_type>& src) { - out_params = src; - valid = true; - } - - bool is_valid() const { - return valid; - } - - const producer_description& get_producer_params() const { - return in_params; - } - - producer_description& get_producer_params() { - return in_params; - } - - const context_description<type, typename kernel_params_t::native_type>& get_ctx_params() const { - if (!is_valid()) { - throw std::runtime_error("observer invocation params are not ready"); - } - return out_params; - } - -private: - producer_description in_params; - context_description<type, typename kernel_params_t::native_type> out_params; - bool valid; -}; - -struct session_key { - using hash_core_t = size_t; - - friend std::ostream& operator<<(std::ostream& out, const session_key& key) { - out << key.to_string(); - return out; - } - - template <class T> - session_key(const T* src) : hash(std::hash<const T*>{}(src)) {} - - bool operator<(const session_key& other) const noexcept; - - std::string to_string() const; - -private: - hash_core_t hash; -}; -} // namespace observer -} // namespace native diff --git a/src/common/comm/l0/context/scaling_ctx/scale_out_ctx.hpp b/src/common/comm/l0/context/scaling_ctx/scale_out_ctx.hpp deleted file mode 100644 index 89cc9852f..000000000 --- a/src/common/comm/l0/context/scaling_ctx/scale_out_ctx.hpp +++ /dev/null @@ -1,92 +0,0 @@ -/* - Copyright 2016-2020 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ -#pragma once -#include "common/comm/l0/context/base_scaling_ctx.hpp" - -namespace native { - -class ccl_gpu_comm; -class ccl_virtual_gpu_comm; - -template <class device> -class ccl_scaleout_proxy; - -template <class device> -class ccl_gpu_scaleup_proxy; - -template <class device> -class ccl_numa_proxy; - -template <class Impl, ccl::device_topology_type... types> -class scale_out_ctx - : public observer::base_scaling_ctx< - scale_out_ctx<Impl, types...>, - ccl_scaleout_proxy<ccl_gpu_comm>, - ccl_scaleout_proxy<ccl_virtual_gpu_comm>, - ccl_scaleout_proxy<ccl_numa_proxy<ccl_gpu_comm>>, - ccl_scaleout_proxy<ccl_numa_proxy<ccl_virtual_gpu_comm>>, - ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_gpu_comm>>, - ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_virtual_gpu_comm>>, - ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_gpu_comm>>>, - ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_virtual_gpu_comm>>>> { -public: - using context_impl = Impl; - - template <class device_t> - using observer_t = ccl_scaleout_proxy<device_t>; - - using scaling_ctx_base_t = observer::base_scaling_ctx< - scale_out_ctx<Impl, types...>, - observer_t<ccl_gpu_comm>, - observer_t<ccl_virtual_gpu_comm>, - observer_t<ccl_numa_proxy<ccl_gpu_comm>>, - observer_t<ccl_numa_proxy<ccl_virtual_gpu_comm>>, - observer_t<ccl_gpu_scaleup_proxy<ccl_gpu_comm>>, - observer_t<ccl_gpu_scaleup_proxy<ccl_virtual_gpu_comm>>, - observer_t<ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_gpu_comm>>>, - observer_t<ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_virtual_gpu_comm>>>>; - - using observable_scale_up_topologies = - typename scaling_ctx_base_t::template observable_topologies<types...>; - - observable_scale_up_topologies observables; - - //observer subject interface implementations - template <class device_t, ccl::device_topology_type topology_type> - void attach_ctx_observer(size_t rank_addr, - observer_t<device_t>* observer_ptr, - std::integral_constant<ccl::device_topology_type, topology_type> val) { - register_observer_impl<topology_type>(rank_addr, observer_ptr); - } - - template <class device_t, ccl::device_topology_type class_id, class invoke_params_t> - void invoke_ctx_observer(observer_t<device_t>* observer_ptr, - std::integral_constant<ccl::device_topology_type, class_id> val, - const observer::session_key& sess_key, - invoke_params_t& param) { - throw std::runtime_error("SCALE_OUT invoke is not implemented yet"); - } - -private: - template <ccl::device_topology_type topology_type, class device_t> - void register_observer_impl(size_t rank_addr, observer_t<device_t>* observer_ptr); /* - { - auto &topologu_specific_observers = std::get<topology_index>(observables); - container_t<device_t>& container = std::get<device_t::type_idx()>(topologu_specific_observers); - container.insert(observer); - }*/ -}; -} // namespace native diff --git a/src/common/comm/l0/context/scaling_ctx/scale_out_ctx_impl.hpp b/src/common/comm/l0/context/scaling_ctx/scale_out_ctx_impl.hpp deleted file mode 100644 index 9d1a94873..000000000 --- a/src/common/comm/l0/context/scaling_ctx/scale_out_ctx_impl.hpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - Copyright 2016-2020 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ -#pragma once -#include "common/comm/l0/context/scaling_ctx/scale_out_ctx.hpp" -#include "common/log/log.hpp" - -namespace native { - -#define TEMPLATE_DECL_ARG class Impl, ccl::device_topology_type... types -#define TEMPLATE_DEF_ARG Impl, types... - -// observer_ptr interface implementations -template <TEMPLATE_DECL_ARG> -template <ccl::device_topology_type topology_type, class device_t> -void scale_out_ctx<TEMPLATE_DEF_ARG>::register_observer_impl(size_t rank_addr, - observer_t<device_t>* observer_ptr) { - observer::container_t<observer_t<device_t>>& container = - scaling_ctx_base_t::template get_types_container<observer_t<device_t>, topology_type>( - observables); - container.insert(observer_ptr); -} - -#undef TEMPLATE_DECL_ARG -#undef TEMPLATE_DEF_ARG -} // namespace native diff --git a/src/common/comm/l0/context/thread_group_ctx.cpp b/src/common/comm/l0/context/thread_group_ctx.cpp index 78a76411e..a5e7aa16a 100644 --- a/src/common/comm/l0/context/thread_group_ctx.cpp +++ b/src/common/comm/l0/context/thread_group_ctx.cpp @@ -19,7 +19,7 @@ #include "common/comm/l0/context/device_storage.hpp" #include "common/comm/l0/scheduler/thread_group_scheduler.hpp" -#include "common/comm/l0/context/scaling_ctx/numa_ctx_impl.hpp" +#include "common/comm/l0/context/scale/numa/numa_ctx_impl.hpp" namespace native { diff --git a/src/common/comm/l0/context/thread_group_ctx.hpp b/src/common/comm/l0/context/thread_group_ctx.hpp index 4cfd0acf7..3e80a235f 100644 --- a/src/common/comm/l0/context/thread_group_ctx.hpp +++ b/src/common/comm/l0/context/thread_group_ctx.hpp @@ -17,7 +17,7 @@ #include "common/comm/l0/context/device_group_ctx.hpp" #include "common/log/log.hpp" -#include "common/comm/l0/context/scaling_ctx/numa_ctx.hpp" +#include "common/comm/l0/context/scale/numa/numa_ctx.hpp" namespace native { struct device_storage; diff --git a/src/common/comm/l0/devices/ccl_concurrent_gpu_comm.hpp b/src/common/comm/l0/devices/ccl_concurrent_gpu_comm.hpp index c8a647869..d00a5dde4 100644 --- a/src/common/comm/l0/devices/ccl_concurrent_gpu_comm.hpp +++ b/src/common/comm/l0/devices/ccl_concurrent_gpu_comm.hpp @@ -41,12 +41,8 @@ class ccl_thread_comm : public ccl_gpu_base_comm<ccl_thread_comm<device_t>, template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode> using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class; - template <ccl_coll_type algo_type, - ccl::group_split_type group, - ccl::device_topology_type mode, - class kernel_params> - using gpu_kernel_t = - typename kernel_class_t<algo_type, group, mode>::template kernel_t<kernel_params>; + template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode> + using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t; static constexpr const char* name_impl() { return "CONCURRENT_GPU"; @@ -73,11 +69,10 @@ class ccl_thread_comm : public ccl_gpu_base_comm<ccl_thread_comm<device_t>, template <ccl_coll_type module_type, ccl::group_split_type group_id, - ccl::device_topology_type class_id, - class kernel_params> - gpu_kernel_t<module_type, group_id, 
class_id, kernel_params>& get_gpu_kernel() { - return next_thread_gpu_comm - .template get_gpu_kernel<module_type, group_id, class_id, kernel_params>(); + ccl::device_topology_type class_id> + gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) { + return next_thread_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>( + params); } device_t& get_impl_device() { diff --git a/src/common/comm/l0/devices/ccl_gpu_comm.hpp b/src/common/comm/l0/devices/ccl_gpu_comm.hpp index 0cdfae477..e201db630 100644 --- a/src/common/comm/l0/devices/ccl_gpu_comm.hpp +++ b/src/common/comm/l0/devices/ccl_gpu_comm.hpp @@ -91,12 +91,8 @@ class ccl_gpu_comm : public ccl_gpu_base_comm<ccl_gpu_comm, gpu_types::REAL_GPU> template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode> using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class; - template <ccl_coll_type algo_type, - ccl::group_split_type group, - ccl::device_topology_type mode, - class kernel_params> - using gpu_kernel_t = - typename kernel_class_t<algo_type, group, mode>::template kernel_t<kernel_params>; + template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode> + using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t; using supported_modules = supported_device_modules<gpu_module_t>; @@ -128,27 +124,33 @@ class ccl_gpu_comm : public ccl_gpu_base_comm<ccl_gpu_comm, gpu_types::REAL_GPU> std::string to_string_impl() const; + // template <ccl_coll_type module_type, + // ccl::group_split_type group_id, + // ccl::device_topology_type class_id, + // class kernel_params> + // gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel() { + // auto& ptr = get_gpu_module<module_type, group_id, class_id>(); + + // using requested_class = kernel_class_t<module_type, group_id, class_id>; + // return ptr.template get_class<requested_class>().template get<kernel_params>(); + // } + template <ccl_coll_type module_type, ccl::group_split_type group_id, - ccl::device_topology_type class_id, - class kernel_params> - gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() { + ccl::device_topology_type class_id> + gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) { auto& ptr = get_gpu_module<module_type, group_id, class_id>(); using requested_class = kernel_class_t<module_type, group_id, class_id>; - return ptr.template get_class<requested_class>().template get<kernel_params>(); + return ptr.template get_class<requested_class>().get(params); } - template <class kernel_params, - ccl::group_split_type group_id, - ccl::device_topology_type class_id, - class gpu_entry> - gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>& register_entry( - gpu_entry& entry) { + template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class gpu_entry> + gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) { const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>(); LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string()); - auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id, kernel_params>(); + auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params()); main_func.set_rank(comm_addr.rank); main_func.set_size(comm_addr.size); //threads count!!! 
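+        // note: rank/size originate from the topology address; judging by the remark
+        // above, "size" here counts participating threads rather than processes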
return main_func; diff --git a/src/common/comm/l0/devices/ccl_gpu_scaleup_proxy.hpp b/src/common/comm/l0/devices/ccl_gpu_scaleup_proxy.hpp index 7e221846f..10fd7b51f 100644 --- a/src/common/comm/l0/devices/ccl_gpu_scaleup_proxy.hpp +++ b/src/common/comm/l0/devices/ccl_gpu_scaleup_proxy.hpp @@ -47,12 +47,8 @@ class ccl_gpu_scaleup_proxy template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode> using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class; - template <ccl_coll_type algo_type, - ccl::group_split_type group, - ccl::device_topology_type mode, - class kernel_params> - using gpu_kernel_t = - typename kernel_class_t<algo_type, group, mode>::template kernel_t<kernel_params>; + template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode> + using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t; //using ctx_ptr = std::weak_ptr<scale_up_ctx_t>; @@ -76,13 +72,11 @@ class ccl_gpu_scaleup_proxy template <ccl_coll_type module_type, ccl::group_split_type group_id, - ccl::device_topology_type class_id, - class kernel_params> - gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() { + ccl::device_topology_type class_id> + gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) { this->template invoke<group_id, class_id>(); - return wrapped_gpu_comm - .template get_gpu_kernel<module_type, group_id, class_id, kernel_params>(); + return wrapped_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>(params); } template <ccl::group_split_type group_id, ccl::device_topology_type class_id> @@ -90,17 +84,15 @@ class ccl_gpu_scaleup_proxy return wrapped_gpu_comm.template get_comm_data<group_id, class_id>(); } - template <class kernel_params, - ccl::group_split_type group_id, + template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class gpu_entry, class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type> - gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>& register_entry( - gpu_entry& entry) { + gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) { const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>(); LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string()); - auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id, kernel_params>(); + auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params()); main_func.set_rank(comm_addr.rank); main_func.set_size(comm_addr.size); return main_func; @@ -139,12 +131,8 @@ class ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>> template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode> using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class; - template <ccl_coll_type algo_type, - ccl::group_split_type group, - ccl::device_topology_type mode, - class kernel_params> - using gpu_kernel_t = - typename kernel_class_t<algo_type, group, mode>::template kernel_t<kernel_params>; + template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode> + using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t; //using ctx_ptr = std::weak_ptr<scale_up_ctx_t>; using device_impl_t = ccl_numa_proxy<device_t>; @@ -174,25 +162,21 @@ class ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>> 
template <ccl_coll_type module_type, ccl::group_split_type group_id, - ccl::device_topology_type class_id, - class kernel_params> - gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() { + ccl::device_topology_type class_id> + gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) { this->template invoke<group_id>(); - return wrapped_gpu_comm - .template get_gpu_kernel<module_type, group_id, class_id, kernel_params>(); + return wrapped_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>(params); } - template <class kernel_params, - ccl::group_split_type group_id, + template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class gpu_entry, class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type> - gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>& register_entry( - gpu_entry& entry) { + gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) { const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>(); LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string()); - auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id, kernel_params>(); + auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params()); main_func.set_rank(comm_addr.rank); main_func.set_size(comm_addr.size); return main_func; diff --git a/src/common/comm/l0/devices/ccl_ipc_gpu_comm.hpp b/src/common/comm/l0/devices/ccl_ipc_gpu_comm.hpp index 5e9be2de8..10153d131 100644 --- a/src/common/comm/l0/devices/ccl_ipc_gpu_comm.hpp +++ b/src/common/comm/l0/devices/ccl_ipc_gpu_comm.hpp @@ -50,12 +50,8 @@ class ccl_ipc_gpu_comm : public ccl_gpu_base_comm<ccl_ipc_gpu_comm, gpu_types::I template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode> using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class; - template <ccl_coll_type algo_type, - ccl::group_split_type group, - ccl::device_topology_type mode, - class kernel_params> - using gpu_kernel_t = - typename kernel_class_t<algo_type, group, mode>::template kernel_t<kernel_params>; + template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode> + using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t; using supported_modules = supported_device_modules<gpu_module_t>; @@ -74,16 +70,15 @@ class ccl_ipc_gpu_comm : public ccl_gpu_base_comm<ccl_ipc_gpu_comm, gpu_types::I template <ccl_coll_type module_type, ccl::group_split_type group_id, - ccl::device_topology_type class_id, - class kernel_params> - gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() { + ccl::device_topology_type class_id> + gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) { auto& ptr = base::template get_gpu_module_unsafe<module_type, group_id, class_id, gpu_module_t>( registered_modules); assert(ptr); using requested_class = kernel_class_t<module_type, group_id, class_id>; - return ptr->template get_class<requested_class>().template get<kernel_params>(); + return ptr->template get_class<requested_class>().get(params); } template <ccl_coll_type module_type, diff --git a/src/common/comm/l0/devices/ccl_ipc_source_gpu_comm.hpp b/src/common/comm/l0/devices/ccl_ipc_source_gpu_comm.hpp index 46b73f1a4..cc39a0084 100644 --- a/src/common/comm/l0/devices/ccl_ipc_source_gpu_comm.hpp +++ 
b/src/common/comm/l0/devices/ccl_ipc_source_gpu_comm.hpp @@ -22,7 +22,7 @@ #include "common/comm/l0/devices/ccl_gpu_base_comm.hpp" #include "common/comm/l0/devices/proxy_observer_types.hpp" -#include "common/comm/l0/context/scaling_ctx/ipc_session_key.hpp" +#include "common/comm/l0/context/scale/ipc/ipc_session_key.hpp" #include "common/comm/l0/devices/communication_structs/ipc_client.hpp" namespace native { @@ -53,12 +53,8 @@ class ccl_ipc_source_gpu_comm : public ccl_gpu_base_comm<ccl_ipc_source_gpu_comm template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode> using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class; - template <ccl_coll_type algo_type, - ccl::group_split_type group, - ccl::device_topology_type mode, - class kernel_params> - using gpu_kernel_t = - typename kernel_class_t<algo_type, group, mode>::template kernel_t<kernel_params>; + template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode> + using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t; static constexpr const char* name_impl() { return "SOURCE_IPC_GPU"; @@ -135,30 +131,24 @@ class ccl_ipc_source_gpu_comm : public ccl_gpu_base_comm<ccl_ipc_source_gpu_comm */ template <ccl_coll_type module_type, ccl::group_split_type group_id, - ccl::device_topology_type class_id, - class kernel_params> - gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() { - return inprocess_gpu_comm - .template get_gpu_kernel<module_type, group_id, class_id, kernel_params>(); + ccl::device_topology_type class_id> + gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) { + return inprocess_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>(params); } - template <class kernel_params, - ccl::group_split_type group_id, - ccl::device_topology_type class_id, - class gpu_entry> - gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>& register_entry( - gpu_entry& entry) { + template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class gpu_entry> + gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) { static_assert(group_id == ccl::group_split_type::cluster, "ccl_ipc_source_gpu_comm available for ccl::group_split_type::cluster only"); const topology_addr<group_id, class_id>& comm_addr = - base::template get_comm_data<group_id, class_id>(); + inprocess_gpu_comm.template get_comm_data<group_id, class_id>(); LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string()); - auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id, kernel_params>(); + auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params()); main_func.set_rank(comm_addr.rank); main_func.set_size(comm_addr.size); - ipc_invoke_params<gpu_entry::type(), kernel_params> params(entry.get_ipc_data()); + ipc_invoke_params<gpu_entry::type()> params(entry.get_ipc_data(), entry.get_params()); this->template invoke<group_id, class_id>(entry.get_ipc_session_key(), std::move(params)); return main_func; diff --git a/src/common/comm/l0/devices/ccl_numa_proxy.hpp b/src/common/comm/l0/devices/ccl_numa_proxy.hpp index fee81eb87..efd29e93b 100644 --- a/src/common/comm/l0/devices/ccl_numa_proxy.hpp +++ b/src/common/comm/l0/devices/ccl_numa_proxy.hpp @@ -22,7 +22,7 @@ #include "common/comm/l0/devices/ccl_gpu_base_comm.hpp" #include 
"common/comm/l0/devices/proxy_observer_types.hpp" -#include "common/comm/l0/context/scaling_ctx/observer_session_key.hpp" +#include "common/comm/l0/context/scale/base/base_session.hpp" namespace native { @@ -46,12 +46,8 @@ class ccl_numa_proxy template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode> using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::numa_class; - template <ccl_coll_type algo_type, - ccl::group_split_type group, - ccl::device_topology_type mode, - class kernel_params> - using gpu_kernel_t = - typename kernel_class_t<algo_type, group, mode>::template kernel_t<kernel_params>; + template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode> + using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t; static constexpr const char* name_impl() { return "NUMA_PROXY"; @@ -73,21 +69,16 @@ class ccl_numa_proxy template <ccl_coll_type module_type, ccl::group_split_type group_id, - ccl::device_topology_type class_id, - class kernel_params> - gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() { + ccl::device_topology_type class_id> + gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) { auto& ptr = wrapped_gpu_comm.template get_gpu_module<module_type, group_id, class_id>(); using requested_class = kernel_class_t<module_type, group_id, class_id>; - return ptr.template get_class<requested_class>().template get<kernel_params>(); + return ptr.template get_class<requested_class>().get(params); } - template <class kernel_params, - ccl::group_split_type group_id, - ccl::device_topology_type class_id, - class gpu_entry> - gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>& register_entry( - gpu_entry& entry) { + template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class gpu_entry> + gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) { static_assert(group_id == ccl::group_split_type::cluster, "ccl_numa_proxy available for ccl::group_split_type::cluster only"); @@ -95,31 +86,22 @@ class ccl_numa_proxy base::template get_comm_data<group_id, class_id>(); LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string()); - using kernel_func_type = gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>; + using kernel_func_type = gpu_kernel_t<gpu_entry::type(), group_id, class_id>; kernel_func_type& main_func = - get_gpu_kernel<gpu_entry::type(), group_id, class_id, kernel_params>(); + get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params()); main_func.set_rank(comm_addr.rank); main_func.set_size(comm_addr.size); // alloc shared data structure to notify host side with device parital result - observer::invoke_params<gpu_entry::type(), kernel_params> params = entry.get_numa_data(); + observer::invoke_params<gpu_entry::type()> params = entry.get_numa_data(); // invoke host-side context creation this->template invoke<group_id, class_id>(entry.get_numa_session_key(), params); // bind shared data to kernel const auto& out_ctx_params = params.get_ctx_params(); - main_func.template set_arg<typename kernel_func_type::event_prod_chunk_mem_arg>( - out_ctx_params.numa_staged_memory->get()); - main_func.template set_arg<typename kernel_func_type::event_prod_bytes_arg>( - out_ctx_params.staged_memory_size_counter->get()); - - main_func.template set_arg<typename kernel_func_type::event_consumed_bytes_offset_arg>( 
- out_ctx_params.producer_aggregated_memory_offset->get()); - main_func.template set_arg<typename kernel_func_type::event_consumed_chunk_mem_arg>( - out_ctx_params.total_producers_aggregated_memory->get()); - main_func.template set_arg<typename kernel_func_type::event_consumed_bytes_arg>( - out_ctx_params.total_producers_aggregated_size_counter->get()); + + main_func.bind_data(out_ctx_params); return main_func; } diff --git a/src/common/comm/l0/devices/ccl_scaleout_proxy.hpp b/src/common/comm/l0/devices/ccl_scaleout_proxy.hpp index 2cd6ce8f2..e08545b53 100644 --- a/src/common/comm/l0/devices/ccl_scaleout_proxy.hpp +++ b/src/common/comm/l0/devices/ccl_scaleout_proxy.hpp @@ -22,7 +22,7 @@ #include "common/comm/l0/devices/ccl_gpu_base_comm.hpp" #include "common/comm/l0/devices/proxy_observer_types.hpp" -#include "common/comm/l0/context/scaling_ctx/observer_session_key.hpp" +#include "common/comm/l0/context/scale/base/base_session.hpp" namespace native { @@ -52,12 +52,8 @@ class ccl_scaleout_proxy template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode> using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::scale_out_cpu_gw_class; - template <ccl_coll_type algo_type, - ccl::group_split_type group, - ccl::device_topology_type mode, - class kernel_params> - using gpu_kernel_t = - typename kernel_class_t<algo_type, group, mode>::template kernel_t<kernel_params>; + template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode> + using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t; static constexpr const char* name_impl() { return "SCALE_OUT_PROXY"; @@ -79,13 +75,12 @@ class ccl_scaleout_proxy template <ccl_coll_type module_type, ccl::group_split_type group_id, - ccl::device_topology_type class_id, - class kernel_params> - gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() { + ccl::device_topology_type class_id> + gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) { auto& ptr = wrapped_gpu_comm.template get_gpu_module<module_type, group_id, class_id>(); using requested_class = kernel_class_t<module_type, group_id, class_id>; - return ptr.template get_class<requested_class>().template get<kernel_params>(); + return ptr.template get_class<requested_class>().get(params); } template <ccl::group_split_type group_id, ccl::device_topology_type class_id> @@ -93,43 +88,32 @@ class ccl_scaleout_proxy return wrapped_gpu_comm.template get_comm_data<group_id, class_id>(); } - template <class kernel_params, - ccl::group_split_type group_id, + template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class gpu_entry, class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type> - gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>& register_entry( - gpu_entry& entry) { + gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) { const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>(); LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string()); - using kernel_func_type = gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>; + using kernel_func_type = gpu_kernel_t<gpu_entry::type(), group_id, class_id>; kernel_func_type& main_func = - get_gpu_kernel<gpu_entry::type(), group_id, class_id, kernel_params>(); + get_gpu_kernel<gpu_entry::type(), group_id, 
class_id>(entry.get_params()); main_func.set_rank(comm_addr.rank); main_func.set_size(comm_addr.size); // alloc shared data structure to notify host side with device parital result - observer::invoke_params<gpu_entry::type(), kernel_params> params = entry.get_numa_data(); + observer::invoke_params<gpu_entry::type()> params = entry.get_scaleout_data(); // invoke host-side context creation - this->template invoke<group_id, class_id>(entry.get_numa_session_key(), params); + this->template invoke<group_id, class_id>(entry.get_scaleout_session_key(), params); // bind shared data to kernel const auto& out_ctx_params = params.get_ctx_params(); - main_func.template set_arg<typename kernel_func_type::event_prod_chunk_mem_arg>( - out_ctx_params.numa_staged_memory->get()); - main_func.template set_arg<typename kernel_func_type::event_prod_bytes_arg>( - out_ctx_params.staged_memory_size_counter->get()); - - main_func.template set_arg<typename kernel_func_type::event_consumed_bytes_offset_arg>( - out_ctx_params.producer_aggregated_memory_offset->get()); - main_func.template set_arg<typename kernel_func_type::event_consumed_chunk_mem_arg>( - out_ctx_params.total_producers_aggregated_memory->get()); - main_func.template set_arg<typename kernel_func_type::event_consumed_bytes_arg>( - out_ctx_params.total_producers_aggregated_size_counter->get()); + + main_func.bind_data(out_ctx_params); return main_func; } @@ -168,10 +152,9 @@ class ccl_scaleout_proxy<ccl_numa_proxy<device_t>> template <ccl_coll_type algo_type, ccl::group_split_type group_id, - ccl::device_topology_type class_id, - class kernel_params> - using gpu_kernel_t = typename gpu_module_t<algo_type, group_id, class_id>:: - scale_out_cpu_gw_class::template kernel_t<kernel_params>; + ccl::device_topology_type class_id> + using gpu_kernel_t = + typename gpu_module_t<algo_type, group_id, class_id>::scale_out_cpu_gw_class::kernel_t; //using ctx_ptr = std::weak_ptr<scale_up_ctx_t>; using device_impl_t = ccl_numa_proxy<device_t>; @@ -196,13 +179,11 @@ class ccl_scaleout_proxy<ccl_numa_proxy<device_t>> template <ccl_coll_type module_type, ccl::group_split_type group_id, - ccl::device_topology_type class_id, - class kernel_params> - gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() { + ccl::device_topology_type class_id> + gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) { this->template invoke<group_id>(); - return wrapped_gpu_comm - .template get_gpu_kernel<module_type, group_id, class_id, kernel_params>(); + return wrapped_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>(params); } template <ccl::group_split_type group_id, ccl::device_topology_type class_id> @@ -210,17 +191,15 @@ class ccl_scaleout_proxy<ccl_numa_proxy<device_t>> return wrapped_gpu_comm.template get_comm_data<group_id, class_id>(); } - template <class kernel_params, - ccl::group_split_type group_id, + template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class gpu_entry, class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type> - gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>& register_entry( - gpu_entry& entry) { + gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) { const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>(); LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string()); - auto& main_func = get_gpu_kernel<gpu_entry::type(), 
group_id, class_id, kernel_params>(); + auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params()); main_func.set_rank(comm_addr.rank); main_func.set_size(comm_addr.size); return main_func; @@ -258,10 +237,9 @@ class ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<device_t>> template <ccl_coll_type algo_type, ccl::group_split_type group_id, - ccl::device_topology_type class_id, - class kernel_params> - using gpu_kernel_t = typename gpu_module_t<algo_type, group_id, class_id>:: - scale_out_cpu_gw_class::template kernel_t<kernel_params>; + ccl::device_topology_type class_id> + using gpu_kernel_t = + typename gpu_module_t<algo_type, group_id, class_id>::scale_out_cpu_gw_class::kernel_t; //using ctx_ptr = std::weak_ptr<scale_up_ctx_t>; using device_impl_t = ccl_gpu_scaleup_proxy<device_t>; @@ -286,13 +264,11 @@ class ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<device_t>> template <ccl_coll_type module_type, ccl::group_split_type group_id, - ccl::device_topology_type class_id, - class kernel_params> - gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() { + ccl::device_topology_type class_id> + gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) { this->template invoke<group_id>(); - return wrapped_gpu_comm - .template get_gpu_kernel<module_type, group_id, class_id, kernel_params>(); + return wrapped_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>(params); } template <ccl::group_split_type group_id, ccl::device_topology_type class_id> @@ -300,17 +276,15 @@ class ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<device_t>> return wrapped_gpu_comm.template get_comm_data<group_id, class_id>(); } - template <class kernel_params, - ccl::group_split_type group_id, + template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class gpu_entry, class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type> - gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>& register_entry( - gpu_entry& entry) { + gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) { const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>(); LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string()); - auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id, kernel_params>(); + auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params()); main_func.set_rank(comm_addr.rank); main_func.set_size(comm_addr.size); return main_func; @@ -346,13 +320,9 @@ class ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>> using gpu_module_t = typename device_t::template gpu_module_t<algo_type, group, mode>; //same as in-process GPU - template <ccl_coll_type algo_type, - ccl::group_split_type group, - ccl::device_topology_type mode, - class kernel_params> + template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode> using gpu_kernel_t = - typename gpu_module_t<algo_type, group, mode>::scale_out_cpu_gw_class::template kernel_t< - kernel_params>; + typename gpu_module_t<algo_type, group, mode>::scale_out_cpu_gw_class::kernel_t; //using ctx_ptr = std::weak_ptr<scale_up_ctx_t>; using device_impl_t = ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>; @@ -377,13 +347,11 @@ class ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>> template <ccl_coll_type module_type, ccl::group_split_type group_id, - 
ccl::device_topology_type class_id, - class kernel_params> - gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() { + ccl::device_topology_type class_id> + gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) { this->template invoke<group_id>(); - return wrapped_gpu_comm - .template get_gpu_kernel<module_type, group_id, class_id, kernel_params>(); + return wrapped_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>(params); } template <ccl::group_split_type group_id, ccl::device_topology_type class_id> @@ -391,17 +359,15 @@ class ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>> return wrapped_gpu_comm.template get_comm_data<group_id, class_id>(); } - template <class kernel_params, - ccl::group_split_type group_id, + template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class gpu_entry, class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type> - gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>& register_entry( - gpu_entry& entry) { + gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) { const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>(); LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string()); - auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id, kernel_params>(); + auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params()); main_func.set_rank(comm_addr.rank); main_func.set_size(comm_addr.size); return main_func; diff --git a/src/common/comm/l0/devices/ccl_virtual_gpu_comm.hpp b/src/common/comm/l0/devices/ccl_virtual_gpu_comm.hpp index a6cc90ccd..6334b56a4 100644 --- a/src/common/comm/l0/devices/ccl_virtual_gpu_comm.hpp +++ b/src/common/comm/l0/devices/ccl_virtual_gpu_comm.hpp @@ -33,12 +33,8 @@ class ccl_virtual_gpu_comm : public ccl_gpu_base_comm<ccl_virtual_gpu_comm, gpu_ template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode> using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class; - template <ccl_coll_type algo_type, - ccl::group_split_type group, - ccl::device_topology_type mode, - class kernel_params> - using gpu_kernel_t = - typename kernel_class_t<algo_type, group, mode>::template kernel_t<kernel_params>; + template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode> + using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t; using supported_modules = supported_device_modules<gpu_module_t>; @@ -69,25 +65,20 @@ class ccl_virtual_gpu_comm : public ccl_gpu_base_comm<ccl_virtual_gpu_comm, gpu_ template <ccl_coll_type module_type, ccl::group_split_type group_id, - ccl::device_topology_type class_id, - class kernel_params> - gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() { + ccl::device_topology_type class_id> + gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) { auto& ptr = get_gpu_module<module_type, group_id, class_id>(); using requested_class = kernel_class_t<module_type, group_id, class_id>; - return ptr.template get_class<requested_class>().template get<kernel_params>(); + return ptr.template get_class<requested_class>().get(params); } - template <class kernel_params, - ccl::group_split_type group_id, - ccl::device_topology_type class_id, - class gpu_entry> - 
gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>& register_entry( - gpu_entry& entry) { + template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class gpu_entry> + gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) { const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>(); LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string()); - auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id, kernel_params>(); + auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params()); main_func.set_rank(comm_addr.rank); main_func.set_size(comm_addr.size); return main_func; diff --git a/src/common/comm/l0/modules/a2a/allreduce_export_functions.hpp b/src/common/comm/l0/modules/a2a/allreduce_export_functions.hpp index 322b0c6b7..392d9c316 100644 --- a/src/common/comm/l0/modules/a2a/allreduce_export_functions.hpp +++ b/src/common/comm/l0/modules/a2a/allreduce_export_functions.hpp @@ -18,294 +18,219 @@ namespace native { -template <class kernel_params> -struct a2a_allreduce_kernel - : public execution_kernel< - a2a_allreduce_kernel<kernel_params>, - arg<main_kernel_args::args_start_index, size_t>, - arg<main_kernel_args::args_start_index + 1, typename kernel_params::native_type*>, - arg<main_kernel_args::args_start_index + 2, typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 3, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 4, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 5, int*>, - arg<main_kernel_args::args_start_index + 6, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 7, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 8, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 9, int*>> { - using param_t = kernel_params; - using processing_type = typename kernel_params::native_type; +namespace a2a { - static constexpr const char* specific_name() { - return "allreduce_execution"; - } +namespace allreduce { - //own - using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>; - using send_buf_size_arg_type = typename send_buf_size_arg::arg_type; +/** + * Common args for all kernel types + */ - using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>; - using send_buf_arg_type = typename send_buf_arg::arg_type; +// own +using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>; +using send_buf_size_arg_type = typename send_buf_size_arg::arg_type; - using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>; - using recv_buf_arg_type = typename recv_buf_arg::arg_type; +template <class native_t> +using send_buf_arg = arg<main_kernel_args::args_start_index + 1, native_t*>; - using tmp_recv_buf_arg = - thread_safe_arg<main_kernel_args::args_start_index + 3, processing_type*>; - using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type; +template <class native_t> +using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, native_t*>; - using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 4, int*>; - using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; +template <class native_t> +using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 3, native_t*>; - using ready_to_recv_flag_arg = 
thread_safe_arg<main_kernel_args::args_start_index + 5, int*>; - using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; +using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 4, int*>; +using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; - using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>; - using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type; +using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 5, int*>; +using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; - //right - using right_tmp_recv_buf_arg = - thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>; - using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type; +using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>; +using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type; - /* using right_recv_buf_arg = thread_safe_arg<main_kernel_args::args_start_index + 8, void *>; - using right_recv_buf_arg_type = typename right_recv_buf_arg::arg_type; -*/ - using right_income_data_flag_arg = - thread_safe_arg<main_kernel_args::args_start_index + 8, int*>; - using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type; +// right +template <class native_t> +using right_tmp_recv_buf_arg = + thread_exchangable_arg<main_kernel_args::args_start_index + 7, native_t*>; + +using right_income_data_flag_arg = + thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>; + +using right_ready_to_recv_flag_arg = + thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>; - using right_ready_to_recv_flag_arg = - thread_safe_arg<main_kernel_args::args_start_index + 9, int*>; - using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type; +// IMPORTANT: the number and types of arguments must be the same in all classes, +// excluding arguments specific for numa/scaleout etc. 
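+// Illustrative sketch (an assumption pieced together from get_gpu_kernel() and
+// set_arg() elsewhere in this patch, not code this patch adds): with
+// processing_type erased to void, the element type is chosen at runtime from
+// coll_param_gpu instead of a kernel_params template argument, roughly:
+//
+//   coll_param_gpu params(ccl_coll_allreduce, ccl::datatype::float32,
+//                         ccl::reduction::sum);
+//   auto& kernel = gpu_comm.get_gpu_kernel<ccl_coll_allreduce, group_id, class_id>(params);
+//   kernel.set_arg<send_buf_arg<void>>(send_ptr);
+//
+// 'gpu_comm', 'group_id', 'class_id' and 'send_ptr' are placeholders for a
+// concrete device communicator, topology parameters and a device pointer.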
+struct main_kernel : public execution_kernel<main_kernel, + send_buf_size_arg, + send_buf_arg<void>, + recv_buf_arg<void>, + tmp_recv_buf_arg<void>, + income_data_flag_arg, + ready_to_recv_flag_arg, + local_barrier_flag_arg, + right_tmp_recv_buf_arg<void>, + right_income_data_flag_arg, + right_ready_to_recv_flag_arg> { + using processing_type = void; - using base = execution_kernel<ring_allreduce_kernel<kernel_params>, + static constexpr const char* specific_name() { + return "allreduce_execution"; + } + + using common_entry_buf_size_arg = send_buf_size_arg; + using common_entry_buf_arg = send_buf_arg<processing_type>; + + using base = execution_kernel<main_kernel, send_buf_size_arg, - send_buf_arg, - recv_buf_arg, - tmp_recv_buf_arg, + send_buf_arg<processing_type>, + recv_buf_arg<processing_type>, + tmp_recv_buf_arg<processing_type>, income_data_flag_arg, ready_to_recv_flag_arg, local_barrier_flag_arg, - right_tmp_recv_buf_arg, + right_tmp_recv_buf_arg<processing_type>, right_income_data_flag_arg, right_ready_to_recv_flag_arg>; + + using base::base; }; -template <class kernel_params> -struct a2a_allreduce_numa_kernel - : public execution_kernel< - a2a_allreduce_numa_kernel<kernel_params>, - arg<main_kernel_args::args_start_index, size_t>, - arg<main_kernel_args::args_start_index + 1, typename kernel_params::native_type*>, - arg<main_kernel_args::args_start_index + 2, typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 3, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 4, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 5, int*>, - arg<main_kernel_args::args_start_index + 6, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 7, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 8, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 9, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 10, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 11, int*>> { - using param_t = kernel_params; - using processing_type = typename kernel_params::native_type; +struct numa_kernel + : public execution_kernel<numa_kernel, + send_buf_size_arg, + send_buf_arg<void>, + recv_buf_arg<void>, + tmp_recv_buf_arg<void>, + income_data_flag_arg, + ready_to_recv_flag_arg, + local_barrier_flag_arg, + right_tmp_recv_buf_arg<void>, + right_income_data_flag_arg, + right_ready_to_recv_flag_arg, + + // numa-specific args + permanent_arg<main_kernel_args::args_start_index + 10, void*>, + permanent_arg<main_kernel_args::args_start_index + 11, int*>> { + using processing_type = void; static constexpr const char* specific_name() { return "allreduce_execution_numa"; } - //own - using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>; - using send_buf_size_arg_type = typename send_buf_size_arg::arg_type; - - using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>; - using send_buf_arg_type = typename send_buf_arg::arg_type; - - using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>; - using recv_buf_arg_type = typename recv_buf_arg::arg_type; - - using tmp_recv_buf_arg = - thread_safe_arg<main_kernel_args::args_start_index + 3, processing_type*>; - using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type; - - using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 4, int*>; - using 
income_data_flag_arg_type = typename income_data_flag_arg::arg_type; - - using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, int*>; - using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; - - using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>; - using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type; - - //right - using right_tmp_recv_buf_arg = - thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>; - using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type; - - /* using right_recv_buf_arg = thread_safe_arg<main_kernel_args::args_start_index + 8, void *>; - using right_recv_buf_arg_type = typename right_recv_buf_arg::arg_type; -*/ - using right_income_data_flag_arg = - thread_safe_arg<main_kernel_args::args_start_index + 8, int*>; - using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type; - - using right_ready_to_recv_flag_arg = - thread_safe_arg<main_kernel_args::args_start_index + 9, int*>; - using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type; + using common_entry_buf_size_arg = send_buf_size_arg; + using common_entry_buf_arg = send_buf_arg<processing_type>; // event data using event_prod_chunk_mem_arg = - thread_safe_arg<main_kernel_args::args_start_index + 10, processing_type*>; + permanent_arg<main_kernel_args::args_start_index + 10, processing_type*>; using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type; - using event_prod_bytes_arg = thread_safe_arg<main_kernel_args::args_start_index + 11, int*>; + using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 11, int*>; using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type; - using base = execution_kernel<a2a_allreduce_numa_kernel<kernel_params>, + using base = execution_kernel<numa_kernel, send_buf_size_arg, - send_buf_arg, - recv_buf_arg, - tmp_recv_buf_arg, + send_buf_arg<processing_type>, + recv_buf_arg<processing_type>, + tmp_recv_buf_arg<processing_type>, income_data_flag_arg, ready_to_recv_flag_arg, local_barrier_flag_arg, - right_tmp_recv_buf_arg, + right_tmp_recv_buf_arg<processing_type>, right_income_data_flag_arg, right_ready_to_recv_flag_arg, event_prod_chunk_mem_arg, event_prod_bytes_arg>; + + using base::base; }; -template <class kernel_params> -struct a2a_allreduce_ipc - : public ipc_kernel<a2a_allreduce_ipc<kernel_params>, - stub_arg<main_kernel_args::args_start_index>, - stub_arg<main_kernel_args::args_start_index + 1>, - stub_arg<main_kernel_args::args_start_index + 2>, - thread_safe_arg<main_kernel_args::args_start_index + 3, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 4, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 5, int*>, - stub_arg<main_kernel_args::args_start_index + 6>, - stub_arg<main_kernel_args::args_start_index + 7>, - stub_arg<main_kernel_args::args_start_index + 8>, - stub_arg<main_kernel_args::args_start_index + 9>> { - using param_t = kernel_params; - using processing_type = typename kernel_params::native_type; +struct ipc_kernel : public base_ipc_kernel<ipc_kernel, + stub_arg<main_kernel_args::args_start_index>, + stub_arg<main_kernel_args::args_start_index + 1>, + stub_arg<main_kernel_args::args_start_index + 2>, + tmp_recv_buf_arg<void>, + income_data_flag_arg, + ready_to_recv_flag_arg, + stub_arg<main_kernel_args::args_start_index + 6>, + 
stub_arg<main_kernel_args::args_start_index + 7>, + stub_arg<main_kernel_args::args_start_index + 8>, + stub_arg<main_kernel_args::args_start_index + 9>> { + using processing_type = void; static constexpr const char* specific_name() { return "a2a_allreduce_ipc"; } - using tmp_recv_buf_arg = typename ring_allreduce_kernel<kernel_params>::tmp_recv_buf_arg; - using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type; - - using income_data_flag_arg = - typename ring_allreduce_kernel<kernel_params>::income_data_flag_arg; - using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; - - using ready_to_recv_flag_arg = - typename ring_allreduce_kernel<kernel_params>::ready_to_recv_flag_arg; - using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; + using common_entry_buf_size_arg = send_buf_size_arg; + using common_entry_buf_arg = send_buf_arg<processing_type>; + + using base = base_ipc_kernel<ipc_kernel, + stub_arg<main_kernel_args::args_start_index>, + stub_arg<main_kernel_args::args_start_index + 1>, + stub_arg<main_kernel_args::args_start_index + 2>, + tmp_recv_buf_arg<processing_type>, + income_data_flag_arg, + ready_to_recv_flag_arg, + stub_arg<main_kernel_args::args_start_index + 6>, + stub_arg<main_kernel_args::args_start_index + 7>, + stub_arg<main_kernel_args::args_start_index + 8>, + stub_arg<main_kernel_args::args_start_index + 9>>; + + using base::base; +}; - using base = execution_kernel<a2a_allreduce_ipc<kernel_params>, - stub_arg<main_kernel_args::args_start_index>, - stub_arg<main_kernel_args::args_start_index + 1>, - stub_arg<main_kernel_args::args_start_index + 2>, - tmp_recv_buf_arg, +struct scale_out_cpu_gw_kernel + : public execution_kernel<scale_out_cpu_gw_kernel, + send_buf_size_arg, + send_buf_arg<void>, + recv_buf_arg<void>, + tmp_recv_buf_arg<void>, income_data_flag_arg, ready_to_recv_flag_arg, - stub_arg<main_kernel_args::args_start_index + 6>, - stub_arg<main_kernel_args::args_start_index + 7>, - stub_arg<main_kernel_args::args_start_index + 8>, - stub_arg<main_kernel_args::args_start_index + 9>>; -}; + local_barrier_flag_arg, + right_tmp_recv_buf_arg<void>, + right_income_data_flag_arg, + right_ready_to_recv_flag_arg, -template <class native_type> -struct a2a_allreduce_scale_out_cpu_gw_kernel - : public execution_kernel< - a2a_allreduce_scale_out_cpu_gw_kernel<native_type>, - arg<main_kernel_args::args_start_index, size_t>, - arg<main_kernel_args::args_start_index + 1, native_type*>, - arg<main_kernel_args::args_start_index + 2, native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 3, native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 4, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 5, int*>, - arg<main_kernel_args::args_start_index + 6, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 7, native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 8, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 9, int*>, - - thread_safe_arg<main_kernel_args::args_start_index + 10, native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 11, int*>> { - using processing_type = native_type; + // scaleout-specific args + permanent_arg<main_kernel_args::args_start_index + 10, void*>, + permanent_arg<main_kernel_args::args_start_index + 11, int*>> { + using processing_type = void; static constexpr const char* specific_name() { return "allreduce_execution_scale_out_cpu_gw"; } - //own - using send_buf_size_arg = 
arg<main_kernel_args::args_start_index, size_t>; - using send_buf_size_arg_type = typename send_buf_size_arg::arg_type; - - using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>; - using send_buf_arg_type = typename send_buf_arg::arg_type; - - using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>; - using recv_buf_arg_type = typename recv_buf_arg::arg_type; - - using tmp_recv_buf_arg = - thread_safe_arg<main_kernel_args::args_start_index + 3, processing_type*>; - using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type; - - using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 4, int*>; - using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; - - using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, int*>; - using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; - - using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>; - using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type; - - //right - using right_tmp_recv_buf_arg = - thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>; - using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type; - - using right_income_data_flag_arg = - thread_safe_arg<main_kernel_args::args_start_index + 8, int*>; - using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type; - - using right_ready_to_recv_flag_arg = - thread_safe_arg<main_kernel_args::args_start_index + 9, int*>; - using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type; + using common_entry_buf_size_arg = send_buf_size_arg; + using common_entry_buf_arg = send_buf_arg<processing_type>; // event data using event_prod_chunk_mem_arg = - thread_safe_arg<main_kernel_args::args_start_index + 10, native_type*>; + permanent_arg<main_kernel_args::args_start_index + 10, processing_type*>; using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type; - using event_prod_bytes_arg = thread_safe_arg<main_kernel_args::args_start_index + 11, int*>; + using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 11, int*>; using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type; - using base = execution_kernel<a2a_allreduce_scale_out_cpu_gw_kernel<native_type>, + using base = execution_kernel<scale_out_cpu_gw_kernel, send_buf_size_arg, - send_buf_arg, - recv_buf_arg, - tmp_recv_buf_arg, + send_buf_arg<processing_type>, + recv_buf_arg<processing_type>, + tmp_recv_buf_arg<processing_type>, income_data_flag_arg, ready_to_recv_flag_arg, local_barrier_flag_arg, - right_tmp_recv_buf_arg, + right_tmp_recv_buf_arg<processing_type>, right_income_data_flag_arg, right_ready_to_recv_flag_arg, event_prod_chunk_mem_arg, event_prod_bytes_arg>; + + using base::base; }; +} // namespace allreduce +} // namespace a2a } // namespace native diff --git a/src/common/comm/l0/modules/a2a/allreduce_module.hpp b/src/common/comm/l0/modules/a2a/allreduce_module.hpp index 25f9f2405..41c3eb648 100644 --- a/src/common/comm/l0/modules/a2a/allreduce_module.hpp +++ b/src/common/comm/l0/modules/a2a/allreduce_module.hpp @@ -22,22 +22,22 @@ DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module, real_gpu_typed_module, ccl_coll_allreduce, ccl::device_topology_type::a2a, - a2a_allreduce_kernel, - a2a_allreduce_numa_kernel, - a2a_allreduce_scale_out_cpu_gw_kernel); + 
a2a::allreduce::main_kernel, + a2a::allreduce::numa_kernel, + a2a::allreduce::scale_out_cpu_gw_kernel); DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module, ipc_gpu_typed_module, ccl_coll_allreduce, ccl::device_topology_type::a2a, - a2a_allreduce_ipc, - a2a_allreduce_ipc, - a2a_allreduce_ipc); + a2a::allreduce::ipc_kernel, + a2a::allreduce::ipc_kernel, + a2a::allreduce::ipc_kernel); DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_allreduce, ccl::device_topology_type::a2a, - a2a_allreduce_kernel, - a2a_allreduce_numa_kernel, - a2a_allreduce_scale_out_cpu_gw_kernel); + a2a::allreduce::main_kernel, + a2a::allreduce::numa_kernel, + a2a::allreduce::scale_out_cpu_gw_kernel); } // namespace native diff --git a/src/common/comm/l0/modules/gpu_typed_module.hpp b/src/common/comm/l0/modules/gpu_typed_module.hpp index 744d47be9..9825a527e 100644 --- a/src/common/comm/l0/modules/gpu_typed_module.hpp +++ b/src/common/comm/l0/modules/gpu_typed_module.hpp @@ -26,11 +26,11 @@ namespace native { template <ccl_coll_type type, - template <typename> + // template <typename> class kernel_function_impl, - template <typename> + // template <typename> class kernel_numa_function_impl, - template <typename> + // template <typename> class kernel_scale_out_cpu_gw_function_impl> struct real_gpu_typed_module : private gpu_module_base, public kernel_class<type, kernel_function_impl>, @@ -49,17 +49,25 @@ struct real_gpu_typed_module : private gpu_module_base, ccl_coll_type_to_str(type), ", modules handle: ", (void*)module); - ccl_tuple_for_each(main_class::value, - detail::kernel_entry_initializer<type>( - [this](const std::string& name) -> gpu_module_base::kernel_handle { - return this->import_kernel(name); - })); - ccl_tuple_for_each(numa_class::value, + // TODO: is there a nicer way to iterate? 
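+        // One possible answer to the TODO above (an untested sketch): hoist
+        // the initializer out of the loop, since it only captures the loader:
+        //
+        //   detail::kernel_entry_initializer<type> init(
+        //       [this](const std::string& name) -> gpu_module_base::kernel_handle {
+        //           return this->import_kernel(name);
+        //       });
+        //   for (auto&& kernel_node : main_class::value)
+        //       init(kernel_node.second);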
+ for (auto&& kernel_node : main_class::value) { + detail::kernel_entry_initializer<type>( + [this](const std::string& name) -> gpu_module_base::kernel_handle { + return this->import_kernel(name); + })(kernel_node.second); + } + // ccl_tuple_for_each(main_class::value, + // detail::kernel_entry_initializer<type>( + // [this](const std::string& name) -> gpu_module_base::kernel_handle { + // return this->import_kernel(name); + // })); + + /*ccl_tuple_for_each(numa_class::value, detail::kernel_entry_initializer<type>( [this](const std::string& name) -> gpu_module_base::kernel_handle { return this->import_kernel(name); - })); + }));*/ LOG_DEBUG("Imported functions count: ", functions.size()); } @@ -86,11 +94,11 @@ struct real_gpu_typed_module : private gpu_module_base, //2) virtual ipc_gpu_typed_module template <ccl_coll_type type, - template <typename> + // template <typename> class kernel_function_impl, - template <typename> + // template <typename> class kernel_numa_function_impl, - template <typename> + // template <typename> class kernel_scale_out_cpu_gw_function_impl> struct ipc_gpu_typed_module : private gpu_module_base, public kernel_class<type, kernel_function_impl> { @@ -102,11 +110,17 @@ struct ipc_gpu_typed_module : private gpu_module_base, ipc_gpu_typed_module(handle module_handle) : gpu_module_base(nullptr) { LOG_DEBUG("Remote gpu module created: ", ccl_coll_type_to_str(type)); - ccl_tuple_for_each(main_class::value, - detail::kernel_entry_initializer<type>( - [](const std::string& name) -> gpu_module_base::kernel_handle { - return nullptr; - })); + // ccl_tuple_for_each(main_class::value, + // detail::kernel_entry_initializer<type>( + // [](const std::string& name) -> gpu_module_base::kernel_handle { + // return nullptr; + // })); + for (auto&& kernel : main_class::value) { + detail::kernel_entry_initializer<type>( + [](const std::string& name) -> gpu_module_base::kernel_handle { + return nullptr; + })(kernel.second); + } LOG_DEBUG("No need to import functions"); } @@ -123,11 +137,11 @@ struct ipc_gpu_typed_module : private gpu_module_base, //3) virtual gpu module template <ccl_coll_type type, - template <typename> + // template <typename> class kernel_function_impl, - template <typename> + // template <typename> class kernel_numa_function_impl, - template <typename> + // template <typename> class kernel_scale_out_cpu_gw_function_impl> struct virtual_gpu_typed_module : private gpu_module_base, public kernel_class<type, kernel_function_impl>, @@ -151,16 +165,22 @@ struct virtual_gpu_typed_module : private gpu_module_base, : gpu_module_base(real_module->get()), real_module_ref(real_module) { LOG_DEBUG("Virtual gpu module created:", ccl_coll_type_to_str(type)); - ccl_tuple_for_each(main_class::value, - detail::kernel_entry_initializer<type>( - [this](const std::string& name) -> gpu_module_base::kernel_handle { - return this->import_kernel(name); - })); - ccl_tuple_for_each(numa_class::value, + // ccl_tuple_for_each(main_class::value, + // detail::kernel_entry_initializer<type>( + // [this](const std::string& name) -> gpu_module_base::kernel_handle { + // return this->import_kernel(name); + // })); + for (auto&& kernel : main_class::value) { + detail::kernel_entry_initializer<type>( + [this](const std::string& name) -> gpu_module_base::kernel_handle { + return this->import_kernel(name); + })(kernel.second); + } + /*ccl_tuple_for_each(numa_class::value, detail::kernel_entry_initializer<type>( [this](const std::string& name) -> gpu_module_base::kernel_handle { return 
this->import_kernel(name); - })); + }));*/ LOG_DEBUG("Linked functions count: ", functions.size()); }
diff --git a/src/common/comm/l0/modules/kernel_argument_policies.hpp b/src/common/comm/l0/modules/kernel_argument_policies.hpp
index 70c1a8b85..48b2b0f53 100644
--- a/src/common/comm/l0/modules/kernel_argument_policies.hpp
+++ b/src/common/comm/l0/modules/kernel_argument_policies.hpp
@@ -88,8 +88,8 @@ struct arg_access_policy_atomic { std::atomic<bool> charged{ false }; }; -// Policy that invalidates the value once it's loaded by a consumer. It remains invalid for read untill a producer -// writes an new one +// Policy that invalidates the value once it's loaded by a consumer. +// It remains invalid for read until a producer writes a new one // Note: only one read/invalidate is supported template <size_t pos, class ArgType, bool must_exist = true> struct arg_access_policy_atomic_reset : public arg_access_policy_atomic<pos, ArgType, must_exist> {
diff --git a/src/common/comm/l0/modules/kernel_argument_types.hpp b/src/common/comm/l0/modules/kernel_argument_types.hpp
index ec70d42bb..b852811bc 100644
--- a/src/common/comm/l0/modules/kernel_argument_types.hpp
+++ b/src/common/comm/l0/modules/kernel_argument_types.hpp
@@ -49,7 +49,8 @@ struct kernel_arg : public policy_impl, options { template <size_t pos, class type, class options = options::empty> using thread_safe_arg = kernel_arg<pos, arg_access_policy_atomic<pos, type, false>, options>; -// thread-safe destructive-copying argument (rechargable): used for concurrent read/write applications, where reader take-away exising value +// thread-safe destructive-copying argument (rechargeable): used for concurrent +// read/write applications, where the reader takes away the existing value template <size_t pos, class type, class options = options::empty> using thread_exchangable_arg = kernel_arg<pos, arg_access_policy_atomic_reset<pos, type, false>, options>;
diff --git a/src/common/comm/l0/modules/kernel_class.hpp b/src/common/comm/l0/modules/kernel_class.hpp
index 92c0ccfa9..669e8ade5 100644
--- a/src/common/comm/l0/modules/kernel_class.hpp
+++ b/src/common/comm/l0/modules/kernel_class.hpp
@@ -15,142 +15,105 @@ */ #pragma once #include <tuple> -#include "common/comm/l0/modules/kernel_params.hpp" #include "common/utils/tuple.hpp" +#include <unordered_map> namespace native { -#define SUPPORTED_KERNEL_NATIVE_DATA_TYPES \ - int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t, ccl::float16, float, \ - double, ccl::bfloat16 -template <ccl_coll_type type, template <typename> class kernel_function_impl> +template <ccl_coll_type type, class kernel_function_impl, class Enable = void> struct kernel_class { - template <class native_data_type> - using kernel_param_t = kernel_params_default<native_data_type>; - - template <class kernel_param> - using kernel_t = kernel_function_impl<kernel_param>; - - template <class... native_data_types> - using kernels_t = std::tuple<kernel_t<kernel_param_t<native_data_types>>...>; - - using kernel_class_container_t = kernels_t<SUPPORTED_KERNEL_NATIVE_DATA_TYPES>; - - // getter - template <class kernel_param> - const kernel_t<kernel_param> &get() const { - return ccl_tuple_get<kernel_t<kernel_param>>(value); - } - - template <class kernel_param> - kernel_t<kernel_param> &get() { - return ccl_tuple_get<kernel_t<kernel_param>>(value); + using kernel_t = kernel_function_impl; + + using key_type = ccl::datatype; + + struct hasher { + size_t operator()(const ccl::datatype& dtype) const { + return std::hash<size_t>{}((size_t)dtype); + } + }; + + using kernel_class_container_t = std::unordered_map<key_type, kernel_t, hasher>; + + kernel_class() { + for (ccl::datatype idx = ccl::datatype::int8; idx <= ccl::datatype::bfloat16; idx++) { + key_type key{ idx }; + // Have to use this ugly in-place construction because kernel_t has deleted copy and move + // constructors and there is no other way to do that. + value.emplace(std::piecewise_construct, + std::make_tuple(key), + std::make_tuple(coll_param_gpu(type, idx))); + } } - -protected: - kernel_class_container_t value; -}; - -template <template <typename> class kernel_function_impl> -struct kernel_class<ccl_coll_allreduce, kernel_function_impl> { - template <class native_data_type, ccl_coll_reduction reduction> - using kernel_param_t = kernel_reduction_params_traits<native_data_type, reduction>; - - template <class kernel_param> - using kernel_t = kernel_function_impl<kernel_param>; - - template <class first_param, ccl_coll_reduction... second_params> - using kernel_second_params_expanded_t = - std::tuple<kernel_t<kernel_param_t<first_param, second_params>>...>; - - template <class... first_params> - using kernel_first_param_expanded_t = decltype(std::tuple_cat( - std::declval<kernel_second_params_expanded_t<first_params, REDUCE_TYPES> &&>()...)); - - using kernel_class_container_t = - kernel_first_param_expanded_t<SUPPORTED_KERNEL_NATIVE_DATA_TYPES>; - // getter - template <class kernel_param> - const kernel_t<kernel_param> &get() const { - return ccl_tuple_get<kernel_t<kernel_param>>(value); - } + kernel_t& get(const coll_param_gpu& params) { + assert(!params.is_reduction()); + key_type key{ params.get_datatype() }; - template <class kernel_param> - kernel_t<kernel_param> &get() { - return ccl_tuple_get<kernel_t<kernel_param>>(value); - } + auto it = value.find(key); + if (it == value.end()) { + // TODO: sycl error + throw std::runtime_error("Kernel not found"); + } -protected: - kernel_class_container_t value; -}; - -template <template <typename> class kernel_function_impl> -struct kernel_class<ccl_coll_reduce, kernel_function_impl> { - template <class native_data_type, ccl_coll_reduction reduction> - using kernel_param_t = kernel_reduction_params_traits<native_data_type, reduction>; - - template <class kernel_param> - using kernel_t = kernel_function_impl<kernel_param>; - - template <class first_param, ccl_coll_reduction... second_params> - using kernel_second_params_expanded_t = - std::tuple<kernel_t<kernel_param_t<first_param, second_params>>...>; - - template <class... first_params> - using kernel_first_param_expanded_t = decltype(std::tuple_cat( - std::declval<kernel_second_params_expanded_t<first_params, REDUCE_TYPES> &&>()...)); - - using kernel_class_container_t = - kernel_first_param_expanded_t<SUPPORTED_KERNEL_NATIVE_DATA_TYPES>; - - // getter - template <class kernel_param> - const kernel_t<kernel_param> &get() const { - return ccl_tuple_get<kernel_t<kernel_param>>(value); - } - - template <class kernel_param> - kernel_t<kernel_param> &get() { - return ccl_tuple_get<kernel_t<kernel_param>>(value); + return it->second; } protected: kernel_class_container_t value; }; -template <template <typename> class kernel_function_impl> -struct kernel_class<ccl_coll_reduce_scatter, kernel_function_impl> { - template <class native_data_type, ccl_coll_reduction reduction> - using kernel_param_t = kernel_reduction_params_traits<native_data_type, reduction>; - - template <class kernel_param> - using kernel_t = kernel_function_impl<kernel_param>; - - template <class first_param, ccl_coll_reduction... second_params> - using kernel_second_params_expanded_t = - std::tuple<kernel_t<kernel_param_t<first_param, second_params>>...>; +template <ccl_coll_type type, class kernel_function_impl> +struct kernel_class<type, + kernel_function_impl, + typename std::enable_if<is_reduction_coll_type<type>::value>::type> { + using kernel_t = kernel_function_impl; + + using key_type = std::pair<ccl::datatype, ccl::reduction>; + + struct hasher { + size_t operator()(const std::pair<ccl::datatype, ccl::reduction>& key) const { + return std::hash<size_t>{}((size_t)key.first) ^ std::hash<size_t>{}((size_t)key.second); + } + }; + + using kernel_class_container_t = std::unordered_map<key_type, kernel_t, hasher>; + + kernel_class() { + for (ccl::datatype idx = ccl::datatype::int8; idx <= ccl::datatype::bfloat16; idx++) { + // TODO: allow iterating over reduction values (need to implement operator++) + auto insert_kernel = [this, idx](ccl::reduction red) { + key_type key{ idx, red }; + value.emplace(std::piecewise_construct, + std::make_tuple(key), + std::make_tuple(coll_param_gpu(type, idx, red))); + }; + + insert_kernel(ccl::reduction::sum); + insert_kernel(ccl::reduction::prod); + insert_kernel(ccl::reduction::min); + insert_kernel(ccl::reduction::max); + } + } - template <class... first_params> - using kernel_first_param_expanded_t = decltype(std::tuple_cat( - std::declval<kernel_second_params_expanded_t<first_params, REDUCE_TYPES> &&>()...)); + // getter + kernel_t& get(const coll_param_gpu& params) { + assert(params.is_reduction()); - using kernel_class_container_t = - kernel_first_param_expanded_t<SUPPORTED_KERNEL_NATIVE_DATA_TYPES>; + key_type key{ params.get_datatype(), params.get_reduction() }; - // getter - template <class kernel_param> - const kernel_t<kernel_param> &get() const { - return ccl_tuple_get<kernel_t<kernel_param>>(value); - } + auto it = value.find(key); + if (it == value.end()) { + // TODO: sycl error + throw std::runtime_error("Kernel not found"); + } - template <class kernel_param> - kernel_t<kernel_param> &get() { - return ccl_tuple_get<kernel_t<kernel_param>>(value); + return it->second; } protected: + // TODO: thread safety? Looks like this should be fine as different threads access different devices. + // Need to double check IPC/NUMA case.
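+    // If the IPC/NUMA double check above shows concurrent access after all, a
+    // guarded getter is one option (sketch only; 'guard' is a hypothetical
+    // member, not part of this patch):
+    //
+    //   std::mutex guard;
+    //   kernel_t& get(const coll_param_gpu& params) {
+    //       std::lock_guard<std::mutex> lock(guard);
+    //       // ... same lookup as above ...
+    //   }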
kernel_class_container_t value; }; + } //namespace native diff --git a/src/common/comm/l0/modules/kernel_functions.hpp b/src/common/comm/l0/modules/kernel_functions.hpp index 704438d6d..e3765ce91 100644 --- a/src/common/comm/l0/modules/kernel_functions.hpp +++ b/src/common/comm/l0/modules/kernel_functions.hpp @@ -15,6 +15,7 @@ */ #pragma once #include "common/comm/l0/modules/kernel_argument_types.hpp" +#include "coll/coll_param.hpp" namespace native { // kernel with its argument collection @@ -70,21 +71,37 @@ struct kernel_data_storage { // major kernel args enum main_kernel_args { rank_index = 0, size_index = 1, args_start_index }; +class kernel_parameters_holder { + coll_param_gpu params; + +public: + kernel_parameters_holder(const coll_param_gpu& params) : params{ params } {} + + const coll_param_gpu& get_kernel_params() const { + return params; + } +}; + //main kernel - used for GPU program execution template <class Impl, class... arguments> struct execution_kernel : public kernel_data_storage<arg<main_kernel_args::rank_index, int>, arg<main_kernel_args::size_index, int>, - arguments...> { + arguments...>, + public kernel_parameters_holder { using base = kernel_data_storage<arg<main_kernel_args::rank_index, int>, arg<main_kernel_args::size_index, int>, arguments...>; using base::args; using base::handle; + using params_base = kernel_parameters_holder; + + execution_kernel(const coll_param_gpu& params) : base{}, params_base{ params } {} + using rank_type = int; using size_type = int; - static constexpr const char* name() { + const char* name() { return Impl::specific_name(); } @@ -179,16 +196,22 @@ struct execution_kernel : public kernel_data_storage<arg<main_kernel_args::rank_ } }; -// ipc_kernel - used for GPU data synchronization only +// base_ipc_kernel - used for GPU data synchronization only template <class Impl, class... arguments> -struct ipc_kernel : public kernel_data_storage<arg<main_kernel_args::rank_index, int>, - arg<main_kernel_args::size_index, int>, - arguments...> { +struct base_ipc_kernel : public kernel_data_storage<arg<main_kernel_args::rank_index, int>, + arg<main_kernel_args::size_index, int>, + arguments...>, + public kernel_parameters_holder { using base = kernel_data_storage<arg<main_kernel_args::rank_index, int>, arg<main_kernel_args::size_index, int>, arguments...>; using base::args; using base::handle; + + using params_base = kernel_parameters_holder; + + base_ipc_kernel(const coll_param_gpu& params) : base{}, params_base{ params } {} + static constexpr const char* name() { return Impl::specific_name(); } diff --git a/src/common/comm/l0/modules/kernel_utils.cpp b/src/common/comm/l0/modules/kernel_utils.cpp new file mode 100644 index 000000000..ce1f4c5ae --- /dev/null +++ b/src/common/comm/l0/modules/kernel_utils.cpp @@ -0,0 +1,53 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#include "common/comm/l0/modules/kernel_utils.hpp" +#include "common/global/global.hpp" + +namespace native { +namespace detail { + +std::string to_string(ccl::reduction red) { +#define P(val) \ + case ccl::reduction::val: return #val; + + switch (red) { + P(sum); + P(prod); + P(min); + P(max); + default: + throw std::runtime_error("Unexpected value of reduction: " + + std::to_string(static_cast<int>(red))); + } + +#undef P +} + +// TODO: ideally we should take a set of all parameters and generate a kernel name +// to execute +std::string get_kernel_name(const std::string& kernel_name, const coll_param_gpu& params) { + // TODO: introduce a simple function to map names? + // Can we remove dtypes from global_data then? Do we need custom datatypes? + auto name = kernel_name + "_" + ccl::global_data::get().dtypes->name(params.get_datatype()); + if (params.is_reduction()) { + name += "_" + to_string(params.get_reduction()); + } + + return name; +} + +} // namespace detail +} // namespace native diff --git a/src/common/comm/l0/context/scaling_ctx/observer_session_key.cpp b/src/common/comm/l0/modules/kernel_utils.hpp similarity index 67% rename from src/common/comm/l0/context/scaling_ctx/observer_session_key.cpp rename to src/common/comm/l0/modules/kernel_utils.hpp index f93db94c3..fc4b82804 100644 --- a/src/common/comm/l0/context/scaling_ctx/observer_session_key.cpp +++ b/src/common/comm/l0/modules/kernel_utils.hpp @@ -13,18 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "common/comm/l0/context/scaling_ctx/observer_session_key.hpp" +#pragma once + +#include <string> + +#include "coll/coll_param.hpp" namespace native { -namespace observer { +namespace detail { -bool session_key::operator<(const session_key& other) const noexcept { - return hash < other.hash; -} +std::string get_kernel_name(const std::string& kernel_name, const coll_param_gpu& params); -std::string session_key::to_string() const { - return std::to_string(hash); } - -} // namespace observer } // namespace native diff --git a/src/common/comm/l0/modules/modules_utils.hpp b/src/common/comm/l0/modules/modules_utils.hpp index 1bcbd4d70..29d507c2c 100644 --- a/src/common/comm/l0/modules/modules_utils.hpp +++ b/src/common/comm/l0/modules/modules_utils.hpp @@ -17,35 +17,13 @@ #include "common/comm/l0/modules/base_entry_module.hpp" #include "common/utils/tuple.hpp" +#include "common/comm/l0/modules/kernel_utils.hpp" namespace native { namespace detail { -template <ccl_coll_type type, typename = void> -struct kernel_entry_initializer { - using loader_t = - std::function<gpu_module_base::kernel_handle(const std::string& function_name)>; - - kernel_entry_initializer(loader_t&& f) : functor(std::move(f)) {} - - template <class typed_kernel> - void operator()(typed_kernel& kernel) { - kernel.handle = - functor(std::string(typed_kernel::name()) + "_" + - ccl::native_type_info<typename typed_kernel::processing_type>::name()); - } - -private: - loader_t functor; -}; - -// Make template specialization for those collective types, -// which have a multiply reduction ability template <ccl_coll_type type> -struct kernel_entry_initializer< - type, - typename std::enable_if<type == ccl_coll_allreduce || type == ccl_coll_reduce || - type == ccl_coll_reduce_scatter>::type> { +struct kernel_entry_initializer { using loader_t = std::function<gpu_module_base::kernel_handle(const std::string& function_name)>; @@ -53,10 +31,7 @@ struct kernel_entry_initializer< template <class 
typed_kernel> void operator()(typed_kernel& kernel) { - kernel.handle = - functor(std::string(typed_kernel::name()) + "_" + - ccl::native_type_info<typename typed_kernel::processing_type>::name() + "_" + - reduction_to_str(typed_kernel::param_t::red_type)); + kernel.handle = functor(get_kernel_name(kernel.name(), kernel.get_kernel_params())); } private: diff --git a/src/common/comm/l0/modules/ring/allgatherv_entry_module.hpp b/src/common/comm/l0/modules/ring/allgatherv_entry_module.hpp index acb21f613..24b00a8c5 100644 --- a/src/common/comm/l0/modules/ring/allgatherv_entry_module.hpp +++ b/src/common/comm/l0/modules/ring/allgatherv_entry_module.hpp @@ -23,21 +23,21 @@ DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module, real_gpu_typed_module, ccl_coll_allgatherv, ccl::device_topology_type::ring, - ring_allgatherv_kernel, - ring_allgatherv_numa_kernel, - ring_allgatherv_scale_out_cpu_gw_kernel); + ring::allgatherv::main_kernel, + ring::allgatherv::numa_kernel, + ring::allgatherv::scale_out_cpu_gw_kernel); DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module, ipc_gpu_typed_module, ccl_coll_allgatherv, ccl::device_topology_type::ring, - ring_allgatherv_ipc, - ring_allgatherv_ipc, - ring_allgatherv_ipc); + ring::allgatherv::ipc_kernel, + ring::allgatherv::ipc_kernel, + ring::allgatherv::ipc_kernel); DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_allgatherv, ccl::device_topology_type::ring, - ring_allgatherv_kernel, - ring_allgatherv_numa_kernel, - ring_allgatherv_scale_out_cpu_gw_kernel); + ring::allgatherv::main_kernel, + ring::allgatherv::numa_kernel, + ring::allgatherv::scale_out_cpu_gw_kernel); } // namespace native diff --git a/src/common/comm/l0/modules/ring/allgatherv_export_functions.hpp b/src/common/comm/l0/modules/ring/allgatherv_export_functions.hpp index a3fd720ba..23115a5f9 100644 --- a/src/common/comm/l0/modules/ring/allgatherv_export_functions.hpp +++ b/src/common/comm/l0/modules/ring/allgatherv_export_functions.hpp @@ -17,347 +17,225 @@ #include "common/comm/l0/modules/kernel_functions.hpp" namespace native { -template <class kernel_params> -struct ring_allgatherv_kernel - : public execution_kernel< - ring_allgatherv_kernel<kernel_params>, - arg<main_kernel_args::args_start_index, size_t>, // elems_count - arg<main_kernel_args::args_start_index + 1, size_t*>, // recv_elem_counts_buf - arg<main_kernel_args::args_start_index + 2, size_t*>, // recv_elem_offsets_buf - arg<main_kernel_args::args_start_index + 3, - typename kernel_params::native_type*>, // send_buf - thread_exchangable_arg<main_kernel_args::args_start_index + 4, - typename kernel_params::native_type*>, // recv_buf - arg<main_kernel_args::args_start_index + 5, - typename kernel_params::native_type*>, // right_output_buffer - external_arg<main_kernel_args::args_start_index + 6, - int*>, // left_wrote_to_me_flag - external_arg<main_kernel_args::args_start_index + 7, - int*>, // i_ready_to_receive_flag - thread_exchangable_arg<main_kernel_args::args_start_index + 8, - int*>, // i_send_to_right_flag - thread_exchangable_arg<main_kernel_args::args_start_index + 9, - int*>> // right_ready_to_recv_flag -{ - using processing_type = typename kernel_params::native_type; - static constexpr const char* specific_name() { - return "allgatherv_execution"; - } +namespace ring { - // elems_count - using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>; - using common_entry_buf_size_arg = send_buf_size_arg; - using send_buf_size_arg_type = typename send_buf_size_arg::arg_type; +namespace allgatherv { + +/** + * 
Common args for all kernel types + */ + +using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>; +using send_buf_size_arg_type = typename send_buf_size_arg::arg_type; + +using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>; +using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type; - // recv_elem_counts_buf - using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>; - using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type; +using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>; +using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type; - // recv_elem_offsets_buf - using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>; - using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type; +template <class native_t> +using send_buf_arg = arg<main_kernel_args::args_start_index + 3, native_t*>; - // send_buf - using send_buf_arg = arg<main_kernel_args::args_start_index + 3, processing_type*>; - using common_entry_buf_arg = send_buf_arg; - using send_buf_arg_type = typename send_buf_arg::arg_type; +template <class native_t> +using recv_buf_arg = external_arg<main_kernel_args::args_start_index + 4, native_t*>; - // recv_buf - using recv_buf_arg = arg<main_kernel_args::args_start_index + 4, processing_type*>; - using recv_buf_arg_type = typename recv_buf_arg::arg_type; +template <class native_t> +using right_output_buf_arg = + thread_exchangable_arg<main_kernel_args::args_start_index + 5, native_t*>; - // right_output_buffer - using right_output_buf_arg = - thread_exchangable_arg<main_kernel_args::args_start_index + 5, processing_type*>; - using right_output_buf_arg_type = typename right_output_buf_arg::arg_type; +using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 6, int*>; +using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; - // left_wrote_to_me_flag - using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 6, int*>; - using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; +using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 7, int*>; +using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; - // i_ready_to_receive_flag - using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 7, int*>; - using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; +using right_income_data_flag_arg = + thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>; - // i_send_to_right_flag - using right_income_data_flag_arg = - thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>; - using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type; +using right_ready_to_recv_flag_arg = + thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>; - // right_ready_to_recv_flag - using right_ready_to_recv_flag_arg = - thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>; - using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type; +// IMPORTANT: the number and types of arguments must be the same in all classes, +// excluding arguments specific for numa/scaleout etc. 
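+// For orientation (assumed naming, derived from detail::get_kernel_name() in
+// kernel_utils.cpp): at import time the float32 variant of main_kernel would
+// be resolved roughly as
+//
+//   get_kernel_name("allgatherv_execution",
+//                   coll_param_gpu(ccl_coll_allgatherv, ccl::datatype::float32));
+//   // -> e.g. "allgatherv_execution_float32"; a "_<reduction>" suffix is
+//   //    appended only for reduction collectives
+//
+// where the exact dtype suffix comes from global_data's datatype name table.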
+struct main_kernel + : public execution_kernel<main_kernel, + send_buf_size_arg, // elems_count + recv_elem_counts_buf_arg, // recv_elem_counts_buf + recv_elem_offsets_buf_arg, // recv_elem_offsets_buf + send_buf_arg<void>, // send_buf + recv_buf_arg<void>, // recv_buf (output_buffer) + right_output_buf_arg<void>, // right_output_buffer + income_data_flag_arg, // left_wrote_to_me_flag + ready_to_recv_flag_arg, // i_ready_to_receive_flag + right_income_data_flag_arg, // i_send_to_right_flag + right_ready_to_recv_flag_arg> // right_ready_to_recv_flag +{ + using processing_type = void; + + static constexpr const char* specific_name() { + return "allgatherv_execution"; + } + + using common_entry_buf_size_arg = send_buf_size_arg; + using common_entry_buf_arg = send_buf_arg<processing_type>; - using base = execution_kernel<ring_allgatherv_kernel<kernel_params>, + using base = execution_kernel<main_kernel, send_buf_size_arg, recv_elem_counts_buf_arg, recv_elem_offsets_buf_arg, - send_buf_arg, - recv_buf_arg, - right_output_buf_arg, + send_buf_arg<processing_type>, + recv_buf_arg<processing_type>, + right_output_buf_arg<processing_type>, income_data_flag_arg, ready_to_recv_flag_arg, right_income_data_flag_arg, right_ready_to_recv_flag_arg>; + + using base::base; }; // IMPORTANT: the params order is default, see *algatherv*.cl for that -template <class kernel_params> -struct ring_allgatherv_numa_kernel - : public execution_kernel< - ring_allgatherv_numa_kernel<kernel_params>, - arg<main_kernel_args::args_start_index, size_t>, // elems_count - arg<main_kernel_args::args_start_index + 1, size_t*>, // recv_elem_counts_buf - arg<main_kernel_args::args_start_index + 2, size_t*>, // recv_elem_offsets_buf - arg<main_kernel_args::args_start_index + 3, - typename kernel_params::native_type*>, // send_buf - arg<main_kernel_args::args_start_index + 4, - typename kernel_params::native_type*>, // recv_buf - thread_safe_arg<main_kernel_args::args_start_index + 5, - typename kernel_params::native_type*>, // right_output_buffer - thread_safe_arg<main_kernel_args::args_start_index + 6, - int*>, // left_wrote_to_me_flag - thread_safe_arg<main_kernel_args::args_start_index + 7, - int*>, // i_ready_to_receive_flag - thread_safe_arg<main_kernel_args::args_start_index + 8, int*>, // i_send_to_right_flag - thread_safe_arg<main_kernel_args::args_start_index + 9, - int*>> // right_ready_to_recv_flag> +struct numa_kernel + : public execution_kernel<numa_kernel, + send_buf_size_arg, // elems_count + recv_elem_counts_buf_arg, // recv_elem_counts_buf + recv_elem_offsets_buf_arg, // recv_elem_offsets_buf + send_buf_arg<void>, // send_buf + recv_buf_arg<void>, // recv_buf (output_buffer) + right_output_buf_arg<void>, // right_output_buffer + income_data_flag_arg, // left_wrote_to_me_flag + ready_to_recv_flag_arg, // i_ready_to_receive_flag + right_income_data_flag_arg, // i_send_to_right_flag + right_ready_to_recv_flag_arg> // right_ready_to_recv_flag { - using processing_type = typename kernel_params::native_type; + using processing_type = void; static constexpr const char* specific_name() { return "allgatherv_execution_numa"; } - // elems_count - using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>; using common_entry_buf_size_arg = send_buf_size_arg; - using send_buf_size_arg_type = typename send_buf_size_arg::arg_type; - - // recv_elem_counts_buf - using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>; - using recv_elem_counts_buf_arg_type = typename 
recv_elem_counts_buf_arg::arg_type; - - // recv_elem_offsets_buf - using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>; - using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type; - - // send_buf - using send_buf_arg = arg<main_kernel_args::args_start_index + 3, processing_type*>; - using common_entry_buf_arg = send_buf_arg; - using send_buf_arg_type = typename send_buf_arg::arg_type; - - // recv_buf - using recv_buf_arg = arg<main_kernel_args::args_start_index + 4, processing_type*>; - using recv_buf_arg_type = typename recv_buf_arg::arg_type; - - // right_output_buffer - using right_output_buf_arg = - thread_safe_arg<main_kernel_args::args_start_index + 5, processing_type*>; - using right_output_buf_arg_type = typename right_output_buf_arg::arg_type; - - // left_wrote_to_me_flag - using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 6, int*>; - using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; + using common_entry_buf_arg = send_buf_arg<processing_type>; - // i_ready_to_receive_flag - using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 7, int*>; - using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; - - // i_send_to_right_flag - using right_income_data_flag_arg = - thread_safe_arg<main_kernel_args::args_start_index + 8, int*>; - using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type; - - // right_ready_to_recv_flag - using right_ready_to_recv_flag_arg = - thread_safe_arg<main_kernel_args::args_start_index + 9, int*>; - using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type; - - using base = execution_kernel<ring_allgatherv_numa_kernel<kernel_params>, + using base = execution_kernel<numa_kernel, send_buf_size_arg, recv_elem_counts_buf_arg, recv_elem_offsets_buf_arg, - send_buf_arg, - recv_buf_arg, - right_output_buf_arg, + send_buf_arg<processing_type>, + recv_buf_arg<processing_type>, + right_output_buf_arg<processing_type>, income_data_flag_arg, ready_to_recv_flag_arg, right_income_data_flag_arg, right_ready_to_recv_flag_arg>; + + template <class ctx_params_t> + void bind_data(const ctx_params_t& out_ctx_params) { + // TODO not implemented + (void)out_ctx_params; + throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type"); + } + + using base::base; }; -template <class kernel_params> -struct ring_allgatherv_ipc - : public ipc_kernel< - ring_allgatherv_ipc<kernel_params>, - arg<main_kernel_args::args_start_index, size_t>, // elems_count - arg<main_kernel_args::args_start_index + 1, size_t*>, // recv_elem_counts_buf - arg<main_kernel_args::args_start_index + 2, size_t*>, // recv_elem_offsets_buf - arg<main_kernel_args::args_start_index + 3, - typename kernel_params::native_type*>, // send_buf - arg<main_kernel_args::args_start_index + 4, - typename kernel_params::native_type*>, // recv_buf - thread_safe_arg<main_kernel_args::args_start_index + 5, - typename kernel_params::native_type*>, // right_output_buffer - thread_safe_arg<main_kernel_args::args_start_index + 6, - int*>, // left_wrote_to_me_flag - thread_safe_arg<main_kernel_args::args_start_index + 7, - int*>, // i_ready_to_receive_flag - thread_safe_arg<main_kernel_args::args_start_index + 8, int*>, // i_send_to_right_flag - thread_safe_arg<main_kernel_args::args_start_index + 9, - int*>> // right_ready_to_recv_flag +struct ipc_kernel : public 
                             base_ipc_kernel<ipc_kernel,
+                                            send_buf_size_arg, // elems_count
+                                            recv_elem_counts_buf_arg, // recv_elem_counts_buf
+                                            recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
+                                            send_buf_arg<void>, // send_buf
+                                            recv_buf_arg<void>, // recv_buf (output_buffer)
+                                            right_output_buf_arg<void>, // right_output_buffer
+                                            income_data_flag_arg, // left_wrote_to_me_flag
+                                            ready_to_recv_flag_arg, // i_ready_to_receive_flag
+                                            right_income_data_flag_arg, // i_send_to_right_flag
+                                            right_ready_to_recv_flag_arg> // right_ready_to_recv_flag
 {
-    using processing_type = typename kernel_params::native_type;
+    using processing_type = void;

     static constexpr const char* specific_name() {
         return "ring_allgatherv_ipc";
     }

-    // elems_count
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
     using common_entry_buf_size_arg = send_buf_size_arg;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    // recv_elem_counts_buf
-    using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>;
-    using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type;
-
-    // recv_elem_offsets_buf
-    using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>;
-    using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type;
-
-    // send_buf
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    // recv_buf
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 4, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    // right_output_buffer
-    using right_output_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 5, processing_type*>;
-    using right_output_buf_arg_type = typename right_output_buf_arg::arg_type;
-
-    // left_wrote_to_me_flag
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 6, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    // i_ready_to_receive_flag
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 7, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    // i_send_to_right_flag
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    // right_ready_to_recv_flag
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
+
+    using base = base_ipc_kernel<ipc_kernel,
+                                 send_buf_size_arg,
+                                 recv_elem_counts_buf_arg,
+                                 recv_elem_offsets_buf_arg,
+                                 send_buf_arg<processing_type>,
+                                 recv_buf_arg<processing_type>,
+                                 right_output_buf_arg<processing_type>,
+                                 income_data_flag_arg,
+                                 ready_to_recv_flag_arg,
+                                 right_income_data_flag_arg,
+                                 right_ready_to_recv_flag_arg>;
+
+    template <class ipc_handles_t>
+    void bind_data(const ipc_handles_t& ipc_handles) {
+        auto recv_buf = reinterpret_cast<typename recv_buf_arg<processing_type>::arg_type>(
+            ipc_handles.at(0).get().pointer);
+        this->template set_arg<recv_buf_arg<processing_type>>(recv_buf);
+
+        auto income_data_flag =
+            reinterpret_cast<income_data_flag_arg_type>(ipc_handles.at(1).get().pointer);
+        this->template set_arg<income_data_flag_arg>(income_data_flag);
+
+        auto ready_to_recv_flag =
+            reinterpret_cast<ready_to_recv_flag_arg_type>(ipc_handles.at(2).get().pointer);
+        this->template set_arg<ready_to_recv_flag_arg>(ready_to_recv_flag);
+    }

-    using base = execution_kernel<ring_allgatherv_ipc<kernel_params>,
-                                  send_buf_size_arg,
-                                  recv_elem_counts_buf_arg,
-                                  recv_elem_offsets_buf_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  right_output_buf_arg,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  right_income_data_flag_arg,
-                                  right_ready_to_recv_flag_arg>;
+    using base::base;
 };

-template <class kernel_params>
-struct ring_allgatherv_scale_out_cpu_gw_kernel
-    : public execution_kernel<
-          ring_allgatherv_scale_out_cpu_gw_kernel<kernel_params>,
-          arg<main_kernel_args::args_start_index, size_t>, // elems_count
-          arg<main_kernel_args::args_start_index + 1, size_t*>, // recv_elem_counts_buf
-          arg<main_kernel_args::args_start_index + 2, size_t*>, // recv_elem_offsets_buf
-          arg<main_kernel_args::args_start_index + 3,
-              typename kernel_params::native_type*>, // send_buf
-          arg<main_kernel_args::args_start_index + 4,
-              typename kernel_params::native_type*>, // recv_buf
-          thread_safe_arg<main_kernel_args::args_start_index + 5,
-                          typename kernel_params::native_type*>, // right_output_buffer
-          thread_safe_arg<main_kernel_args::args_start_index + 6,
-                          int*>, // left_wrote_to_me_flag
-          thread_safe_arg<main_kernel_args::args_start_index + 7,
-                          int*>, // i_ready_to_receive_flag
-          thread_safe_arg<main_kernel_args::args_start_index + 8, int*>, // i_send_to_right_flag
-          thread_safe_arg<main_kernel_args::args_start_index + 9,
-                          int*>> // right_ready_to_recv_flag>
+struct scale_out_cpu_gw_kernel
+    : public execution_kernel<scale_out_cpu_gw_kernel,
+                              send_buf_size_arg, // elems_count
+                              recv_elem_counts_buf_arg, // recv_elem_counts_buf
+                              recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
+                              send_buf_arg<void>, // send_buf
+                              recv_buf_arg<void>, // recv_buf (output_buffer)
+                              right_output_buf_arg<void>, // right_output_buffer
+                              income_data_flag_arg, // left_wrote_to_me_flag
+                              ready_to_recv_flag_arg, // i_ready_to_receive_flag
+                              right_income_data_flag_arg, // i_send_to_right_flag
+                              right_ready_to_recv_flag_arg> // right_ready_to_recv_flag
 {
-    using param_t = kernel_params;
-    using processing_type = typename param_t::native_type;
+    using processing_type = void;

     static constexpr const char* specific_name() {
         return "allgatherv_execution_scale_out_cpu_gw";
     }

-    // elems_count
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
     using common_entry_buf_size_arg = send_buf_size_arg;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    // recv_elem_counts_buf
-    using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>;
-    using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type;
-
-    // recv_elem_offsets_buf
-    using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>;
-    using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type;
-
-    // send_buf
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    // recv_buf
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 4, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    // right_output_buffer
-    using right_output_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 5, processing_type*>;
-    using right_output_buf_arg_type = typename right_output_buf_arg::arg_type;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;

-    // left_wrote_to_me_flag
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 6, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    // i_ready_to_receive_flag
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 7, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    // i_send_to_right_flag
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    // right_ready_to_recv_flag
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
-
-    using base = execution_kernel<ring_allgatherv_scale_out_cpu_gw_kernel<param_t>,
+    using base = execution_kernel<scale_out_cpu_gw_kernel,
                                   send_buf_size_arg,
                                   recv_elem_counts_buf_arg,
                                   recv_elem_offsets_buf_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  right_output_buf_arg,
+                                  send_buf_arg<processing_type>,
+                                  recv_buf_arg<processing_type>,
+                                  right_output_buf_arg<processing_type>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg>;
+
+    template <class ctx_params_t>
+    void bind_data(const ctx_params_t& out_ctx_params) {
+        // TODO not implemented
+        (void)out_ctx_params;
+        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
+    }
+
+    using base::base;
 };
+
+} // namespace allgatherv
+} // namespace ring
 } // namespace native
diff --git a/src/common/comm/l0/modules/ring/allreduce_entry_module.hpp b/src/common/comm/l0/modules/ring/allreduce_entry_module.hpp
index 62bf7e2fd..9eff3d8e5 100644
--- a/src/common/comm/l0/modules/ring/allreduce_entry_module.hpp
+++ b/src/common/comm/l0/modules/ring/allreduce_entry_module.hpp
@@ -23,21 +23,21 @@ DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module,
                                  real_gpu_typed_module,
                                  ccl_coll_allreduce,
                                  ccl::device_topology_type::ring,
-                                 ring_allreduce_kernel,
-                                 ring_allreduce_numa_kernel,
-                                 ring_allreduce_scale_out_cpu_gw_kernel);
+                                 ring::allreduce::main_kernel,
+                                 ring::allreduce::numa_kernel,
+                                 ring::allreduce::scale_out_cpu_gw_kernel);

 DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module,
                                  ipc_gpu_typed_module,
                                  ccl_coll_allreduce,
                                  ccl::device_topology_type::ring,
-                                 ring_allreduce_ipc,
-                                 ring_allreduce_ipc,
-                                 ring_allreduce_ipc);
+                                 ring::allreduce::ipc_kernel,
+                                 ring::allreduce::ipc_kernel,
+                                 ring::allreduce::ipc_kernel);

 DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_allreduce,
                                 ccl::device_topology_type::ring,
-                                ring_allreduce_kernel,
-                                ring_allreduce_numa_kernel,
-                                ring_allreduce_scale_out_cpu_gw_kernel);
+                                ring::allreduce::main_kernel,
+                                ring::allreduce::numa_kernel,
+                                ring::allreduce::scale_out_cpu_gw_kernel);
 } // namespace native
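The entry-module hunk above rewires the GPU module macros onto the new ring::allreduce::* kernel names; the export-functions hunk that follows defines those kernels in terms of positional argument descriptors. As a reading aid, here is a minimal, self-contained C++ sketch of that descriptor pattern. Everything in it is an illustrative stand-in, not the oneCCL implementation: the real arg, external_arg, thread_exchangable_arg, and execution_kernel machinery lives in kernel_functions.hpp, and the slot indices below are examples only.

#include <cstddef>
#include <iostream>
#include <tuple>
#include <type_traits>

// one kernel argument = a compile-time (index, type) pair
template <std::size_t pos, class T>
struct arg {
    static constexpr std::size_t index = pos;
    using arg_type = T;
};

// position of descriptor D inside a descriptor pack
template <class D, class... Args>
struct slot_of;
template <class D>
struct slot_of<D> : std::integral_constant<std::size_t, 0> {};
template <class D, class First, class... Rest>
struct slot_of<D, First, Rest...>
        : std::integral_constant<std::size_t,
                                 std::is_same<D, First>::value ? 0 : 1 + slot_of<D, Rest...>::value> {};

// a kernel wrapper binds values by descriptor, not by raw argument position
template <class... descriptors>
struct execution_kernel {
    std::tuple<typename descriptors::arg_type...> values{};

    template <class descriptor>
    void set_arg(typename descriptor::arg_type v) {
        std::get<slot_of<descriptor, descriptors...>::value>(values) = v;
    }

    template <class descriptor>
    typename descriptor::arg_type get_arg() const {
        return std::get<slot_of<descriptor, descriptors...>::value>(values);
    }
};

// descriptors in the style of the patch: one alias per logical argument
using elems_count_arg = arg<0, std::size_t>;
using send_buf_example_arg = arg<1, const float*>;
using recv_buf_example_arg = arg<2, float*>;

int main() {
    execution_kernel<elems_count_arg, send_buf_example_arg, recv_buf_example_arg> kernel;

    float in[4] = { 1, 2, 3, 4 };
    float out[4] = {};

    kernel.set_arg<elems_count_arg>(4);
    kernel.set_arg<send_buf_example_arg>(in);
    kernel.set_arg<recv_buf_example_arg>(out);

    std::cout << "elems_count = " << kernel.get_arg<elems_count_arg>() << "\n";
}

Binding by descriptor is what lets the refactored kernels share one set of namespace-level argument aliases while each class lists only the descriptors it actually uses.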
diff --git a/src/common/comm/l0/modules/ring/allreduce_export_functions.hpp b/src/common/comm/l0/modules/ring/allreduce_export_functions.hpp
index 8e8425251..06152f64b 100644
--- a/src/common/comm/l0/modules/ring/allreduce_export_functions.hpp
+++ b/src/common/comm/l0/modules/ring/allreduce_export_functions.hpp
@@ -17,157 +17,115 @@
 #include "common/comm/l0/modules/kernel_functions.hpp"

 namespace native {
-template <class kernel_params>
-struct ring_allreduce_kernel
-    : public execution_kernel<
-          ring_allreduce_kernel<kernel_params>,
-          arg<main_kernel_args::args_start_index, size_t>,
-          arg<main_kernel_args::args_start_index + 1, typename kernel_params::native_type*>,
-          arg<main_kernel_args::args_start_index + 2, typename kernel_params::native_type*>,
-          external_arg<main_kernel_args::args_start_index + 3,
-                       typename kernel_params::native_type*>,
-          external_arg<main_kernel_args::args_start_index + 4, int*>,
-          external_arg<main_kernel_args::args_start_index + 5, int*>,
-          arg<main_kernel_args::args_start_index + 6, int*>,
-          thread_exchangable_arg<main_kernel_args::args_start_index + 7,
-                                 typename kernel_params::native_type*>,
-          thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>,
-          thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>> {
-    using param_t = kernel_params;
-    using processing_type = typename kernel_params::native_type;
-    static constexpr const char* specific_name() {
-        return "allreduce_execution";
-    }
+namespace ring {

-    //own
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
+namespace allreduce {

-    using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
+/**
+ * Common args for all kernel types
+ */

-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
+// own
+using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
+using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;

-    using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
+template <class native_t>
+using send_buf_arg = arg<main_kernel_args::args_start_index + 1, native_t*>;

-    using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 4, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
+template <class native_t>
+using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, native_t*>;

-    using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 5, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
+template <class native_t>
+using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 3, native_t*>;

-    using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
-    using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
+using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 4, int*>;
+using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;

-    //right
-    using right_tmp_recv_buf_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
+using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 5, int*>;
+using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;

-    /* using right_recv_buf_arg = thread_safe_arg<main_kernel_args::args_start_index + 8, void *>;
-    using right_recv_buf_arg_type = typename right_recv_buf_arg::arg_type;
-*/
-    using right_income_data_flag_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
+using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
+using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
+
+// right
+template <class native_t>
+using right_tmp_recv_buf_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 7, native_t*>;
+
+using right_income_data_flag_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>;
+
+using right_ready_to_recv_flag_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>;
+
+// IMPORTANT: the number and types of arguments must be the same in all classes,
+// excluding arguments specific to numa/scaleout, etc.
+struct main_kernel : public execution_kernel<main_kernel,
+                                             send_buf_size_arg,
+                                             send_buf_arg<void>,
+                                             recv_buf_arg<void>,
+                                             tmp_recv_buf_arg<void>,
+                                             income_data_flag_arg,
+                                             ready_to_recv_flag_arg,
+                                             local_barrier_flag_arg,
+                                             right_tmp_recv_buf_arg<void>,
+                                             right_income_data_flag_arg,
+                                             right_ready_to_recv_flag_arg> {
+    using processing_type = void;
+
+    static constexpr const char* specific_name() {
+        return "allreduce_execution";
+    }

-    using right_ready_to_recv_flag_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
+    using common_entry_buf_size_arg = send_buf_size_arg;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;

-    using base = execution_kernel<ring_allreduce_kernel<kernel_params>,
+    using base = execution_kernel<main_kernel,
                                   send_buf_size_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  tmp_recv_buf_arg,
+                                  send_buf_arg<void>,
+                                  recv_buf_arg<void>,
+                                  tmp_recv_buf_arg<void>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg,
+                                  right_tmp_recv_buf_arg<void>,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg>;
+
+    using base::base;
 };

-template <class kernel_params>
-struct ring_allreduce_numa_kernel
-    : public execution_kernel<
-          ring_allreduce_numa_kernel<kernel_params>,
-          arg<main_kernel_args::args_start_index, size_t>,
-          arg<main_kernel_args::args_start_index + 1, typename kernel_params::native_type*>,
-          arg<main_kernel_args::args_start_index + 2, typename kernel_params::native_type*>,
-          thread_safe_arg<main_kernel_args::args_start_index + 3,
-                          typename kernel_params::native_type*>,
-          thread_safe_arg<main_kernel_args::args_start_index + 4, int*>,
-          thread_safe_arg<main_kernel_args::args_start_index + 5, int*>,
-          arg<main_kernel_args::args_start_index + 6, int*>,
-          thread_safe_arg<main_kernel_args::args_start_index + 7,
-                          typename kernel_params::native_type*>,
-          thread_safe_arg<main_kernel_args::args_start_index + 8, int*>,
-          thread_safe_arg<main_kernel_args::args_start_index + 9, int*>,
-
-          // numa-specific args
-          permanent_arg<main_kernel_args::args_start_index + 10,
-                        typename kernel_params::native_type*>,
-          permanent_arg<main_kernel_args::args_start_index + 11, uint64_t*>,
-          permanent_arg<main_kernel_args::args_start_index + 12, uint64_t*>,
-          permanent_arg<main_kernel_args::args_start_index + 13,
-                        typename kernel_params::native_type*>,
-          permanent_arg<main_kernel_args::args_start_index + 14, uint64_t*>> {
-    using param_t = kernel_params;
-    using processing_type = typename kernel_params::native_type;
+struct numa_kernel
+    : public execution_kernel<
+          numa_kernel,
+          send_buf_size_arg,
+          send_buf_arg<void>,
+          recv_buf_arg<void>,
+          tmp_recv_buf_arg<void>,
+          income_data_flag_arg,
+          ready_to_recv_flag_arg,
+          local_barrier_flag_arg,
+          right_tmp_recv_buf_arg<void>,
+          right_income_data_flag_arg,
+          right_ready_to_recv_flag_arg,
+
+          // numa-specific args
+          permanent_arg<main_kernel_args::args_start_index + 10, void*>,
+          permanent_arg<main_kernel_args::args_start_index + 11, uint64_t*>,
+          permanent_arg<main_kernel_args::args_start_index + 12, uint64_t*>,
+          permanent_arg<main_kernel_args::args_start_index + 13, void*>,
+          permanent_arg<main_kernel_args::args_start_index + 14, uint64_t*>> {
+    using processing_type = void;

     static constexpr const char* specific_name() {
         return "allreduce_execution_numa";
     }

-    //own
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    using tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 4, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
-    using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
-
-    //right
-    using right_tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
-
-    /* using right_recv_buf_arg = thread_safe_arg<main_kernel_args::args_start_index + 8, void *>;
-    using right_recv_buf_arg_type = typename right_recv_buf_arg::arg_type;
-*/
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
+    using common_entry_buf_size_arg = send_buf_size_arg;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;

     // event data
-    using event_prod_chunk_mem_arg =
-        permanent_arg<main_kernel_args::args_start_index + 10, processing_type*>;
+    using event_prod_chunk_mem_arg = permanent_arg<main_kernel_args::args_start_index + 10, void*>;
     using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type;

     using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 11, uint64_t*>;
@@ -178,22 +136,22 @@ struct ring_allreduce_numa_kernel
     using event_consumed_bytes_offset_arg_type = typename event_consumed_bytes_offset_arg::arg_type;

     using event_consumed_chunk_mem_arg =
-        permanent_arg<main_kernel_args::args_start_index + 13, processing_type*>;
+        permanent_arg<main_kernel_args::args_start_index + 13, void*>;
     using event_consumed_chunk_mem_arg_type = typename event_consumed_chunk_mem_arg::arg_type;

     using event_consumed_bytes_arg =
         permanent_arg<main_kernel_args::args_start_index + 14, uint64_t*>;
     using event_consumed_bytes_arg_type = typename event_consumed_bytes_arg::arg_type;

-    using base = execution_kernel<ring_allreduce_numa_kernel<kernel_params>,
+    using base = execution_kernel<numa_kernel,
                                   send_buf_size_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  tmp_recv_buf_arg,
+                                  send_buf_arg<void>,
+                                  recv_buf_arg<void>,
+                                  tmp_recv_buf_arg<void>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg,
+                                  right_tmp_recv_buf_arg<void>,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg,
                                   event_prod_chunk_mem_arg,
@@ -201,128 +159,105 @@ struct ring_allreduce_numa_kernel
                                   event_consumed_bytes_offset_arg,
                                   event_consumed_chunk_mem_arg,
                                   event_consumed_bytes_arg>;
+
+    template <class ctx_params_t>
+    void bind_data(const ctx_params_t& out_ctx_params) {
+        this->template set_arg<event_prod_chunk_mem_arg>(
+            static_cast<void*>(out_ctx_params.host_mem_producer->get()));
+        this->template set_arg<event_prod_bytes_arg>(
+            out_ctx_params.host_mem_producer_counter->get());
+        this->template set_arg<event_consumed_bytes_offset_arg>(
+            out_ctx_params.producer_aggregated_memory_offset->get());
+        this->template set_arg<event_consumed_chunk_mem_arg>(
+            static_cast<void*>(out_ctx_params.dev_mem_consumer->get()));
+        this->template set_arg<event_consumed_bytes_arg>(
+            out_ctx_params.dev_mem_consumer_counter->get());
+    }
+
+    using base::base;
 };
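The bind_data() just above accepts any context type that exposes the five producer/consumer members it reads. A hedged, self-contained sketch of that duck-typed contract follows; only the member names come from the patch, while mem_handle, observer_ctx_params, and toy_kernel are assumptions invented for illustration.

#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

template <class T>
struct mem_handle {
    std::vector<T> storage = std::vector<T>(16);
    T* get() {
        return storage.data(); // mirrors the ->get() calls in bind_data()
    }
};

// the five members bind_data() expects, under assumed types
struct observer_ctx_params {
    std::unique_ptr<mem_handle<char>> host_mem_producer;
    std::unique_ptr<mem_handle<uint64_t>> host_mem_producer_counter;
    std::unique_ptr<mem_handle<uint64_t>> producer_aggregated_memory_offset;
    std::unique_ptr<mem_handle<char>> dev_mem_consumer;
    std::unique_ptr<mem_handle<uint64_t>> dev_mem_consumer_counter;
};

struct toy_kernel {
    template <class ctx_params_t>
    void bind_data(const ctx_params_t& out_ctx_params) {
        // the real kernel forwards each member into set_arg<event_..._arg>();
        // printing the raw pointers stands in for that binding step
        std::cout << static_cast<void*>(out_ctx_params.host_mem_producer->get()) << '\n'
                  << out_ctx_params.host_mem_producer_counter->get() << '\n'
                  << out_ctx_params.producer_aggregated_memory_offset->get() << '\n'
                  << static_cast<void*>(out_ctx_params.dev_mem_consumer->get()) << '\n'
                  << out_ctx_params.dev_mem_consumer_counter->get() << '\n';
    }
};

int main() {
    observer_ctx_params params;
    params.host_mem_producer.reset(new mem_handle<char>{});
    params.host_mem_producer_counter.reset(new mem_handle<uint64_t>{});
    params.producer_aggregated_memory_offset.reset(new mem_handle<uint64_t>{});
    params.dev_mem_consumer.reset(new mem_handle<char>{});
    params.dev_mem_consumer_counter.reset(new mem_handle<uint64_t>{});

    toy_kernel{}.bind_data(params);
}

Because the contract is a template parameter rather than a fixed interface, kernels that cannot support observation (the TODO bind_data overloads elsewhere in this patch) simply throw instead of binding.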

-template <class kernel_params>
-struct ring_allreduce_ipc
-    : public ipc_kernel<ring_allreduce_ipc<kernel_params>,
-                        stub_arg<main_kernel_args::args_start_index>,
-                        stub_arg<main_kernel_args::args_start_index + 1>,
-                        stub_arg<main_kernel_args::args_start_index + 2>,
-                        thread_safe_arg<main_kernel_args::args_start_index + 3,
-                                        typename kernel_params::native_type*>,
-                        thread_safe_arg<main_kernel_args::args_start_index + 4, int*>,
-                        thread_safe_arg<main_kernel_args::args_start_index + 5, int*>,
-                        stub_arg<main_kernel_args::args_start_index + 6>,
-                        stub_arg<main_kernel_args::args_start_index + 7>,
-                        stub_arg<main_kernel_args::args_start_index + 8>,
-                        stub_arg<main_kernel_args::args_start_index + 9>> {
-    using param_t = kernel_params;
-    using processing_type = typename kernel_params::native_type;
+struct ipc_kernel : public base_ipc_kernel<ipc_kernel,
+                                           stub_arg<main_kernel_args::args_start_index>,
+                                           stub_arg<main_kernel_args::args_start_index + 1>,
+                                           stub_arg<main_kernel_args::args_start_index + 2>,
+                                           tmp_recv_buf_arg<void>,
+                                           income_data_flag_arg,
+                                           ready_to_recv_flag_arg,
+                                           stub_arg<main_kernel_args::args_start_index + 6>,
+                                           stub_arg<main_kernel_args::args_start_index + 7>,
+                                           stub_arg<main_kernel_args::args_start_index + 8>,
+                                           stub_arg<main_kernel_args::args_start_index + 9>> {
+    using processing_type = void;
+
+    using common_entry_buf_size_arg = send_buf_size_arg;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;

     static constexpr const char* specific_name() {
         return "ring_allreduce_ipc";
     }

-    using tmp_recv_buf_arg = typename ring_allreduce_kernel<kernel_params>::tmp_recv_buf_arg;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    using income_data_flag_arg =
-        typename ring_allreduce_kernel<kernel_params>::income_data_flag_arg;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    using ready_to_recv_flag_arg =
-        typename ring_allreduce_kernel<kernel_params>::ready_to_recv_flag_arg;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
+    using base = base_ipc_kernel<ipc_kernel,
+                                 stub_arg<main_kernel_args::args_start_index>,
+                                 stub_arg<main_kernel_args::args_start_index + 1>,
+                                 stub_arg<main_kernel_args::args_start_index + 2>,
+                                 tmp_recv_buf_arg<processing_type>,
+                                 income_data_flag_arg,
+                                 ready_to_recv_flag_arg,
+                                 stub_arg<main_kernel_args::args_start_index + 6>,
+                                 stub_arg<main_kernel_args::args_start_index + 7>,
+                                 stub_arg<main_kernel_args::args_start_index + 8>,
+                                 stub_arg<main_kernel_args::args_start_index + 9>>;
+
+    template <class ipc_handles_t>
+    void bind_data(const ipc_handles_t& ipc_handles) {
+        auto tmp_recv_buf = reinterpret_cast<typename tmp_recv_buf_arg<processing_type>::arg_type>(
+            ipc_handles.at(0).get().pointer);
+        this->template set_arg<tmp_recv_buf_arg<processing_type>>(tmp_recv_buf);
+
+        auto income_data_flag =
+            reinterpret_cast<income_data_flag_arg_type>(ipc_handles.at(1).get().pointer);
+        this->template set_arg<income_data_flag_arg>(income_data_flag);
+
+        auto ready_to_recv_flag =
+            reinterpret_cast<ready_to_recv_flag_arg_type>(ipc_handles.at(2).get().pointer);
+        this->template set_arg<ready_to_recv_flag_arg>(ready_to_recv_flag);
+    }

-    using base = execution_kernel<ring_allreduce_ipc<kernel_params>,
-                                  stub_arg<main_kernel_args::args_start_index>,
-                                  stub_arg<main_kernel_args::args_start_index + 1>,
-                                  stub_arg<main_kernel_args::args_start_index + 2>,
-                                  tmp_recv_buf_arg,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  stub_arg<main_kernel_args::args_start_index + 6>,
-                                  stub_arg<main_kernel_args::args_start_index + 7>,
-                                  stub_arg<main_kernel_args::args_start_index + 8>,
-                                  stub_arg<main_kernel_args::args_start_index + 9>>;
+    using base::base;
 };

-template <class kernel_params>
-struct ring_allreduce_scale_out_cpu_gw_kernel
+struct scale_out_cpu_gw_kernel
     : public execution_kernel<
-          ring_allreduce_scale_out_cpu_gw_kernel<kernel_params>,
-          arg<main_kernel_args::args_start_index, size_t>,
-          arg<main_kernel_args::args_start_index + 1, typename kernel_params::native_type*>,
-          arg<main_kernel_args::args_start_index + 2, typename kernel_params::native_type*>,
-          thread_safe_arg<main_kernel_args::args_start_index + 3,
-                          typename kernel_params::native_type*>,
-          thread_safe_arg<main_kernel_args::args_start_index + 4, int*>,
-          thread_safe_arg<main_kernel_args::args_start_index + 5, int*>,
-          arg<main_kernel_args::args_start_index + 6, int*>,
-          thread_safe_arg<main_kernel_args::args_start_index + 7,
-                          typename kernel_params::native_type*>,
-          thread_safe_arg<main_kernel_args::args_start_index + 8, int*>,
-          thread_safe_arg<main_kernel_args::args_start_index + 9, int*>,
+          scale_out_cpu_gw_kernel,
+          send_buf_size_arg,
+          send_buf_arg<void>,
+          recv_buf_arg<void>,
+          tmp_recv_buf_arg<void>,
+          income_data_flag_arg,
+          ready_to_recv_flag_arg,
+          local_barrier_flag_arg,
+          right_tmp_recv_buf_arg<void>,
+          right_income_data_flag_arg,
+          right_ready_to_recv_flag_arg,

           // scaleout-specific args
-          permanent_arg<main_kernel_args::args_start_index + 10,
-                        typename kernel_params::native_type*>,
+          permanent_arg<main_kernel_args::args_start_index + 10, void*>,
           permanent_arg<main_kernel_args::args_start_index + 11, uint64_t*>,
           permanent_arg<main_kernel_args::args_start_index + 12, uint64_t*>,
-          permanent_arg<main_kernel_args::args_start_index + 13,
-                        typename kernel_params::native_type*>,
+          permanent_arg<main_kernel_args::args_start_index + 13, void*>,
           permanent_arg<main_kernel_args::args_start_index + 14, uint64_t*>> {
-    using param_t = kernel_params;
-    using processing_type = typename param_t::native_type;
+    using processing_type = void;

     static constexpr const char* specific_name() {
         return "allreduce_execution_scale_out_cpu_gw";
     }

-    //own
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    using tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 4, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
-    using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
-
-    //right
-    using right_tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
-
-    /* using right_recv_buf_arg = thread_safe_arg<main_kernel_args::args_start_index + 8, void *>;
-    using right_recv_buf_arg_type = typename right_recv_buf_arg::arg_type;
-*/
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
+    using common_entry_buf_size_arg = send_buf_size_arg;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;

     // event data
-    using event_prod_chunk_mem_arg =
-        permanent_arg<main_kernel_args::args_start_index + 10, processing_type*>;
+    using event_prod_chunk_mem_arg = permanent_arg<main_kernel_args::args_start_index + 10, void*>;
     using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type;

     using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 11, uint64_t*>;
@@ -333,22 +268,22 @@ struct ring_allreduce_scale_out_cpu_gw_kernel
     using event_consumed_bytes_offset_arg_type = typename event_consumed_bytes_offset_arg::arg_type;

     using event_consumed_chunk_mem_arg =
-        permanent_arg<main_kernel_args::args_start_index + 13, processing_type*>;
+        permanent_arg<main_kernel_args::args_start_index + 13, void*>;
     using event_consumed_chunk_mem_arg_type = typename event_consumed_chunk_mem_arg::arg_type;

     using event_consumed_bytes_arg =
         permanent_arg<main_kernel_args::args_start_index + 14, uint64_t*>;
     using event_consumed_bytes_arg_type = typename event_consumed_bytes_arg::arg_type;

-    using base = execution_kernel<ring_allreduce_scale_out_cpu_gw_kernel<kernel_params>,
+    using base = execution_kernel<scale_out_cpu_gw_kernel,
                                   send_buf_size_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  tmp_recv_buf_arg,
+                                  send_buf_arg<processing_type>,
+                                  recv_buf_arg<processing_type>,
+                                  tmp_recv_buf_arg<processing_type>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg,
+                                  right_tmp_recv_buf_arg<processing_type>,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg,
                                   event_prod_chunk_mem_arg,
@@ -356,5 +291,24 @@ struct ring_allreduce_scale_out_cpu_gw_kernel
                                   event_consumed_bytes_offset_arg,
                                   event_consumed_chunk_mem_arg,
                                   event_consumed_bytes_arg>;
+
+    template <class ctx_params_t>
+    void bind_data(const ctx_params_t& out_ctx_params) {
+        this->template set_arg<event_prod_chunk_mem_arg>(
+            static_cast<void*>(out_ctx_params.host_mem_producer->get()));
+        this->template set_arg<event_prod_bytes_arg>(
+            out_ctx_params.host_mem_producer_counter->get());
+        this->template set_arg<event_consumed_bytes_offset_arg>(
+            out_ctx_params.producer_aggregated_memory_offset->get());
+        this->template set_arg<event_consumed_chunk_mem_arg>(
+            static_cast<void*>(out_ctx_params.dev_mem_consumer->get()));
+        this->template set_arg<event_consumed_bytes_arg>(
+            out_ctx_params.dev_mem_consumer_counter->get());
+    }
+
+    using base::base;
 };
+
+} // namespace allreduce
+} // namespace ring
 } // namespace native
diff --git a/src/common/comm/l0/modules/ring/alltoallv_entry_module.hpp b/src/common/comm/l0/modules/ring/alltoallv_entry_module.hpp
index a412ed86b..e03917339 100644
--- a/src/common/comm/l0/modules/ring/alltoallv_entry_module.hpp
+++ b/src/common/comm/l0/modules/ring/alltoallv_entry_module.hpp
@@ -23,21 +23,21 @@ DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module,
                                  real_gpu_typed_module,
                                  ccl_coll_alltoallv,
                                  ccl::device_topology_type::ring,
-                                 ring_alltoallv_kernel,
-                                 ring_alltoallv_numa_kernel,
-                                 ring_alltoallv_scale_out_cpu_gw_kernel);
+                                 ring::alltoallv::main_kernel,
+                                 ring::alltoallv::numa_kernel,
+                                 ring::alltoallv::scale_out_cpu_gw_kernel);

 DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module,
                                  ipc_gpu_typed_module,
                                  ccl_coll_alltoallv,
                                  ccl::device_topology_type::ring,
-                                 ring_alltoallv_ipc,
-                                 ring_alltoallv_ipc,
-                                 ring_alltoallv_ipc);
+                                 ring::alltoallv::ipc_kernel,
+                                 ring::alltoallv::ipc_kernel,
+                                 ring::alltoallv::ipc_kernel);

 DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_alltoallv,
                                 ccl::device_topology_type::ring,
-                                ring_alltoallv_kernel,
-                                ring_alltoallv_numa_kernel,
-                                ring_alltoallv_scale_out_cpu_gw_kernel);
+                                ring::alltoallv::main_kernel,
+                                ring::alltoallv::numa_kernel,
+                                ring::alltoallv::scale_out_cpu_gw_kernel);
 } // namespace native
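Before the alltoallv export functions below, it is worth spelling out the positional contract that every ipc_kernel::bind_data() in this patch relies on: IPC handles arrive in a fixed order (for allreduce, 0 = exchanged tmp/recv buffer, 1 = left_wrote_to_me flag, 2 = i_ready_to_receive flag) and each pointer is reinterpreted to the matching descriptor's arg_type. The following sketch reproduces the ipc_handles.at(i).get().pointer call shape with assumed stand-in types (ipc_memory here is not the real handle payload type):

#include <cstdio>
#include <functional>
#include <vector>

struct ipc_memory { // stand-in for the real IPC handle payload
    void* pointer;
};

int main() {
    int buffer[8] = {};
    int income_data_flag = 0;
    int ready_to_recv_flag = 0;

    // reference_wrapper gives the same .at(i).get() shape as the patch
    std::vector<ipc_memory> payloads = { { buffer },
                                         { &income_data_flag },
                                         { &ready_to_recv_flag } };
    std::vector<std::reference_wrapper<ipc_memory>> ipc_handles(payloads.begin(),
                                                                payloads.end());

    // the index order is a cross-process contract: both peers must agree on it
    auto* recv = reinterpret_cast<int*>(ipc_handles.at(0).get().pointer);
    auto* income = reinterpret_cast<int*>(ipc_handles.at(1).get().pointer);
    auto* ready = reinterpret_cast<int*>(ipc_handles.at(2).get().pointer);

    *income = 1; // in the real code, peers observe these writes via the IPC mapping
    *ready = 1;
    std::printf("bound %p %p %p\n", (void*)recv, (void*)income, (void*)ready);
}

Note that the alltoallv ipc_kernel below extends the same convention with a fourth handle for its proxy_size_flag.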
diff --git a/src/common/comm/l0/modules/ring/alltoallv_export_functions.hpp b/src/common/comm/l0/modules/ring/alltoallv_export_functions.hpp
index 62cb09550..fe71d51a3 100644
--- a/src/common/comm/l0/modules/ring/alltoallv_export_functions.hpp
+++ b/src/common/comm/l0/modules/ring/alltoallv_export_functions.hpp
@@ -17,461 +17,273 @@
 #include "common/comm/l0/modules/kernel_functions.hpp"

 namespace native {
-template <class kernel_params>
-struct ring_alltoallv_kernel
-    : public execution_kernel<
-          ring_alltoallv_kernel<kernel_params>,
-          arg<main_kernel_args::args_start_index, size_t*>, // send_elem_counts
-          arg<main_kernel_args::args_start_index + 1, size_t*>, // send_elem_offsets
-          arg<main_kernel_args::args_start_index + 2, size_t*>, // recv_elem_counts_buf
-          arg<main_kernel_args::args_start_index + 3, size_t*>, // recv_elem_offsets_buf
-          arg<main_kernel_args::args_start_index + 4,
-              typename kernel_params::native_type*>, // send_buf
-          arg<main_kernel_args::args_start_index + 5,
-              typename kernel_params::native_type*>, // recv_buf
-          external_arg<main_kernel_args::args_start_index + 6,
-                       typename kernel_params::native_type*>, // tmp_buffer
-          thread_exchangable_arg<main_kernel_args::args_start_index + 7,
-                                 typename kernel_params::native_type*>, // right_temp_buffer
-          external_arg<main_kernel_args::args_start_index + 8,
-                       int*>, // left_wrote_to_me_flag
-          external_arg<main_kernel_args::args_start_index + 9,
-                       int*>, // i_ready_to_receive_flag
-          external_arg<main_kernel_args::args_start_index + 10,
-                       int*>, // proxy_size_flag
-          thread_exchangable_arg<main_kernel_args::args_start_index + 11,
-                                 int*>, // i_send_to_right_flag
-          thread_exchangable_arg<main_kernel_args::args_start_index + 12,
-                                 int*>, // right_ready_to_recv_flag
-          thread_exchangable_arg<main_kernel_args::args_start_index + 13,
-                                 int*>> // right_proxy_size_flag
+
+namespace ring {
+
+namespace alltoallv {
+
+/**
+ * Common args for all kernel types
+ */
+
+using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t*>;
+using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
+
+using send_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>;
+using send_elem_offsets_buf_arg_type = typename send_elem_offsets_buf_arg::arg_type;
+
+using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>;
+using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type;
+
+using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 3, size_t*>;
+using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type;
+
+template <class native_t>
+using send_buf_arg = arg<main_kernel_args::args_start_index + 4, native_t*>;
+
+template <class native_t>
+using recv_buf_arg = arg<main_kernel_args::args_start_index + 5, native_t*>;
+
+template <class native_t>
+using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 6, native_t*>;
+
+template <class native_t>
+using right_tmp_recv_buf_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 7, native_t*>;
+
+using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 8, int*>;
+using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
+
+using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 9, int*>;
+using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
+
+using proxy_size_flag_arg = external_arg<main_kernel_args::args_start_index + 10, int*>;
+using proxy_size_flag_arg_type = typename proxy_size_flag_arg::arg_type;
+
+using right_income_data_flag_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 11, int*>;
+
+using right_ready_to_recv_flag_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 12, int*>;
+
+using right_proxy_size_flag_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 13, int*>;
+
+// IMPORTANT: the number and types of arguments must be the same in all classes,
+// excluding arguments specific to numa/scaleout, etc.
+struct main_kernel
+    : public execution_kernel<main_kernel,
+                              send_buf_size_arg, // send_elem_counts
+                              send_elem_offsets_buf_arg, // send_elem_offsets
+                              recv_elem_counts_buf_arg, // recv_elem_counts_buf
+                              recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
+                              send_buf_arg<void>, // send_buf
+                              recv_buf_arg<void>, // recv_buf
+                              tmp_recv_buf_arg<void>, // tmp_buffer
+                              right_tmp_recv_buf_arg<void>, // right_temp_buffer
+                              income_data_flag_arg, // left_wrote_to_me_flag
+                              ready_to_recv_flag_arg, // i_ready_to_receive_flag
+                              proxy_size_flag_arg, // proxy_size_flag
+                              right_income_data_flag_arg, // i_send_to_right_flag
+                              right_ready_to_recv_flag_arg, // right_ready_to_recv_flag
+                              right_proxy_size_flag_arg> // right_proxy_size_flag
 {
-    using processing_type = typename kernel_params::native_type;
+    using processing_type = void;

     static constexpr const char* specific_name() {
         return "alltoallv_execution";
     }

-    // send_elem_counts
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t*>;
     using common_entry_buf_size_arg = send_buf_size_arg;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    // send_elem_offsets
-    using send_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>;
-    using send_elem_offsets_buf_arg_type = typename send_elem_offsets_buf_arg::arg_type;
-
-    // recv_elem_counts_buf
-    using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>;
-    using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type;
-
-    // recv_elem_offsets_buf
-    using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 3, size_t*>;
-    using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type;
-
-    // send_buf
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 4, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    // recv_buf
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 5, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    // tmp_buffer
-    using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 6, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    // right_temp_buffer
-    using right_tmp_recv_buf_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
-
-    // left_wrote_to_me_flag
-    using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 8, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    // i_ready_to_receive_flag
-    using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 9, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    // proxy_size_flag
-    using proxy_size_flag_arg = external_arg<main_kernel_args::args_start_index + 10, int*>;
-    using proxy_size_flag_arg_type = typename proxy_size_flag_arg::arg_type;
-
-    // i_send_to_right_flag
-    using right_income_data_flag_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 11, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    // right_ready_to_recv_flag
-    using right_ready_to_recv_flag_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 12, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
-
-    // right_proxy_size_flag
-    using right_proxy_size_flag_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 13, int*>;
-    using right_proxy_size_flag_type = typename right_proxy_size_flag_arg::arg_type;
-
-    using base = execution_kernel<ring_alltoallv_kernel<kernel_params>,
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
+
+    using base = execution_kernel<main_kernel,
                                   send_buf_size_arg, // 0 send_elem_counts
                                   send_elem_offsets_buf_arg, // 1 send_elem_offsets
                                   recv_elem_counts_buf_arg, // 2 recv_elem_counts
                                   recv_elem_offsets_buf_arg, // 3 recv_elem_offsets
-                                  send_buf_arg, // 4 send_buf_arg
-                                  recv_buf_arg, // 5 recv_buf_arg
-                                  tmp_recv_buf_arg, // 6 tmp_buffer
-                                  right_tmp_recv_buf_arg, // 7 right_temp_buffer
+                                  send_buf_arg<processing_type>, // 4 send_buf_arg
+                                  recv_buf_arg<processing_type>, // 5 recv_buf_arg
+                                  tmp_recv_buf_arg<processing_type>, // 6 tmp_buffer
+                                  right_tmp_recv_buf_arg<processing_type>, // 7 right_temp_buffer
                                   income_data_flag_arg, // 8 left_wrote_to_me_flag
                                   ready_to_recv_flag_arg, // 9 i_ready_to_receive_flag
                                   proxy_size_flag_arg, // 10 proxy_size_flag_arg
                                   right_income_data_flag_arg, // 11 i_send_to_right_flag
                                   right_ready_to_recv_flag_arg, // 12 right_ready_to_recv_flag
                                   right_proxy_size_flag_arg>; // 13 right_proxy_size_flag
+
+    using base::base;
 };

 // IMPORTANT: the params order is default, see *alltoallv*.cl for that
-template <class kernel_params>
-struct ring_alltoallv_numa_kernel
-    : public execution_kernel<
-          ring_alltoallv_numa_kernel<kernel_params>,
-          arg<main_kernel_args::args_start_index, size_t*>, // send_elem_counts
-          arg<main_kernel_args::args_start_index + 1, size_t*>, // send_elem_offsets
-          arg<main_kernel_args::args_start_index + 2, size_t*>, // recv_elem_counts_buf
-          arg<main_kernel_args::args_start_index + 3, size_t*>, // recv_elem_offsets_buf
-          arg<main_kernel_args::args_start_index + 4,
-              typename kernel_params::native_type*>, // send_buf
-          arg<main_kernel_args::args_start_index + 5,
-              typename kernel_params::native_type*>, // recv_buf
-          thread_safe_arg<main_kernel_args::args_start_index + 6,
-                          typename kernel_params::native_type*>, // tmp_buffer
-          thread_safe_arg<main_kernel_args::args_start_index + 7,
-                          typename kernel_params::native_type*>, // right_temp_buffer
-          thread_safe_arg<main_kernel_args::args_start_index + 8,
-                          int*>, // left_wrote_to_me_flag
-          thread_safe_arg<main_kernel_args::args_start_index + 9,
-                          int*>, // i_ready_to_receive_flag
-          thread_safe_arg<main_kernel_args::args_start_index + 10, int*>, // proxy_size_flag
-          thread_safe_arg<main_kernel_args::args_start_index + 11,
-                          int*>, // i_send_to_right_flag
-          thread_safe_arg<main_kernel_args::args_start_index + 12,
-                          int*>, // right_ready_to_recv_flag
-          thread_safe_arg<main_kernel_args::args_start_index + 13,
-                          int*>> // right_proxy_size_flag
+struct numa_kernel
+    : public execution_kernel<numa_kernel,
+                              send_buf_size_arg, // send_elem_counts
+                              send_elem_offsets_buf_arg, // send_elem_offsets
+                              recv_elem_counts_buf_arg, // recv_elem_counts_buf
+                              recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
+                              send_buf_arg<void>, // send_buf
+                              recv_buf_arg<void>, // recv_buf
+                              tmp_recv_buf_arg<void>, // tmp_buffer
+                              right_tmp_recv_buf_arg<void>, // right_temp_buffer
+                              income_data_flag_arg, // left_wrote_to_me_flag
+                              ready_to_recv_flag_arg, // i_ready_to_receive_flag
+                              proxy_size_flag_arg, // proxy_size_flag
+                              right_income_data_flag_arg, // i_send_to_right_flag
+                              right_ready_to_recv_flag_arg, // right_ready_to_recv_flag
+                              right_proxy_size_flag_arg> // right_proxy_size_flag
 {
-    using processing_type = typename kernel_params::native_type;
+    using processing_type = void;

     static constexpr const char* specific_name() {
         return "alltoallv_execution_numa";
     }

-    // send_elem_counts
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t*>;
     using common_entry_buf_size_arg = send_buf_size_arg;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    // send_elem_offsets
-    using send_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>;
-    using send_elem_offsets_buf_arg_type = typename send_elem_offsets_buf_arg::arg_type;
-
-    // recv_elem_counts_buf
-    using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>;
-    using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type;
-
-    // recv_elem_offsets_buf
-    using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 3, size_t*>;
-    using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type;
-
-    // send_buf
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 4, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    // recv_buf
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 5, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    // tmp_buffer
-    using tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 6, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    // right_temp_buffer
-    using right_tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
-
-    // left_wrote_to_me_flag
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    // i_ready_to_receive_flag
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    // proxy_size_flag
-    using proxy_size_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 10, int*>;
-    using proxy_size_flag_arg_type = typename proxy_size_flag_arg::arg_type;
-
-    // i_send_to_right_flag
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 11, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    // right_ready_to_recv_flag
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 12, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
-
-    // right_proxy_size_flag
-    using right_proxy_size_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 13, int*>;
-    using right_proxy_size_flag_type = typename right_proxy_size_flag_arg::arg_type;
-
-    using base = execution_kernel<ring_alltoallv_numa_kernel<kernel_params>,
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
+
+    using base = execution_kernel<numa_kernel,
                                   send_buf_size_arg, // 0 send_elem_counts
                                   send_elem_offsets_buf_arg, // 1 send_elem_offsets
                                   recv_elem_counts_buf_arg, // 2 recv_elem_counts
                                   recv_elem_offsets_buf_arg, // 3 recv_elem_offsets
-                                  send_buf_arg, // 4 send_buf_arg
-                                  recv_buf_arg, // 5 recv_buf_arg
-                                  tmp_recv_buf_arg, // 6 tmp_buffer
-                                  right_tmp_recv_buf_arg, // 7 right_temp_buffer
+                                  send_buf_arg<processing_type>, // 4 send_buf_arg
+                                  recv_buf_arg<processing_type>, // 5 recv_buf_arg
+                                  tmp_recv_buf_arg<processing_type>, // 6 tmp_buffer
+                                  right_tmp_recv_buf_arg<processing_type>, // 7 right_temp_buffer
                                   income_data_flag_arg, // 8 left_wrote_to_me_flag
                                   ready_to_recv_flag_arg, // 9 i_ready_to_receive_flag
                                   proxy_size_flag_arg, // 10 proxy_size_flag_arg
                                   right_income_data_flag_arg, // 11 i_send_to_right_flag
                                   right_ready_to_recv_flag_arg, // 12 right_ready_to_recv_flag
                                   right_proxy_size_flag_arg>; // 13 right_proxy_size_flag
+
+    template <class ctx_params_t>
+    void bind_data(const ctx_params_t& out_ctx_params) {
+        // TODO not implemented
+        (void)out_ctx_params;
+        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
+    }
+
+    using base::base;
 };

-template <class kernel_params>
-struct ring_alltoallv_ipc
-    : public ipc_kernel<
-          ring_alltoallv_ipc<kernel_params>,
-          arg<main_kernel_args::args_start_index, size_t*>, // send_elem_counts
-          arg<main_kernel_args::args_start_index + 1, size_t*>, // send_elem_offsets
-          arg<main_kernel_args::args_start_index + 2, size_t*>, // recv_elem_counts_buf
-          arg<main_kernel_args::args_start_index + 3, size_t*>, // recv_elem_offsets_buf
-          arg<main_kernel_args::args_start_index + 4,
-              typename kernel_params::native_type*>, // send_buf
-          arg<main_kernel_args::args_start_index + 5,
-              typename kernel_params::native_type*>, // recv_buf
-          thread_safe_arg<main_kernel_args::args_start_index + 6,
-                          typename kernel_params::native_type*>, // tmp_buffer
-          thread_safe_arg<main_kernel_args::args_start_index + 7,
-                          typename kernel_params::native_type*>, // right_temp_buffer
-          thread_safe_arg<main_kernel_args::args_start_index + 8,
-                          int*>, // left_wrote_to_me_flag
-          thread_safe_arg<main_kernel_args::args_start_index + 9,
-                          int*>, // i_ready_to_receive_flag
-          thread_safe_arg<main_kernel_args::args_start_index + 10, int*>, // proxy_size_flag
-          thread_safe_arg<main_kernel_args::args_start_index + 11,
-                          int*>, // i_send_to_right_flag
-          thread_safe_arg<main_kernel_args::args_start_index + 12,
-                          int*>, // right_ready_to_recv_flag
-          thread_safe_arg<main_kernel_args::args_start_index + 13,
-                          int*>> // right_proxy_size_flag
+struct ipc_kernel : public base_ipc_kernel<ipc_kernel,
+                                           send_buf_size_arg, // send_elem_counts
+                                           send_elem_offsets_buf_arg, // send_elem_offsets
+                                           recv_elem_counts_buf_arg, // recv_elem_counts_buf
+                                           recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
+                                           send_buf_arg<void>, // send_buf
+                                           recv_buf_arg<void>, // recv_buf
+                                           tmp_recv_buf_arg<void>, // tmp_buffer
+                                           right_tmp_recv_buf_arg<void>, // right_temp_buffer
+                                           income_data_flag_arg, // left_wrote_to_me_flag
+                                           ready_to_recv_flag_arg, // i_ready_to_receive_flag
+                                           proxy_size_flag_arg, // proxy_size_flag
+                                           right_income_data_flag_arg, // i_send_to_right_flag
+                                           right_ready_to_recv_flag_arg, // right_ready_to_recv_flag
+                                           right_proxy_size_flag_arg> // right_proxy_size_flag
 {
-    using processing_type = typename kernel_params::native_type;
+    using processing_type = void;

     static constexpr const char* specific_name() {
         return "ring_alltoallv_ipc";
     }

-    // send_elem_counts
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t*>;
     using common_entry_buf_size_arg = send_buf_size_arg;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    // send_elem_offsets
-    using send_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>;
-    using send_elem_offsets_buf_arg_type = typename send_elem_offsets_buf_arg::arg_type;
-
-    // recv_elem_counts_buf
-    using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>;
-    using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type;
-
-    // recv_elem_offsets_buf
-    using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 3, size_t*>;
-    using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type;
-
-    // send_buf
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 4, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    // recv_buf
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 5, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    // tmp_buffer
-    using tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 6, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    // right_temp_buffer
-    using right_tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
-
-    // left_wrote_to_me_flag
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    // i_ready_to_receive_flag
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    // proxy_size_flag
-    using proxy_size_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 10, int*>;
-    using proxy_size_flag_arg_type = typename proxy_size_flag_arg::arg_type;
-
-    // i_send_to_right_flag
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 11, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    // right_ready_to_recv_flag
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 12, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
-
-    // right_proxy_size_flag
-    using right_proxy_size_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 13, int*>;
-    using right_proxy_size_flag_type = typename right_proxy_size_flag_arg::arg_type;
-
-    using base = execution_kernel<ring_alltoallv_ipc<kernel_params>,
-                                  send_buf_size_arg, // 0 send_elem_counts
-                                  send_elem_offsets_buf_arg, // 1 send_elem_offsets
-                                  recv_elem_counts_buf_arg, // 2 recv_elem_counts
-                                  recv_elem_offsets_buf_arg, // 3 recv_elem_offsets
-                                  send_buf_arg, // 4 send_buf_arg
-                                  recv_buf_arg, // 5 recv_buf_arg
-                                  tmp_recv_buf_arg, // 6 tmp_buffer
-                                  right_tmp_recv_buf_arg, // 7 right_temp_buffer
-                                  income_data_flag_arg, // 8 left_wrote_to_me_flag
-                                  ready_to_recv_flag_arg, // 9 i_ready_to_receive_flag
-                                  proxy_size_flag_arg, // 10 proxy_size_flag_arg
-                                  right_income_data_flag_arg, // 11 i_send_to_right_flag
-                                  right_ready_to_recv_flag_arg, // 12 right_ready_to_recv_flag
-                                  right_proxy_size_flag_arg>; // 13 right_proxy_size_flag
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
+
+    using base = base_ipc_kernel<ipc_kernel,
+                                 send_buf_size_arg, // 0 send_elem_counts
+                                 send_elem_offsets_buf_arg, // 1 send_elem_offsets
+                                 recv_elem_counts_buf_arg, // 2 recv_elem_counts
+                                 recv_elem_offsets_buf_arg, // 3 recv_elem_offsets
+                                 send_buf_arg<processing_type>, // 4 send_buf_arg
+                                 recv_buf_arg<processing_type>, // 5 recv_buf_arg
+                                 tmp_recv_buf_arg<processing_type>, // 6 tmp_buffer
+                                 right_tmp_recv_buf_arg<processing_type>, // 7 right_temp_buffer
+                                 income_data_flag_arg, // 8 left_wrote_to_me_flag
+                                 ready_to_recv_flag_arg, // 9 i_ready_to_receive_flag
+                                 proxy_size_flag_arg, // 10 proxy_size_flag_arg
+                                 right_income_data_flag_arg, // 11 i_send_to_right_flag
+                                 right_ready_to_recv_flag_arg, // 12 right_ready_to_recv_flag
+                                 right_proxy_size_flag_arg>; // 13 right_proxy_size_flag
+
+    template <class ipc_handles_t>
+    void bind_data(const ipc_handles_t& ipc_handles) {
+        auto tmp_recv_buf = reinterpret_cast<typename tmp_recv_buf_arg<processing_type>::arg_type>(
+            ipc_handles.at(0).get().pointer);
+        this->template set_arg<tmp_recv_buf_arg<processing_type>>(tmp_recv_buf);
+
+        auto income_data_flag =
+            reinterpret_cast<income_data_flag_arg_type>(ipc_handles.at(1).get().pointer);
+        this->template set_arg<income_data_flag_arg>(income_data_flag);
+
+        auto ready_to_recv_flag =
+            reinterpret_cast<ready_to_recv_flag_arg_type>(ipc_handles.at(2).get().pointer);
+        this->template set_arg<ready_to_recv_flag_arg>(ready_to_recv_flag);
+
+        auto proxy_size_flag =
+            reinterpret_cast<proxy_size_flag_arg_type>(ipc_handles.at(3).get().pointer);
+        this->template set_arg<proxy_size_flag_arg>(proxy_size_flag);
+    }
+
+    using base::base;
 };
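A note on the processing_type = void move visible throughout these kernels: erasing the element type lets one non-template kernel class serve every datatype, while the templated aliases (send_buf_arg<native_t> and friends) keep a typed view available at the same argument slot. A small sketch of that idea follows; the arg template here is a stand-in for the real descriptor, slot index 4 is only an example, and the rationale is inferred from the patch rather than stated by it.

#include <cstddef>

template <std::size_t pos, class T>
struct arg {
    static constexpr std::size_t index = pos;
    using arg_type = T;
};

template <class native_t>
using send_buf_arg = arg<4, native_t*>; // same slot, any element type

static_assert(send_buf_arg<void>::index == send_buf_arg<float>::index,
              "erased and typed views name the same kernel slot");

int main() {
    float payload[2] = { 1.f, 2.f };
    // erased slot type: one kernel class serves every datatype
    send_buf_arg<void>::arg_type erased = payload;
    // a typed view can be recovered where the datatype is known
    auto* typed = static_cast<send_buf_arg<float>::arg_type>(erased);
    return typed[0] > 0 ? 0 : 1;
}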

-template <class kernel_params>
-struct ring_alltoallv_scale_out_cpu_gw_kernel
-    : public execution_kernel<
-          ring_alltoallv_scale_out_cpu_gw_kernel<kernel_params>,
-          arg<main_kernel_args::args_start_index, size_t*>, // send_elem_counts
-          arg<main_kernel_args::args_start_index + 1, size_t*>, // send_elem_offsets
-          arg<main_kernel_args::args_start_index + 2, size_t*>, // recv_elem_counts_buf
-          arg<main_kernel_args::args_start_index + 3, size_t*>, // recv_elem_offsets_buf
-          arg<main_kernel_args::args_start_index + 4,
-              typename kernel_params::native_type*>, // send_buf
-          arg<main_kernel_args::args_start_index + 5,
-              typename kernel_params::native_type*>, // recv_buf
-          thread_safe_arg<main_kernel_args::args_start_index + 6,
-                          typename kernel_params::native_type*>, // tmp_buffer
-          thread_safe_arg<main_kernel_args::args_start_index + 7,
-                          typename kernel_params::native_type*>, // right_temp_buffer
-          thread_safe_arg<main_kernel_args::args_start_index + 8,
-                          int*>, // left_wrote_to_me_flag
-          thread_safe_arg<main_kernel_args::args_start_index + 9,
-                          int*>, // i_ready_to_receive_flag
-          thread_safe_arg<main_kernel_args::args_start_index + 10, int*>, // proxy_size_flag
-          thread_safe_arg<main_kernel_args::args_start_index + 11,
-                          int*>, // i_send_to_right_flag
-          thread_safe_arg<main_kernel_args::args_start_index + 12,
-                          int*>, // right_ready_to_recv_flag
-          thread_safe_arg<main_kernel_args::args_start_index + 13,
-                          int*>> // right_proxy_size_flag
+struct scale_out_cpu_gw_kernel
+    : public execution_kernel<scale_out_cpu_gw_kernel,
+                              send_buf_size_arg, // send_elem_counts
+                              send_elem_offsets_buf_arg, // send_elem_offsets
+                              recv_elem_counts_buf_arg, // recv_elem_counts_buf
+                              recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
+                              send_buf_arg<void>, // send_buf
+                              recv_buf_arg<void>, // recv_buf
+                              tmp_recv_buf_arg<void>, // tmp_buffer
+                              right_tmp_recv_buf_arg<void>, // right_temp_buffer
+                              income_data_flag_arg, // left_wrote_to_me_flag
+                              ready_to_recv_flag_arg, // i_ready_to_receive_flag
+                              proxy_size_flag_arg, // proxy_size_flag
+                              right_income_data_flag_arg, // i_send_to_right_flag
+                              right_ready_to_recv_flag_arg, // right_ready_to_recv_flag
+                              right_proxy_size_flag_arg> // right_proxy_size_flag
 {
-    using param_t = kernel_params;
-    using processing_type = typename param_t::native_type;
+    using processing_type = void;

     static constexpr const char* specific_name() {
         return "alltoallv_execution_scale_out_cpu_gw";
     }

-    // send_elem_counts
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t*>;
     using common_entry_buf_size_arg = send_buf_size_arg;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    // send_elem_offsets
-    using send_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>;
-    using send_elem_offsets_buf_arg_type = typename send_elem_offsets_buf_arg::arg_type;
-
-    // recv_elem_counts_buf
-    using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>;
-    using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type;
-
-    // recv_elem_offsets_buf
-    using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 3, size_t*>;
-    using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type;
-
-    // send_buf
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 4, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    // recv_buf
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 5, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    // tmp_buffer
-    using tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 6, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    // right_temp_buffer
-    using right_tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
-
-    // left_wrote_to_me_flag
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    // i_ready_to_receive_flag
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    // proxy_size_flag
-    using proxy_size_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 10, int*>;
-    using proxy_size_flag_arg_type = typename proxy_size_flag_arg::arg_type;
-
-    // i_send_to_right_flag
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 11, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    // right_ready_to_recv_flag
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 12, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
-
-    // right_proxy_size_flag
-    using right_proxy_size_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 13, int*>;
-    using right_proxy_size_flag_type = typename right_proxy_size_flag_arg::arg_type;
-
-    using base = execution_kernel<ring_alltoallv_scale_out_cpu_gw_kernel<kernel_params>,
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
+
+    using base = execution_kernel<scale_out_cpu_gw_kernel,
                                   send_buf_size_arg, // 0 send_elem_counts
                                   send_elem_offsets_buf_arg, // 1 send_elem_offsets
                                   recv_elem_counts_buf_arg, // 2 recv_elem_counts
                                   recv_elem_offsets_buf_arg, // 3 recv_elem_offsets
-                                  send_buf_arg, // 4 send_buf_arg
-                                  recv_buf_arg, // 5 recv_buf_arg
-                                  tmp_recv_buf_arg, // 6 tmp_buffer
-                                  right_tmp_recv_buf_arg, // 7 right_temp_buffer
+                                  send_buf_arg<processing_type>, // 4 send_buf_arg
+                                  recv_buf_arg<processing_type>, // 5 recv_buf_arg
+                                  tmp_recv_buf_arg<processing_type>, // 6 tmp_buffer
+                                  right_tmp_recv_buf_arg<processing_type>, // 7 right_temp_buffer
                                   income_data_flag_arg, // 8 left_wrote_to_me_flag
                                   ready_to_recv_flag_arg, // 9 i_ready_to_receive_flag
                                   proxy_size_flag_arg, // 10 proxy_size_flag_arg
                                   right_income_data_flag_arg, // 11 i_send_to_right_flag
                                   right_ready_to_recv_flag_arg, // 12 right_ready_to_recv_flag
                                   right_proxy_size_flag_arg>; // 13 right_proxy_size_flag
+
+    template <class ctx_params_t>
+    void bind_data(const ctx_params_t& out_ctx_params) {
+        // TODO not implemented
+        (void)out_ctx_params;
+        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
+    }
+
+    using base::base;
 };
+
+} // namespace alltoallv
+} // namespace ring
 } // namespace native
diff --git a/src/common/comm/l0/modules/ring/bcast_entry_module.hpp b/src/common/comm/l0/modules/ring/bcast_entry_module.hpp
index c87003dda..c308d25db 100644
--- a/src/common/comm/l0/modules/ring/bcast_entry_module.hpp
+++ b/src/common/comm/l0/modules/ring/bcast_entry_module.hpp
@@ -23,21 +23,21 @@ DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module,
                                  real_gpu_typed_module,
                                  ccl_coll_bcast,
                                  ccl::device_topology_type::ring,
-                                 ring_bcast_kernel,
-                                 ring_bcast_numa_kernel,
-                                 ring_bcast_scale_out_cpu_gw_kernel);
+                                 ring::bcast::main_kernel,
+                                 ring::bcast::numa_kernel,
+                                 ring::bcast::scale_out_cpu_gw_kernel);

 DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module,
                                  ipc_gpu_typed_module,
                                  ccl_coll_bcast,
                                  ccl::device_topology_type::ring,
-                                 ring_bcast_ipc,
-                                 ring_bcast_ipc,
-                                 ring_bcast_ipc);
+                                 ring::bcast::ipc_kernel,
+                                 ring::bcast::ipc_kernel,
+                                 ring::bcast::ipc_kernel);

 DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_bcast,
                                 ccl::device_topology_type::ring,
-                                ring_bcast_kernel,
-                                ring_bcast_numa_kernel,
-                                ring_bcast_scale_out_cpu_gw_kernel);
+                                ring::bcast::main_kernel,
+                                ring::bcast::numa_kernel,
+                                ring::bcast::scale_out_cpu_gw_kernel);
 } // namespace native
diff --git a/src/common/comm/l0/modules/ring/bcast_export_functions.hpp b/src/common/comm/l0/modules/ring/bcast_export_functions.hpp
index a2f157c86..1d9a610ac 100644
--- a/src/common/comm/l0/modules/ring/bcast_export_functions.hpp
+++ b/src/common/comm/l0/modules/ring/bcast_export_functions.hpp
@@ -17,263 +17,233 @@
 #include "common/comm/l0/modules/kernel_functions.hpp"

 namespace native {
-template <class kernel_params>
-struct ring_bcast_kernel : public execution_kernel<
-                               ring_bcast_kernel<kernel_params>,
-                               arg<main_kernel_args::args_start_index, size_t>,
-                               thread_exchangable_arg<main_kernel_args::args_start_index + 1,
-                                                      typename kernel_params::native_type*>,
-                               external_arg<main_kernel_args::args_start_index + 2, int*>,
-                               external_arg<main_kernel_args::args_start_index + 3, int*>,
-                               arg<main_kernel_args::args_start_index + 4, int*>,
-                               thread_exchangable_arg<main_kernel_args::args_start_index + 5,
-                                                      typename kernel_params::native_type*>,
-                               thread_exchangable_arg<main_kernel_args::args_start_index + 6, int*>,
-                               thread_exchangable_arg<main_kernel_args::args_start_index + 7, int*>,
-                               arg<main_kernel_args::args_start_index + 8, size_t>> {
-    using processing_type = typename kernel_params::native_type;
-    static constexpr const char* specific_name() {
-        return "bcast_execution";
- } +namespace ring { - //own - using buf_size_arg = arg<main_kernel_args::args_start_index, size_t>; - using common_entry_buf_size_arg = buf_size_arg; - using buf_size_arg_type = typename buf_size_arg::arg_type; +namespace bcast { + +/** + * Common args for all kernel types + */ + +using buf_size_arg = arg<main_kernel_args::args_start_index, size_t>; +using buf_size_arg_type = typename buf_size_arg::arg_type; + +template <class native_t> +using buf_arg = thread_exchangable_arg<main_kernel_args::args_start_index + 1, native_t*>; - using buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>; - using common_entry_buf_arg = buf_arg; - using buf_arg_type = typename buf_arg::arg_type; +using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 2, int*>; +using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; - using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 2, int*>; - using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; +using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 3, int*>; +using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; - using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 3, int*>; - using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; +using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 4, int*>; +using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type; - using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 4, int*>; - using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type; +template <class native_t> +using right_buf_arg = thread_exchangable_arg<main_kernel_args::args_start_index + 5, native_t*>; - //right - using right_buf_arg = - thread_exchangable_arg<main_kernel_args::args_start_index + 5, processing_type*>; - using right_buf_arg_type = typename right_buf_arg::arg_type; +using right_income_data_flag_arg = + thread_exchangable_arg<main_kernel_args::args_start_index + 6, int*>; +using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type; - using right_income_data_flag_arg = - thread_exchangable_arg<main_kernel_args::args_start_index + 6, int*>; - using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type; +using right_ready_to_recv_flag_arg = + thread_exchangable_arg<main_kernel_args::args_start_index + 7, int*>; +using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type; - using right_ready_to_recv_flag_arg = - thread_exchangable_arg<main_kernel_args::args_start_index + 7, int*>; - using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type; +using root_arg = arg<main_kernel_args::args_start_index + 8, size_t>; +using root_arg_type = typename root_arg::arg_type; - using root_arg = arg<main_kernel_args::args_start_index + 8, size_t>; - using root_arg_type = typename root_arg::arg_type; +// IMPORTANT: the number and types of arguments must be the same in all classes, +// excluding arguments specific for numa/scaleout etc. 
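[editor's note] The block above is the heart of this refactoring: each `arg` / `external_arg` / `thread_exchangable_arg` alias pins one kernel-argument index to one C++ type at compile time, and because the aliases are now declared once per collective (here in `ring::bcast`), every kernel flavor that lists them agrees on the slot numbering by construction. Below is a minimal, self-contained sketch of that pattern; `slot`, `toy_kernel`, `pack_pos`, and `set_arg` are invented stand-ins for illustration, not the real `execution_kernel` machinery from kernel_functions.hpp.

```cpp
// Toy model of position-indexed kernel argument slots (illustrative only).
#include <cstddef>
#include <iostream>
#include <tuple>

constexpr std::size_t args_start_index = 2; // pretend slots 0..1 are reserved

// Binds an argument index to a C++ type at compile time, in the spirit of
// arg<> / external_arg<> / thread_exchangable_arg<> above.
template <std::size_t pos, class T>
struct slot {
    static constexpr std::size_t index = pos;
    using arg_type = T;
};

// Aliases declared once and shared by every kernel flavor, mirroring
// buf_size_arg / buf_arg<native_t> above.
using buf_size_arg = slot<args_start_index, std::size_t>;
template <class native_t>
using buf_arg = slot<args_start_index + 1, native_t*>;

// A toy "kernel" that stores one value per slot; the real code would instead
// forward each value to the driver at argument position S::index.
template <class... slots_t>
struct toy_kernel {
    std::tuple<typename slots_t::arg_type...> values{};

    // Find where slot S sits inside this kernel's slot pack.
    template <class S>
    static constexpr std::size_t pack_pos() {
        constexpr std::size_t idx[] = { slots_t::index... };
        for (std::size_t i = 0; i < sizeof...(slots_t); ++i)
            if (idx[i] == S::index)
                return i;
        return 0; // unreachable when S is one of slots_t
    }

    template <class S>
    void set_arg(typename S::arg_type value) {
        std::get<pack_pos<S>()>(values) = value;
    }
};

int main() {
    float send[4] = { 1.f, 2.f, 3.f, 4.f };
    toy_kernel<buf_size_arg, buf_arg<float>> k;
    k.set_arg<buf_size_arg>(4);      // lands in slot args_start_index
    k.set_arg<buf_arg<float>>(send); // lands in slot args_start_index + 1
    std::cout << "size slot holds " << std::get<0>(k.values) << "\n";
    return 0;
}
```

The payoff visible throughout this diff: the numa/scaleout kernels can append their extra arguments at higher indices without disturbing the shared slots, which is exactly what the IMPORTANT comment above demands.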
+struct main_kernel : public execution_kernel<main_kernel, + buf_size_arg, + buf_arg<void>, + income_data_flag_arg, + ready_to_recv_flag_arg, + local_barrier_flag_arg, + right_buf_arg<void>, + right_income_data_flag_arg, + right_ready_to_recv_flag_arg, + root_arg> { + using processing_type = void; - using base = execution_kernel<ring_bcast_kernel<kernel_params>, + static constexpr const char* specific_name() { + return "bcast_execution"; + } + + using common_entry_buf_size_arg = buf_size_arg; + using common_entry_buf_arg = buf_arg<processing_type>; + + using base = execution_kernel<main_kernel, buf_size_arg, - buf_arg, + buf_arg<processing_type>, income_data_flag_arg, ready_to_recv_flag_arg, local_barrier_flag_arg, - right_buf_arg, + right_buf_arg<processing_type>, right_income_data_flag_arg, right_ready_to_recv_flag_arg, root_arg>; + + using base::base; }; -template <class kernel_params> -struct ring_bcast_numa_kernel - : public execution_kernel<ring_bcast_numa_kernel<kernel_params>, - arg<main_kernel_args::args_start_index, size_t>, - thread_safe_arg<main_kernel_args::args_start_index + 1, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 2, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 3, int*>, - arg<main_kernel_args::args_start_index + 4, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 5, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 6, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 7, int*>, - arg<main_kernel_args::args_start_index + 8, size_t>, - - thread_safe_arg<main_kernel_args::args_start_index + 9, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 10, int*>> { - using processing_type = typename kernel_params::native_type; +struct numa_kernel + : public execution_kernel<numa_kernel, + buf_size_arg, + buf_arg<void>, + income_data_flag_arg, + ready_to_recv_flag_arg, + local_barrier_flag_arg, + right_buf_arg<void>, + right_income_data_flag_arg, + right_ready_to_recv_flag_arg, + root_arg, + + // numa-specific args + permanent_arg<main_kernel_args::args_start_index + 9, void*>, + permanent_arg<main_kernel_args::args_start_index + 10, int*>> { + using processing_type = void; static constexpr const char* specific_name() { return "bcast_execution_numa"; } - //own - using buf_size_arg = arg<main_kernel_args::args_start_index, size_t>; - using buf_size_arg_type = typename buf_size_arg::arg_type; - - using buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>; - using buf_arg_type = typename buf_arg::arg_type; - - using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 2, int*>; - using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; - - using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 3, int*>; - using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; - - using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 4, int*>; - using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type; - - //right - using right_buf_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, processing_type*>; - using right_buf_arg_type = typename right_buf_arg::arg_type; - - using right_income_data_flag_arg = - thread_safe_arg<main_kernel_args::args_start_index + 6, int*>; - using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type; - - using 
right_ready_to_recv_flag_arg = - thread_safe_arg<main_kernel_args::args_start_index + 7, int*>; - using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type; - - using root_arg = arg<main_kernel_args::args_start_index + 8, size_t>; - using root_arg_type = typename root_arg::arg_type; + using common_entry_buf_arg = buf_arg<processing_type>; // event data - using event_prod_chunk_mem_arg = thread_safe_arg<main_kernel_args::args_start_index + 9, - typename kernel_params::native_type*>; + using event_prod_chunk_mem_arg = permanent_arg<main_kernel_args::args_start_index + 9, void*>; using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type; - using event_prod_bytes_arg = thread_safe_arg<main_kernel_args::args_start_index + 10, int*>; + using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 10, int*>; using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type; - using base = execution_kernel<ring_bcast_numa_kernel<kernel_params>, + using base = execution_kernel<numa_kernel, buf_size_arg, - buf_arg, + buf_arg<processing_type>, income_data_flag_arg, ready_to_recv_flag_arg, local_barrier_flag_arg, - right_buf_arg, + right_buf_arg<processing_type>, right_income_data_flag_arg, right_ready_to_recv_flag_arg, root_arg, event_prod_chunk_mem_arg, event_prod_bytes_arg>; + + template <class ctx_params_t> + void bind_data(const ctx_params_t& out_ctx_params) { + // TODO not implemented + (void)out_ctx_params; + throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type"); + } + + using base::base; }; -template <class kernel_params> -struct ring_bcast_ipc - : public ipc_kernel<ring_bcast_ipc<kernel_params>, - stub_arg<main_kernel_args::args_start_index>, - thread_safe_arg<main_kernel_args::args_start_index + 1, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 2, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 3, int*>, - stub_arg<main_kernel_args::args_start_index + 4>, - stub_arg<main_kernel_args::args_start_index + 5>, - stub_arg<main_kernel_args::args_start_index + 6>, - stub_arg<main_kernel_args::args_start_index + 7>> { - using processing_type = typename kernel_params::native_type; +struct ipc_kernel : public base_ipc_kernel<ipc_kernel, + stub_arg<main_kernel_args::args_start_index>, + buf_arg<void>, + income_data_flag_arg, + ready_to_recv_flag_arg, + stub_arg<main_kernel_args::args_start_index + 4>, + stub_arg<main_kernel_args::args_start_index + 5>, + stub_arg<main_kernel_args::args_start_index + 6>, + stub_arg<main_kernel_args::args_start_index + 7>, + stub_arg<main_kernel_args::args_start_index + 8>> { + using processing_type = void; static constexpr const char* specific_name() { return "ring_bcast_ipc"; } - using recv_buf_arg = typename ring_bcast_kernel<kernel_params>::buf_arg; - using recv_buf_arg_type = typename recv_buf_arg::arg_type; - - using income_data_flag_arg = typename ring_bcast_kernel<kernel_params>::income_data_flag_arg; - using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; + using common_entry_buf_arg = buf_arg<processing_type>; + + using base = base_ipc_kernel<ipc_kernel, + stub_arg<main_kernel_args::args_start_index>, + buf_arg<processing_type>, + income_data_flag_arg, + ready_to_recv_flag_arg, + stub_arg<main_kernel_args::args_start_index + 4>, + stub_arg<main_kernel_args::args_start_index + 5>, + stub_arg<main_kernel_args::args_start_index + 6>, + 
stub_arg<main_kernel_args::args_start_index + 7>, + stub_arg<main_kernel_args::args_start_index + 8>>; + + template <class ipc_handles_t> + void bind_data(const ipc_handles_t& ipc_handles) { + auto recv_buf = reinterpret_cast<typename buf_arg<processing_type>::arg_type>( + ipc_handles.at(0).get().pointer); + this->template set_arg<buf_arg<processing_type>>(recv_buf); + + auto income_data_flag = + reinterpret_cast<income_data_flag_arg_type>(ipc_handles.at(1).get().pointer); + this->template set_arg<income_data_flag_arg>(income_data_flag); + + auto ready_to_recv_flag = + reinterpret_cast<ready_to_recv_flag_arg_type>(ipc_handles.at(2).get().pointer); + this->template set_arg<ready_to_recv_flag_arg>(ready_to_recv_flag); + } - using ready_to_recv_flag_arg = - typename ring_bcast_kernel<kernel_params>::ready_to_recv_flag_arg; - using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; + using base::base; +}; - using base = execution_kernel<ring_bcast_ipc<kernel_params>, - stub_arg<main_kernel_args::args_start_index>, - recv_buf_arg, +struct scale_out_cpu_gw_kernel + : public execution_kernel<scale_out_cpu_gw_kernel, + buf_size_arg, + buf_arg<void>, income_data_flag_arg, ready_to_recv_flag_arg, - stub_arg<main_kernel_args::args_start_index + 4>, - stub_arg<main_kernel_args::args_start_index + 5>, - stub_arg<main_kernel_args::args_start_index + 6>, - stub_arg<main_kernel_args::args_start_index + 7>>; -}; + local_barrier_flag_arg, + right_buf_arg<void>, + right_income_data_flag_arg, + right_ready_to_recv_flag_arg, + root_arg, -template <class kernel_params> -struct ring_bcast_scale_out_cpu_gw_kernel - : public execution_kernel<ring_bcast_scale_out_cpu_gw_kernel<kernel_params>, - arg<main_kernel_args::args_start_index, size_t>, - thread_safe_arg<main_kernel_args::args_start_index + 1, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 2, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 3, int*>, - arg<main_kernel_args::args_start_index + 4, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 5, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 6, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 7, int*>, - arg<main_kernel_args::args_start_index + 8, size_t>, - - thread_safe_arg<main_kernel_args::args_start_index + 9, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 10, int*>> { - using param_t = kernel_params; - using processing_type = typename param_t::native_type; + // scaleout-specific args + permanent_arg<main_kernel_args::args_start_index + 9, void*>, + permanent_arg<main_kernel_args::args_start_index + 10, int*>> { + using processing_type = void; static constexpr const char* specific_name() { return "bcast_execution_scale_out_cpu_gw"; } - //own - using buf_size_arg = arg<main_kernel_args::args_start_index, size_t>; - using buf_size_arg_type = typename buf_size_arg::arg_type; - - using buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>; - using buf_arg_type = typename buf_arg::arg_type; - - using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 2, int*>; - using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; - - using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 3, int*>; - using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; - - using local_barrier_flag_arg = 
arg<main_kernel_args::args_start_index + 4, int*>; - using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type; - - //right - using right_buf_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, processing_type*>; - using right_buf_arg_type = typename right_buf_arg::arg_type; - - using right_income_data_flag_arg = - thread_safe_arg<main_kernel_args::args_start_index + 6, int*>; - using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type; - - using right_ready_to_recv_flag_arg = - thread_safe_arg<main_kernel_args::args_start_index + 7, int*>; - using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type; - - using root_arg = arg<main_kernel_args::args_start_index + 8, size_t>; - using root_arg_type = typename root_arg::arg_type; + using common_entry_buf_arg = buf_arg<processing_type>; // event data using event_prod_chunk_mem_arg = - thread_safe_arg<main_kernel_args::args_start_index + 9, processing_type*>; + permanent_arg<main_kernel_args::args_start_index + 9, processing_type*>; using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type; - using event_prod_bytes_arg = thread_safe_arg<main_kernel_args::args_start_index + 10, int*>; + using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 10, int*>; using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type; - using base = execution_kernel<ring_bcast_scale_out_cpu_gw_kernel<kernel_params>, + using base = execution_kernel<scale_out_cpu_gw_kernel, buf_size_arg, - buf_arg, + buf_arg<processing_type>, income_data_flag_arg, ready_to_recv_flag_arg, local_barrier_flag_arg, - right_buf_arg, + right_buf_arg<processing_type>, right_income_data_flag_arg, right_ready_to_recv_flag_arg, root_arg, event_prod_chunk_mem_arg, event_prod_bytes_arg>; + + template <class ctx_params_t> + void bind_data(const ctx_params_t& out_ctx_params) { + // TODO not implemented + (void)out_ctx_params; + throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type"); + } + + using base::base; }; + +} // namespace bcast +} // namespace ring } // namespace native diff --git a/src/common/comm/l0/modules/ring/reduce_entry_module.hpp b/src/common/comm/l0/modules/ring/reduce_entry_module.hpp index c9a813038..9f3241e33 100644 --- a/src/common/comm/l0/modules/ring/reduce_entry_module.hpp +++ b/src/common/comm/l0/modules/ring/reduce_entry_module.hpp @@ -23,21 +23,21 @@ DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module, real_gpu_typed_module, ccl_coll_reduce, ccl::device_topology_type::ring, - ring_reduce_kernel, - ring_reduce_numa_kernel, - ring_reduce_scale_out_cpu_gw_kernel); + ring::reduce::main_kernel, + ring::reduce::numa_kernel, + ring::reduce::scale_out_cpu_gw_kernel); DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module, ipc_gpu_typed_module, ccl_coll_reduce, ccl::device_topology_type::ring, - ring_reduce_ipc, - ring_reduce_ipc, - ring_reduce_ipc); + ring::reduce::ipc_kernel, + ring::reduce::ipc_kernel, + ring::reduce::ipc_kernel); DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_reduce, ccl::device_topology_type::ring, - ring_reduce_kernel, - ring_reduce_numa_kernel, - ring_reduce_scale_out_cpu_gw_kernel); + ring::reduce::main_kernel, + ring::reduce::numa_kernel, + ring::reduce::scale_out_cpu_gw_kernel); } // namespace native diff --git a/src/common/comm/l0/modules/ring/reduce_export_functions.hpp b/src/common/comm/l0/modules/ring/reduce_export_functions.hpp index 55fe5a569..a07e27087 100644 --- 
a/src/common/comm/l0/modules/ring/reduce_export_functions.hpp +++ b/src/common/comm/l0/modules/ring/reduce_export_functions.hpp @@ -17,315 +17,256 @@ #include "common/comm/l0/modules/kernel_functions.hpp" namespace native { -template <class kernel_params> -struct ring_reduce_kernel - : public execution_kernel< - ring_reduce_kernel<kernel_params>, - arg<main_kernel_args::args_start_index, size_t>, - arg<main_kernel_args::args_start_index + 1, typename kernel_params::native_type*>, - arg<main_kernel_args::args_start_index + 2, typename kernel_params::native_type*>, - external_arg<main_kernel_args::args_start_index + 3, - typename kernel_params::native_type*>, - external_arg<main_kernel_args::args_start_index + 4, int*>, - external_arg<main_kernel_args::args_start_index + 5, int*>, - arg<main_kernel_args::args_start_index + 6, int*>, - thread_exchangable_arg<main_kernel_args::args_start_index + 7, - typename kernel_params::native_type*>, - thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>, - thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>, - arg<main_kernel_args::args_start_index + 10, size_t>> { - using param_t = kernel_params; - using processing_type = typename kernel_params::native_type; - static constexpr const char* specific_name() { - return "reduce_execution"; - } +namespace ring { - //own - using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>; - using common_entry_buf_size_arg = send_buf_size_arg; - using send_buf_size_arg_type = typename send_buf_size_arg::arg_type; +namespace reduce { - using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>; - using common_entry_buf_arg = send_buf_arg; - using send_buf_arg_type = typename send_buf_arg::arg_type; +/** + * Common args for all kernel types + */ - using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>; - using recv_buf_arg_type = typename recv_buf_arg::arg_type; +using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>; +using send_buf_size_arg_type = typename send_buf_size_arg::arg_type; - using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 3, processing_type*>; - using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type; +template <class native_t> +using send_buf_arg = arg<main_kernel_args::args_start_index + 1, native_t*>; - using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 4, int*>; - using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; +template <class native_t> +using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, native_t*>; - using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 5, int*>; - using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; +template <class native_t> +using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 3, native_t*>; - using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>; - using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type; +using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 4, int*>; +using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; - //right - using right_tmp_recv_buf_arg = - thread_exchangable_arg<main_kernel_args::args_start_index + 7, processing_type*>; - using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type; +using ready_to_recv_flag_arg = 
external_arg<main_kernel_args::args_start_index + 5, int*>; +using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; - /* using right_recv_buf_arg = thread_safe_arg<main_kernel_args::args_start_index + 8, void *>; - using right_recv_buf_arg_type = typename right_recv_buf_arg::arg_type; -*/ - using right_income_data_flag_arg = - thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>; - using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type; +using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>; +using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type; + +template <class native_t> +using right_tmp_recv_buf_arg = + thread_exchangable_arg<main_kernel_args::args_start_index + 7, native_t*>; + +using right_income_data_flag_arg = + thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>; +using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type; + +using right_ready_to_recv_flag_arg = + thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>; +using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type; - using right_ready_to_recv_flag_arg = - thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>; - using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type; +using root_arg = arg<main_kernel_args::args_start_index + 10, size_t>; +using root_arg_type = typename root_arg::arg_type; - using root_arg = arg<main_kernel_args::args_start_index + 10, size_t>; - using root_arg_type = typename root_arg::arg_type; +// IMPORTANT: the number and types of arguments must be the same in all classes, +// excluding arguments specific for numa/scaleout etc. 
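[editor's note] The same common-args discipline repeats here for reduce; the one extra contract worth calling out is `bind_data()`. In the `ipc_kernel` variants added by this patch (the reduce one appears further down), IPC handles arrive in a fixed order, and position i in the container maps to one predetermined kernel argument: for reduce, handle 0 is the tmp buffer and handles 1-2 are the two synchronization flags. Below is a minimal sketch of that contract, with invented stand-ins (`fake_ipc_handle`, `toy_ipc_kernel`) in place of the real Level Zero handle wrappers.

```cpp
// Toy model of the positional bind_data() contract (illustrative only).
#include <cassert>
#include <vector>

struct fake_ipc_handle {
    void* pointer; // stands in for the pointer recovered from an L0 IPC handle
    const fake_ipc_handle& get() const {
        return *this; // mimics the ipc_handles.at(i).get() call shape above
    }
};

struct toy_ipc_kernel {
    void* tmp_recv_buf = nullptr;
    int* income_data_flag = nullptr;
    int* ready_to_recv_flag = nullptr;

    // Mirrors ring::reduce::ipc_kernel::bind_data(): the handle order is a
    // fixed protocol between the producing and consuming ranks.
    template <class ipc_handles_t>
    void bind_data(const ipc_handles_t& ipc_handles) {
        assert(ipc_handles.size() >= 3);
        tmp_recv_buf = ipc_handles.at(0).get().pointer;
        income_data_flag = reinterpret_cast<int*>(ipc_handles.at(1).get().pointer);
        ready_to_recv_flag = reinterpret_cast<int*>(ipc_handles.at(2).get().pointer);
    }
};

int main() {
    static float tmp[8] = {};
    static int flags[2] = { 0, 0 };
    std::vector<fake_ipc_handle> handles = { { tmp }, { &flags[0] }, { &flags[1] } };

    toy_ipc_kernel k;
    k.bind_data(handles);
    assert(k.income_data_flag == &flags[0]);
    assert(k.ready_to_recv_flag == &flags[1]);
    return 0;
}
```

Because the mapping is purely positional, the exporting rank and the importing rank must serialize handles in the same order; a mismatch would not fail loudly, it would simply bind the wrong pointer.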
+struct main_kernel : public execution_kernel<main_kernel, + send_buf_size_arg, + send_buf_arg<void>, + recv_buf_arg<void>, + tmp_recv_buf_arg<void>, + income_data_flag_arg, + ready_to_recv_flag_arg, + local_barrier_flag_arg, + right_tmp_recv_buf_arg<void>, + right_income_data_flag_arg, + right_ready_to_recv_flag_arg, + root_arg> { + using processing_type = void; - using base = execution_kernel<ring_reduce_kernel<kernel_params>, + static constexpr const char* specific_name() { + return "reduce_execution"; + } + + using common_entry_buf_size_arg = send_buf_size_arg; + using common_entry_buf_arg = send_buf_arg<processing_type>; + + using base = execution_kernel<main_kernel, send_buf_size_arg, - send_buf_arg, - recv_buf_arg, - tmp_recv_buf_arg, + send_buf_arg<processing_type>, + recv_buf_arg<processing_type>, + tmp_recv_buf_arg<processing_type>, income_data_flag_arg, ready_to_recv_flag_arg, local_barrier_flag_arg, - right_tmp_recv_buf_arg, + right_tmp_recv_buf_arg<processing_type>, right_income_data_flag_arg, right_ready_to_recv_flag_arg, root_arg>; + + using base::base; }; -template <class kernel_params> -struct ring_reduce_numa_kernel - : public execution_kernel< - ring_reduce_numa_kernel<kernel_params>, - arg<main_kernel_args::args_start_index, size_t>, - arg<main_kernel_args::args_start_index + 1, typename kernel_params::native_type*>, - arg<main_kernel_args::args_start_index + 2, typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 3, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 4, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 5, int*>, - arg<main_kernel_args::args_start_index + 6, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 7, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 8, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 9, int*>, - arg<main_kernel_args::args_start_index + 10, size_t>, - thread_safe_arg<main_kernel_args::args_start_index + 11, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 12, int*>> { - using param_t = kernel_params; - using processing_type = typename kernel_params::native_type; +struct numa_kernel + : public execution_kernel<numa_kernel, + send_buf_size_arg, + send_buf_arg<void>, + recv_buf_arg<void>, + tmp_recv_buf_arg<void>, + income_data_flag_arg, + ready_to_recv_flag_arg, + local_barrier_flag_arg, + right_tmp_recv_buf_arg<void>, + right_income_data_flag_arg, + right_ready_to_recv_flag_arg, + root_arg, + + // numa-specific args + permanent_arg<main_kernel_args::args_start_index + 11, void*>, + permanent_arg<main_kernel_args::args_start_index + 12, int*>> { + using processing_type = void; static constexpr const char* specific_name() { return "reduce_execution_numa"; } - //own - using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>; - using send_buf_size_arg_type = typename send_buf_size_arg::arg_type; - - using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>; - using send_buf_arg_type = typename send_buf_arg::arg_type; - - using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>; - using recv_buf_arg_type = typename recv_buf_arg::arg_type; - - using tmp_recv_buf_arg = - thread_safe_arg<main_kernel_args::args_start_index + 3, processing_type*>; - using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type; - - using income_data_flag_arg = 
thread_safe_arg<main_kernel_args::args_start_index + 4, int*>; - using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; - - using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, int*>; - using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; - - using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>; - using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type; - - //right - using right_tmp_recv_buf_arg = - thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>; - using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type; - - /* using right_recv_buf_arg = thread_safe_arg<main_kernel_args::args_start_index + 8, void *>; - using right_recv_buf_arg_type = typename right_recv_buf_arg::arg_type; -*/ - using right_income_data_flag_arg = - thread_safe_arg<main_kernel_args::args_start_index + 8, int*>; - using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type; - - using right_ready_to_recv_flag_arg = - thread_safe_arg<main_kernel_args::args_start_index + 9, int*>; - using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type; - - using root_arg = arg<main_kernel_args::args_start_index + 10, size_t>; - using root_arg_type = typename root_arg::arg_type; + using common_entry_buf_arg = send_buf_arg<processing_type>; // event data - using event_prod_chunk_mem_arg = thread_safe_arg<main_kernel_args::args_start_index + 10, - typename kernel_params::native_type*>; + using event_prod_chunk_mem_arg = permanent_arg<main_kernel_args::args_start_index + 11, void*>; using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type; - using event_prod_bytes_arg = thread_safe_arg<main_kernel_args::args_start_index + 11, int*>; + using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 12, int*>; using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type; - using base = execution_kernel<ring_reduce_numa_kernel<kernel_params>, + using base = execution_kernel<numa_kernel, send_buf_size_arg, - send_buf_arg, - recv_buf_arg, - tmp_recv_buf_arg, + send_buf_arg<processing_type>, + recv_buf_arg<processing_type>, + tmp_recv_buf_arg<processing_type>, income_data_flag_arg, ready_to_recv_flag_arg, local_barrier_flag_arg, - right_tmp_recv_buf_arg, + right_tmp_recv_buf_arg<processing_type>, right_income_data_flag_arg, right_ready_to_recv_flag_arg, root_arg, event_prod_chunk_mem_arg, event_prod_bytes_arg>; + + template <class ctx_params_t> + void bind_data(const ctx_params_t& out_ctx_params) { + // TODO not implemented + (void)out_ctx_params; + throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type"); + } + + using base::base; }; -template <class kernel_params> -struct ring_reduce_ipc - : public ipc_kernel<ring_reduce_ipc<kernel_params>, - stub_arg<main_kernel_args::args_start_index>, - stub_arg<main_kernel_args::args_start_index + 1>, - stub_arg<main_kernel_args::args_start_index + 2>, - thread_safe_arg<main_kernel_args::args_start_index + 3, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 4, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 5, int*>, - stub_arg<main_kernel_args::args_start_index + 6>, - stub_arg<main_kernel_args::args_start_index + 7>, - stub_arg<main_kernel_args::args_start_index + 8>, - stub_arg<main_kernel_args::args_start_index + 9>> { - using 
param_t = kernel_params; - using processing_type = typename kernel_params::native_type; +struct ipc_kernel : public base_ipc_kernel<ipc_kernel, + stub_arg<main_kernel_args::args_start_index>, + stub_arg<main_kernel_args::args_start_index + 1>, + stub_arg<main_kernel_args::args_start_index + 2>, + tmp_recv_buf_arg<void>, + income_data_flag_arg, + ready_to_recv_flag_arg, + stub_arg<main_kernel_args::args_start_index + 6>, + stub_arg<main_kernel_args::args_start_index + 7>, + stub_arg<main_kernel_args::args_start_index + 8>, + stub_arg<main_kernel_args::args_start_index + 9>, + stub_arg<main_kernel_args::args_start_index + 10>> { + using processing_type = void; static constexpr const char* specific_name() { return "ring_reduce_ipc"; } - using tmp_recv_buf_arg = typename ring_reduce_kernel<kernel_params>::tmp_recv_buf_arg; - using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type; - - using income_data_flag_arg = typename ring_reduce_kernel<kernel_params>::income_data_flag_arg; - using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; + using common_entry_buf_arg = send_buf_arg<processing_type>; + + using base = base_ipc_kernel<ipc_kernel, + stub_arg<main_kernel_args::args_start_index>, + stub_arg<main_kernel_args::args_start_index + 1>, + stub_arg<main_kernel_args::args_start_index + 2>, + tmp_recv_buf_arg<processing_type>, + income_data_flag_arg, + ready_to_recv_flag_arg, + stub_arg<main_kernel_args::args_start_index + 6>, + stub_arg<main_kernel_args::args_start_index + 7>, + stub_arg<main_kernel_args::args_start_index + 8>, + stub_arg<main_kernel_args::args_start_index + 9>, + stub_arg<main_kernel_args::args_start_index + 10>>; + + template <class ipc_handles_t> + void bind_data(const ipc_handles_t& ipc_handles) { + auto tmp_recv_buf = reinterpret_cast<typename tmp_recv_buf_arg<processing_type>::arg_type>( + ipc_handles.at(0).get().pointer); + this->template set_arg<tmp_recv_buf_arg<processing_type>>(tmp_recv_buf); + + auto income_data_flag = + reinterpret_cast<income_data_flag_arg_type>(ipc_handles.at(1).get().pointer); + this->template set_arg<income_data_flag_arg>(income_data_flag); + + auto ready_to_recv_flag = + reinterpret_cast<ready_to_recv_flag_arg_type>(ipc_handles.at(2).get().pointer); + this->template set_arg<ready_to_recv_flag_arg>(ready_to_recv_flag); + } - using ready_to_recv_flag_arg = - typename ring_reduce_kernel<kernel_params>::ready_to_recv_flag_arg; - using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; + using base::base; +}; - using base = execution_kernel<ring_reduce_ipc<kernel_params>, - stub_arg<main_kernel_args::args_start_index>, - stub_arg<main_kernel_args::args_start_index + 1>, - stub_arg<main_kernel_args::args_start_index + 2>, - tmp_recv_buf_arg, +struct scale_out_cpu_gw_kernel + : public execution_kernel<scale_out_cpu_gw_kernel, + send_buf_size_arg, + send_buf_arg<void>, + recv_buf_arg<void>, + tmp_recv_buf_arg<void>, income_data_flag_arg, ready_to_recv_flag_arg, - stub_arg<main_kernel_args::args_start_index + 6>, - stub_arg<main_kernel_args::args_start_index + 7>, - stub_arg<main_kernel_args::args_start_index + 8>, - stub_arg<main_kernel_args::args_start_index + 9>>; -}; + local_barrier_flag_arg, + right_tmp_recv_buf_arg<void>, + right_income_data_flag_arg, + right_ready_to_recv_flag_arg, + root_arg, -template <class kernel_params> -struct ring_reduce_scale_out_cpu_gw_kernel - : public execution_kernel< - ring_reduce_scale_out_cpu_gw_kernel<kernel_params>, - arg<main_kernel_args::args_start_index, 
size_t>, - arg<main_kernel_args::args_start_index + 1, typename kernel_params::native_type*>, - arg<main_kernel_args::args_start_index + 2, typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 3, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 4, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 5, int*>, - arg<main_kernel_args::args_start_index + 6, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 7, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 8, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 9, int*>, - arg<main_kernel_args::args_start_index + 10, size_t>, - thread_safe_arg<main_kernel_args::args_start_index + 11, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 12, int*>> { - using param_t = kernel_params; - using processing_type = typename param_t::native_type; + // scaleout-specific args + permanent_arg<main_kernel_args::args_start_index + 11, void*>, + permanent_arg<main_kernel_args::args_start_index + 12, int*>> { + using processing_type = void; static constexpr const char* specific_name() { return "reduce_execution_scale_out_cpu_gw"; } - //own - using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>; - using send_buf_size_arg_type = typename send_buf_size_arg::arg_type; - - using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>; - using send_buf_arg_type = typename send_buf_arg::arg_type; - - using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>; - using recv_buf_arg_type = typename recv_buf_arg::arg_type; - - using tmp_recv_buf_arg = - thread_safe_arg<main_kernel_args::args_start_index + 3, processing_type*>; - using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type; - - using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 4, int*>; - using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; - - using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, int*>; - using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; - - using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>; - using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type; - - //right - using right_tmp_recv_buf_arg = - thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>; - using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type; - - /* using right_recv_buf_arg = thread_safe_arg<main_kernel_args::args_start_index + 8, void *>; - using right_recv_buf_arg_type = typename right_recv_buf_arg::arg_type; -*/ - using right_income_data_flag_arg = - thread_safe_arg<main_kernel_args::args_start_index + 8, int*>; - using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type; - - using right_ready_to_recv_flag_arg = - thread_safe_arg<main_kernel_args::args_start_index + 9, int*>; - using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type; - - using root_arg = arg<main_kernel_args::args_start_index + 10, size_t>; - using root_arg_type = typename root_arg::arg_type; + using common_entry_buf_arg = send_buf_arg<processing_type>; // event data using event_prod_chunk_mem_arg = - thread_safe_arg<main_kernel_args::args_start_index + 10, processing_type*>; +
permanent_arg<main_kernel_args::args_start_index + 11, processing_type*>; using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type; - using event_prod_bytes_arg = thread_safe_arg<main_kernel_args::args_start_index + 11, int*>; + using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 12, int*>; using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type; - using base = execution_kernel<ring_reduce_scale_out_cpu_gw_kernel<kernel_params>, + using base = execution_kernel<scale_out_cpu_gw_kernel, send_buf_size_arg, - send_buf_arg, - recv_buf_arg, - tmp_recv_buf_arg, + send_buf_arg<processing_type>, + recv_buf_arg<processing_type>, + tmp_recv_buf_arg<processing_type>, income_data_flag_arg, ready_to_recv_flag_arg, local_barrier_flag_arg, - right_tmp_recv_buf_arg, + right_tmp_recv_buf_arg<processing_type>, right_income_data_flag_arg, right_ready_to_recv_flag_arg, root_arg, event_prod_chunk_mem_arg, event_prod_bytes_arg>; + + template <class ctx_params_t> + void bind_data(const ctx_params_t& out_ctx_params) { + // TODO not implemented + (void)out_ctx_params; + throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type"); + } + + using base::base; }; +} // namespace reduce +} // namespace ring } // namespace native diff --git a/src/common/comm/l0/modules/ring/reduce_scatter_entry_module.hpp b/src/common/comm/l0/modules/ring/reduce_scatter_entry_module.hpp index 52e98a915..44ae2a55a 100644 --- a/src/common/comm/l0/modules/ring/reduce_scatter_entry_module.hpp +++ b/src/common/comm/l0/modules/ring/reduce_scatter_entry_module.hpp @@ -23,21 +23,21 @@ DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module, real_gpu_typed_module, ccl_coll_reduce_scatter, ccl::device_topology_type::ring, - ring_reduce_scatter_kernel, - ring_reduce_scatter_numa_kernel, - ring_reduce_scatter_scale_out_cpu_gw_kernel); + ring::reduce_scatter::main_kernel, + ring::reduce_scatter::numa_kernel, + ring::reduce_scatter::scale_out_cpu_gw_kernel); DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module, ipc_gpu_typed_module, ccl_coll_reduce_scatter, ccl::device_topology_type::ring, - ring_reduce_scatter_ipc, - ring_reduce_scatter_ipc, - ring_reduce_scatter_ipc); + ring::reduce_scatter::ipc_kernel, + ring::reduce_scatter::ipc_kernel, + ring::reduce_scatter::ipc_kernel); DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_reduce_scatter, ccl::device_topology_type::ring, - ring_reduce_scatter_kernel, - ring_reduce_scatter_numa_kernel, - ring_reduce_scatter_scale_out_cpu_gw_kernel); + ring::reduce_scatter::main_kernel, + ring::reduce_scatter::numa_kernel, + ring::reduce_scatter::scale_out_cpu_gw_kernel); } // namespace native diff --git a/src/common/comm/l0/modules/ring/reduce_scatter_export_functions.hpp b/src/common/comm/l0/modules/ring/reduce_scatter_export_functions.hpp index 5b2561d1c..f1f3789ff 100644 --- a/src/common/comm/l0/modules/ring/reduce_scatter_export_functions.hpp +++ b/src/common/comm/l0/modules/ring/reduce_scatter_export_functions.hpp @@ -17,312 +17,262 @@ #include "common/comm/l0/modules/kernel_functions.hpp" namespace native { -template <class kernel_params> -struct ring_reduce_scatter_kernel - : public execution_kernel< - ring_reduce_scatter_kernel<kernel_params>, - arg<main_kernel_args::args_start_index, size_t>, // recv_count - arg<main_kernel_args::args_start_index + 1, - typename kernel_params::native_type*>, // send_buf - arg<main_kernel_args::args_start_index + 2, - typename kernel_params::native_type*>, // recv_buf - 
external_arg<main_kernel_args::args_start_index + 3, - typename kernel_params::native_type*>, // tmp_buf - external_arg<main_kernel_args::args_start_index + 4, int*>, // left_wrote_to_me_flag - external_arg<main_kernel_args::args_start_index + 5, int*>, // i_ready_to_receive_flag - arg<main_kernel_args::args_start_index + 6, int*>, // local_barrier_flag - thread_exchangable_arg<main_kernel_args::args_start_index + 7, - typename kernel_params::native_type*>, // right_output_buffer - thread_exchangable_arg<main_kernel_args::args_start_index + 8, - typename kernel_params::native_type*>, // right_temp_buffer - thread_exchangable_arg<main_kernel_args::args_start_index + 9, - int*>, // i_send_to_right_flag - thread_exchangable_arg<main_kernel_args::args_start_index + 10, - int*>> { // right_ready_to_recv_flag - using param_t = kernel_params; - using processing_type = typename kernel_params::native_type; - static constexpr const char* specific_name() { - return "reduce_scatter_execution"; - } +namespace ring { - //own - using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>; - using common_entry_buf_size_arg = send_buf_size_arg; - using send_buf_size_arg_type = typename send_buf_size_arg::arg_type; +namespace reduce_scatter { + +/** + * Common args for all kernel types + */ + +using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>; - using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>; - using common_entry_buf_arg = send_buf_arg; - using send_buf_arg_type = typename send_buf_arg::arg_type; +// TODO: since we use only a single type, remove template parameter here +template <class native_t> +using send_buf_arg = arg<main_kernel_args::args_start_index + 1, native_t*>; - using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>; - using recv_buf_arg_type = typename recv_buf_arg::arg_type; +template <class native_t> +using recv_buf_arg = external_arg<main_kernel_args::args_start_index + 2, native_t*>; - using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 3, processing_type*>; - using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type; +template <class native_t> +using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 3, native_t*>; - using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 4, int*>; - using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; +using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 4, int*>; +using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; - using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 5, int*>; - using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; +using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 5, int*>; +using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; - using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>; - using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type; +using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>; +using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type; - using right_output_buf_arg = - thread_exchangable_arg<main_kernel_args::args_start_index + 7, processing_type*>; - using right_output_buf_arg_type = typename right_output_buf_arg::arg_type; +template <class native_t> +using 
right_output_buf_arg = + thread_exchangable_arg<main_kernel_args::args_start_index + 7, native_t*>; - //right - using right_tmp_recv_buf_arg = - thread_exchangable_arg<main_kernel_args::args_start_index + 8, processing_type*>; - using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type; +template <class native_t> +using right_tmp_recv_buf_arg = + thread_exchangable_arg<main_kernel_args::args_start_index + 8, native_t*>; - /* using right_recv_buf_arg = thread_safe_arg<main_kernel_args::args_start_index + 8, void *>; - using right_recv_buf_arg_type = typename right_recv_buf_arg::arg_type;*/ +using right_income_data_flag_arg = + thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>; - using right_income_data_flag_arg = - thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>; - using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type; +using right_ready_to_recv_flag_arg = + thread_exchangable_arg<main_kernel_args::args_start_index + 10, int*>; - using right_ready_to_recv_flag_arg = - thread_exchangable_arg<main_kernel_args::args_start_index + 10, int*>; - using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type; +// IMPORTANT: the number and types of arguments must be the same in all classes, +// excluding arguments specific for numa/scaleout etc. +struct main_kernel + : public execution_kernel<main_kernel, + send_buf_size_arg, // recv_count + send_buf_arg<void>, // send_buf + recv_buf_arg<void>, // recv_buf (output_buffer) + tmp_recv_buf_arg<void>, // tmp_buf + income_data_flag_arg, // left_wrote_to_me_flag + ready_to_recv_flag_arg, // i_ready_to_receive_flag + local_barrier_flag_arg, // local_barrier_flag + right_output_buf_arg<void>, // right_output_buffer + right_tmp_recv_buf_arg<void>, // right_temp_buffer + right_income_data_flag_arg, // i_send_to_right_flag + right_ready_to_recv_flag_arg> { // right_ready_to_recv_flag + using processing_type = void; + + static constexpr const char* specific_name() { + return "reduce_scatter_execution"; + } + + using common_entry_buf_size_arg = send_buf_size_arg; + using common_entry_buf_arg = send_buf_arg<processing_type>; - using base = execution_kernel<ring_reduce_scatter_kernel<kernel_params>, + using base = execution_kernel<main_kernel, send_buf_size_arg, - send_buf_arg, - recv_buf_arg, - tmp_recv_buf_arg, + send_buf_arg<processing_type>, + recv_buf_arg<processing_type>, + tmp_recv_buf_arg<processing_type>, income_data_flag_arg, ready_to_recv_flag_arg, local_barrier_flag_arg, - right_output_buf_arg, - right_tmp_recv_buf_arg, + right_output_buf_arg<processing_type>, + right_tmp_recv_buf_arg<processing_type>, right_income_data_flag_arg, right_ready_to_recv_flag_arg>; + + using base::base; }; -template <class kernel_params> -struct ring_reduce_scatter_numa_kernel - : public execution_kernel< - ring_reduce_scatter_numa_kernel<kernel_params>, - arg<main_kernel_args::args_start_index, size_t>, - arg<main_kernel_args::args_start_index + 1, typename kernel_params::native_type*>, - arg<main_kernel_args::args_start_index + 2, typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 3, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 4, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 5, int*>, - arg<main_kernel_args::args_start_index + 6, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 7, - typename kernel_params::native_type*>, - 
thread_safe_arg<main_kernel_args::args_start_index + 8, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 9, int*>, - arg<main_kernel_args::args_start_index + 10, size_t>, - thread_safe_arg<main_kernel_args::args_start_index + 11, - typename kernel_params::native_type*>> { - using param_t = kernel_params; - using processing_type = typename kernel_params::native_type; +struct numa_kernel + : public execution_kernel<numa_kernel, + send_buf_size_arg, // recv_count + send_buf_arg<void>, // send_buf + recv_buf_arg<void>, // recv_buf (output_buffer) + tmp_recv_buf_arg<void>, // tmp_buf + income_data_flag_arg, // left_wrote_to_me_flag + ready_to_recv_flag_arg, // i_ready_to_receive_flag + local_barrier_flag_arg, // local_barrier_flag + right_output_buf_arg<void>, // right_output_buffer + right_tmp_recv_buf_arg<void>, // right_temp_buffer + right_income_data_flag_arg, // i_send_to_right_flag + right_ready_to_recv_flag_arg, // right_ready_to_recv_flag + + // numa-specific args + permanent_arg<main_kernel_args::args_start_index + 11, size_t>, + permanent_arg<main_kernel_args::args_start_index + 12, void*>> { + using processing_type = void; static constexpr const char* specific_name() { return "reduce_scatter_execution_numa"; } - //own - using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>; - using send_buf_size_arg_type = typename send_buf_size_arg::arg_type; - - using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>; - using send_buf_arg_type = typename send_buf_arg::arg_type; - - using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>; - using recv_buf_arg_type = typename recv_buf_arg::arg_type; - - using tmp_recv_buf_arg = - thread_safe_arg<main_kernel_args::args_start_index + 3, processing_type*>; - using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type; - - using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 4, int*>; - using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; - - using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, int*>; - using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; - - using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>; - using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type; - - //right - using right_tmp_recv_buf_arg = - thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>; - using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type; - - /* using right_recv_buf_arg = thread_safe_arg<main_kernel_args::args_start_index + 8, void *>; - using right_recv_buf_arg_type = typename right_recv_buf_arg::arg_type; -*/ - using right_income_data_flag_arg = - thread_safe_arg<main_kernel_args::args_start_index + 8, int*>; - using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type; - - using right_ready_to_recv_flag_arg = - thread_safe_arg<main_kernel_args::args_start_index + 9, int*>; - using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type; + using common_entry_buf_size_arg = send_buf_size_arg; + using common_entry_buf_arg = send_buf_arg<processing_type>; // event data - using event_prod_chunk_mem_arg = thread_safe_arg<main_kernel_args::args_start_index + 10, - typename kernel_params::native_type*>; + using event_prod_chunk_mem_arg = permanent_arg<main_kernel_args::args_start_index + 11, size_t>; using 
event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type; - using event_prod_bytes_arg = thread_safe_arg<main_kernel_args::args_start_index + 11, int*>; + using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 12, void*>; using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type; - using base = execution_kernel<ring_reduce_scatter_numa_kernel<kernel_params>, + using base = execution_kernel<numa_kernel, send_buf_size_arg, - send_buf_arg, - recv_buf_arg, - tmp_recv_buf_arg, + send_buf_arg<processing_type>, + recv_buf_arg<processing_type>, + tmp_recv_buf_arg<processing_type>, income_data_flag_arg, ready_to_recv_flag_arg, local_barrier_flag_arg, - right_tmp_recv_buf_arg, + right_output_buf_arg<processing_type>, + right_tmp_recv_buf_arg<processing_type>, right_income_data_flag_arg, right_ready_to_recv_flag_arg, event_prod_chunk_mem_arg, event_prod_bytes_arg>; + + template <class ctx_params_t> + void bind_data(const ctx_params_t& out_ctx_params) { + // TODO not implemented + (void)out_ctx_params; + throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type"); + } + + using base::base; }; -template <class kernel_params> -struct ring_reduce_scatter_ipc - : public ipc_kernel<ring_reduce_scatter_ipc<kernel_params>, - stub_arg<main_kernel_args::args_start_index>, - stub_arg<main_kernel_args::args_start_index + 1>, - stub_arg<main_kernel_args::args_start_index + 2>, - thread_safe_arg<main_kernel_args::args_start_index + 3, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 4, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 5, int*>, - stub_arg<main_kernel_args::args_start_index + 6>, - stub_arg<main_kernel_args::args_start_index + 7>, - stub_arg<main_kernel_args::args_start_index + 8>, - stub_arg<main_kernel_args::args_start_index + 9>> { - using param_t = kernel_params; - using processing_type = typename kernel_params::native_type; +struct ipc_kernel : public base_ipc_kernel<ipc_kernel, + stub_arg<main_kernel_args::args_start_index>, + stub_arg<main_kernel_args::args_start_index + 1>, + recv_buf_arg<void>, // recv_buf (output_buffer) + tmp_recv_buf_arg<void>, // tmp_buf + income_data_flag_arg, // left_wrote_to_me_flag + ready_to_recv_flag_arg, + stub_arg<main_kernel_args::args_start_index + 6>, + stub_arg<main_kernel_args::args_start_index + 7>, + stub_arg<main_kernel_args::args_start_index + 8>, + stub_arg<main_kernel_args::args_start_index + 9>, + stub_arg<main_kernel_args::args_start_index + 10>> { + using processing_type = void; static constexpr const char* specific_name() { return "ring_reduce_scatter_ipc"; } - using tmp_recv_buf_arg = typename ring_reduce_scatter_kernel<kernel_params>::tmp_recv_buf_arg; - using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type; - - using income_data_flag_arg = - typename ring_reduce_scatter_kernel<kernel_params>::income_data_flag_arg; - using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; - - using ready_to_recv_flag_arg = - typename ring_reduce_scatter_kernel<kernel_params>::ready_to_recv_flag_arg; - using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; + using common_entry_buf_size_arg = send_buf_size_arg; + using common_entry_buf_arg = send_buf_arg<processing_type>; + + using base = base_ipc_kernel<ipc_kernel, + stub_arg<main_kernel_args::args_start_index>, + stub_arg<main_kernel_args::args_start_index + 1>, + recv_buf_arg<processing_type>, + 
tmp_recv_buf_arg<processing_type>, + income_data_flag_arg, + ready_to_recv_flag_arg, + stub_arg<main_kernel_args::args_start_index + 6>, + stub_arg<main_kernel_args::args_start_index + 7>, + stub_arg<main_kernel_args::args_start_index + 8>, + stub_arg<main_kernel_args::args_start_index + 9>, + stub_arg<main_kernel_args::args_start_index + 10>>; + + template <class ipc_handles_t> + void bind_data(const ipc_handles_t& ipc_handles) { + auto recv_buf = reinterpret_cast<typename recv_buf_arg<processing_type>::arg_type>( + ipc_handles.at(0).get().pointer); + this->template set_arg<recv_buf_arg<processing_type>>(recv_buf); + + auto tmp_recv_buf = reinterpret_cast<typename tmp_recv_buf_arg<processing_type>::arg_type>( + ipc_handles.at(1).get().pointer); + this->template set_arg<tmp_recv_buf_arg<processing_type>>(tmp_recv_buf); + + auto income_data_flag = + reinterpret_cast<income_data_flag_arg_type>(ipc_handles.at(2).get().pointer); + this->template set_arg<income_data_flag_arg>(income_data_flag); + + auto ready_to_recv_flag = + reinterpret_cast<ready_to_recv_flag_arg_type>(ipc_handles.at(3).get().pointer); + this->template set_arg<ready_to_recv_flag_arg>(ready_to_recv_flag); + } - using base = execution_kernel<ring_reduce_scatter_ipc<kernel_params>, - stub_arg<main_kernel_args::args_start_index>, - stub_arg<main_kernel_args::args_start_index + 1>, - stub_arg<main_kernel_args::args_start_index + 2>, - tmp_recv_buf_arg, - income_data_flag_arg, - ready_to_recv_flag_arg, - stub_arg<main_kernel_args::args_start_index + 6>, - stub_arg<main_kernel_args::args_start_index + 7>, - stub_arg<main_kernel_args::args_start_index + 8>, - stub_arg<main_kernel_args::args_start_index + 9>>; + using base::base; }; -template <class kernel_params> -struct ring_reduce_scatter_scale_out_cpu_gw_kernel - : public execution_kernel< - ring_reduce_scatter_scale_out_cpu_gw_kernel<typename kernel_params::native_type>, - arg<main_kernel_args::args_start_index, size_t>, - arg<main_kernel_args::args_start_index + 1, typename kernel_params::native_type*>, - arg<main_kernel_args::args_start_index + 2, typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 3, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 4, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 5, int*>, - arg<main_kernel_args::args_start_index + 6, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 7, - typename kernel_params::native_type*>, - thread_safe_arg<main_kernel_args::args_start_index + 8, int*>, - thread_safe_arg<main_kernel_args::args_start_index + 9, int*>, - arg<main_kernel_args::args_start_index + 10, size_t>, - thread_safe_arg<main_kernel_args::args_start_index + 11, - typename kernel_params::native_type*>> { - using param_t = kernel_params; - using processing_type = typename param_t::native_type; +struct scale_out_cpu_gw_kernel + : public execution_kernel<scale_out_cpu_gw_kernel, + send_buf_size_arg, // recv_count + send_buf_arg<void>, // send_buf + recv_buf_arg<void>, // recv_buf (output_buffer) + tmp_recv_buf_arg<void>, // tmp_buf + income_data_flag_arg, // left_wrote_to_me_flag + ready_to_recv_flag_arg, // i_ready_to_receive_flag + local_barrier_flag_arg, // local_barrier_flag + right_output_buf_arg<void>, // right_output_buffer + right_tmp_recv_buf_arg<void>, // right_temp_buffer + right_income_data_flag_arg, // i_send_to_right_flag + right_ready_to_recv_flag_arg, // right_ready_to_recv_flag + + // scaleout-specific args + 
permanent_arg<main_kernel_args::args_start_index + 11, size_t>, + permanent_arg<main_kernel_args::args_start_index + 12, void*>> { + using processing_type = void; static constexpr const char* specific_name() { return "reduce_scatter_execution_scale_out_cpu_gw"; } - //own - using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>; - using send_buf_size_arg_type = typename send_buf_size_arg::arg_type; - - using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>; - using send_buf_arg_type = typename send_buf_arg::arg_type; - - using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>; - using recv_buf_arg_type = typename recv_buf_arg::arg_type; - - using tmp_recv_buf_arg = - thread_safe_arg<main_kernel_args::args_start_index + 3, processing_type*>; - using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type; - - using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 4, int*>; - using income_data_flag_arg_type = typename income_data_flag_arg::arg_type; - - using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, int*>; - using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type; - - using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>; - using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type; - - //right - using right_tmp_recv_buf_arg = - thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>; - using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type; - - /* using right_recv_buf_arg = thread_safe_arg<main_kernel_args::args_start_index + 8, void *>; - using right_recv_buf_arg_type = typename right_recv_buf_arg::arg_type; -*/ - using right_income_data_flag_arg = - thread_safe_arg<main_kernel_args::args_start_index + 8, int*>; - using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type; - - using right_ready_to_recv_flag_arg = - thread_safe_arg<main_kernel_args::args_start_index + 9, int*>; - using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type; + using common_entry_buf_size_arg = send_buf_size_arg; + using common_entry_buf_arg = send_buf_arg<processing_type>; // event data - using event_prod_chunk_mem_arg = - thread_safe_arg<main_kernel_args::args_start_index + 10, processing_type*>; + using event_prod_chunk_mem_arg = permanent_arg<main_kernel_args::args_start_index + 11, size_t>; using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type; - using event_prod_bytes_arg = thread_safe_arg<main_kernel_args::args_start_index + 11, int*>; + using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 12, void*>; using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type; - using base = execution_kernel<ring_reduce_scatter_scale_out_cpu_gw_kernel<kernel_params>, + using base = execution_kernel<scale_out_cpu_gw_kernel, send_buf_size_arg, - send_buf_arg, - recv_buf_arg, - tmp_recv_buf_arg, + send_buf_arg<processing_type>, + recv_buf_arg<processing_type>, + tmp_recv_buf_arg<processing_type>, income_data_flag_arg, ready_to_recv_flag_arg, local_barrier_flag_arg, - right_tmp_recv_buf_arg, + right_output_buf_arg<processing_type>, + right_tmp_recv_buf_arg<processing_type>, right_income_data_flag_arg, right_ready_to_recv_flag_arg, event_prod_chunk_mem_arg, event_prod_bytes_arg>; + + template <class ctx_params_t> + void bind_data(const ctx_params_t& 
out_ctx_params) {
+        // TODO not implemented
+        (void)out_ctx_params;
+        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
+    }
+
+    using base::base;
 };
+} // namespace reduce_scatter
+} // namespace ring
 } // namespace native
diff --git a/src/common/comm/l0/topology/ring/ring_construction_utils.hpp b/src/common/comm/l0/topology/ring/ring_construction_utils.hpp
index 4373120cd..612a62306 100644
--- a/src/common/comm/l0/topology/ring/ring_construction_utils.hpp
+++ b/src/common/comm/l0/topology/ring/ring_construction_utils.hpp
@@ -30,10 +30,10 @@
 #include "common/comm/l0/context/device_storage.hpp"
 /*REFACTORING*/
-#include "common/comm/l0/context/scaling_ctx/numa_ctx_impl.hpp"
-#include "common/comm/l0/context/scaling_ctx/scale_up_ctx_impl.hpp"
-#include "common/comm/l0/context/scaling_ctx/scale_out_ctx_impl.hpp"
-#include "common/comm/l0/context/scaling_ctx/ipc_ctx_impl.hpp"
+#include "common/comm/l0/context/scale/numa/numa_ctx_impl.hpp"
+#include "common/comm/l0/context/scale/scale_up/scale_up_ctx_impl.hpp"
+#include "common/comm/l0/context/scale/scale_out/scale_out_ctx_impl.hpp"
+#include "common/comm/l0/context/scale/ipc/ipc_ctx_impl.hpp"
 /*REFACTORING*/
 namespace native {
diff --git a/src/common/comm/single_device_communicator/single_device_communicator.cpp b/src/common/comm/single_device_communicator/single_device_communicator.cpp
index 69c32ceb0..a24f4a803 100644
--- a/src/common/comm/single_device_communicator/single_device_communicator.cpp
+++ b/src/common/comm/single_device_communicator/single_device_communicator.cpp
@@ -80,7 +80,7 @@ ccl::event single_device_communicator::barrier(const ccl::stream::impl_value_t&
                                                const ccl::vector_class<ccl::event>& deps) {
     // TODO what exactly we need to do with 'attr' here?
-    ccl_barrier_impl(comm_impl.get(), op_stream.get());
+    ccl_barrier_impl(comm_impl.get(), op_stream.get(), deps);
     // TODO what exactly we need to return here? 
ccl_barrier_impl() is void func return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(nullptr)); @@ -96,23 +96,6 @@ ccl::event single_device_communicator::allgatherv_base_impl( const ccl::stream::impl_value_t& stream, const ccl_coll_attr& attr, const ccl::vector_class<ccl::event>& deps) { - using namespace ::native::detail; - - std::vector<void*> bufs = { (void*)send_buf, recv_buf }; - auto mode = check_assoc_device_memory(bufs, get_device(), get_context()); - - const ccl_stream* stream_handle = nullptr; - - if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) { - } - else if (mode == usm_support_mode::need_conversion) -#ifdef CCL_ENABLE_SYCL - stream_handle = stream.get(); -#else - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + - " - USM convertation is not supported for such configuration"); -#endif - return ccl::event(std::unique_ptr<ccl::event_impl>( new ccl::host_event_impl(ccl_allgatherv_impl(send_buf, send_count, @@ -121,7 +104,8 @@ ccl::event single_device_communicator::allgatherv_base_impl( dtype, attr, comm_impl.get(), - stream_handle)))); + stream.get(), + deps)))); } ccl::event single_device_communicator::allgatherv_impl(const void* send_buf, @@ -166,25 +150,8 @@ ccl::event single_device_communicator::allreduce_impl(const void* send_buf, const ccl::stream::impl_value_t& stream, const ccl::allreduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { - using namespace ::native::detail; - - std::vector<void*> bufs = { (void*)send_buf, recv_buf }; - auto mode = check_assoc_device_memory(bufs, get_device(), get_context()); - - const ccl_stream* stream_handle = nullptr; - - if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) { - } - else if (mode == usm_support_mode::need_conversion) -#ifdef CCL_ENABLE_SYCL - stream_handle = stream.get(); -#else - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + - " - USM convertation is not supported for such configuration"); -#endif - return ccl::event(std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(ccl_allreduce_impl( - send_buf, recv_buf, count, dtype, reduction, attr, comm_impl.get(), stream_handle)))); + send_buf, recv_buf, count, dtype, reduction, attr, comm_impl.get(), stream.get(), deps)))); } /* alltoall */ @@ -195,25 +162,8 @@ ccl::event single_device_communicator::alltoall_impl(const void* send_buf, const ccl::stream::impl_value_t& stream, const ccl::alltoall_attr& attr, const ccl::vector_class<ccl::event>& deps) { - using namespace ::native::detail; - - std::vector<void*> bufs = { (void*)send_buf, recv_buf }; - auto mode = check_assoc_device_memory(bufs, get_device(), get_context()); - - const ccl_stream* stream_handle = nullptr; - - if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) { - } - else if (mode == usm_support_mode::need_conversion) -#ifdef CCL_ENABLE_SYCL - stream_handle = stream.get(); -#else - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + - " - USM convertation is not supported for such configuration"); -#endif - return ccl::event(std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(ccl_alltoall_impl( - send_buf, recv_buf, count, dtype, attr, comm_impl.get(), stream_handle)))); + send_buf, recv_buf, count, dtype, attr, comm_impl.get(), stream.get(), deps)))); } ccl::event single_device_communicator::alltoall_impl(const ccl::vector_class<void*>& send_buf, @@ -236,23 +186,6 @@ ccl::event single_device_communicator::alltoallv_impl(const void* send_buf, const ccl::stream::impl_value_t& stream, 
const ccl::alltoallv_attr& attr, const ccl::vector_class<ccl::event>& deps) { - using namespace ::native::detail; - - std::vector<void*> bufs = { (void*)send_buf, recv_buf }; - auto mode = check_assoc_device_memory(bufs, get_device(), get_context()); - - const ccl_stream* stream_handle = nullptr; - - if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) { - } - else if (mode == usm_support_mode::need_conversion) -#ifdef CCL_ENABLE_SYCL - stream_handle = stream.get(); -#else - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + - " - USM convertation is not supported for such configuration"); -#endif - return ccl::event(std::unique_ptr<ccl::event_impl>( new ccl::host_event_impl(ccl_alltoallv_impl(send_buf, send_counts.data(), @@ -261,7 +194,8 @@ ccl::event single_device_communicator::alltoallv_impl(const void* send_buf, dtype, attr, comm_impl.get(), - stream_handle)))); + stream.get(), + deps)))); } ccl::event single_device_communicator::alltoallv_impl(const ccl::vector_class<void*>& send_buf, const ccl::vector_class<size_t>& send_counts, @@ -283,25 +217,8 @@ ccl::event single_device_communicator::broadcast_impl(void* buf, const ccl::stream::impl_value_t& stream, const ccl::broadcast_attr& attr, const ccl::vector_class<ccl::event>& deps) { - using namespace ::native::detail; - - std::vector<void*> bufs = { buf }; - auto mode = check_assoc_device_memory(bufs, get_device(), get_context()); - - const ccl_stream* stream_handle = nullptr; - - if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) { - } - else if (mode == usm_support_mode::need_conversion) -#ifdef CCL_ENABLE_SYCL - stream_handle = stream.get(); -#else - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + - " - USM convertation is not supported for such configuration"); -#endif - return ccl::event(std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl( - ccl_broadcast_impl(buf, count, dtype, root, attr, comm_impl.get(), stream_handle)))); + ccl_broadcast_impl(buf, count, dtype, root, attr, comm_impl.get(), stream.get(), deps)))); } /* reduce */ @@ -314,25 +231,17 @@ ccl::event single_device_communicator::reduce_impl(const void* send_buf, const ccl::stream::impl_value_t& stream, const ccl::reduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { - using namespace ::native::detail; - - std::vector<void*> bufs = { (void*)send_buf, recv_buf }; - auto mode = check_assoc_device_memory(bufs, get_device(), get_context()); - - const ccl_stream* stream_handle = nullptr; - - if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) { - } - else if (mode == usm_support_mode::need_conversion) -#ifdef CCL_ENABLE_SYCL - stream_handle = stream.get(); -#else - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + - " - USM convertation is not supported for such configuration"); -#endif - - return ccl::event(std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(ccl_reduce_impl( - send_buf, recv_buf, count, dtype, reduction, root, attr, comm_impl.get(), stream_handle)))); + return ccl::event( + std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(ccl_reduce_impl(send_buf, + recv_buf, + count, + dtype, + reduction, + root, + attr, + comm_impl.get(), + stream.get(), + deps)))); } /* reduce_scatter */ @@ -345,26 +254,16 @@ ccl::event single_device_communicator::reduce_scatter_impl( const ccl::stream::impl_value_t& stream, const ccl::reduce_scatter_attr& attr, const ccl::vector_class<ccl::event>& deps) { - using namespace ::native::detail; - - 
std::vector<void*> bufs = { (void*)send_buf, recv_buf }; - auto mode = check_assoc_device_memory(bufs, get_device(), get_context()); - - const ccl_stream* stream_handle = nullptr; - - if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) { - } - else if (mode == usm_support_mode::need_conversion) -#ifdef CCL_ENABLE_SYCL - stream_handle = stream.get(); -#else - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + - " - USM convertation is not supported for such configuration"); -#endif - - return ccl::event(std::unique_ptr< - ccl::event_impl>(new ccl::host_event_impl(ccl_reduce_scatter_impl( - send_buf, recv_buf, recv_count, dtype, reduction, attr, comm_impl.get(), stream_handle)))); + return ccl::event(std::unique_ptr<ccl::event_impl>( + new ccl::host_event_impl(ccl_reduce_scatter_impl(send_buf, + recv_buf, + recv_count, + dtype, + reduction, + attr, + comm_impl.get(), + stream.get(), + deps)))); } /* sparse_allreduce */ @@ -383,25 +282,6 @@ ccl::event single_device_communicator::sparse_allreduce_impl( const ccl::stream::impl_value_t& stream, const ccl::sparse_allreduce_attr& attr, const ccl::vector_class<ccl::event>& deps) { - using namespace ::native::detail; - - std::vector<void*> bufs = { - (void*)send_ind_buf, (void*)send_val_buf, recv_ind_buf, recv_val_buf - }; - auto mode = check_assoc_device_memory(bufs, get_device(), get_context()); - - const ccl_stream* stream_handle = nullptr; - - if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) { - } - else if (mode == usm_support_mode::need_conversion) -#ifdef CCL_ENABLE_SYCL - stream_handle = stream.get(); -#else - throw ccl::exception(std::string(__PRETTY_FUNCTION__) + - " - USM convertation is not supported for such configuration"); -#endif - return ccl::event(std::unique_ptr<ccl::event_impl>( new ccl::host_event_impl(ccl_sparse_allreduce_impl(send_ind_buf, send_ind_count, @@ -416,7 +296,8 @@ ccl::event single_device_communicator::sparse_allreduce_impl( reduction, attr, comm_impl.get(), - stream_handle)))); + stream.get(), + deps)))); } COMM_INTERFACE_COLL_INSTANTIATION(single_device_communicator); diff --git a/src/common/comm/single_device_communicator/single_device_communicator_impl.hpp b/src/common/comm/single_device_communicator/single_device_communicator_impl.hpp index 8431bd8dd..a57ca84c9 100644 --- a/src/common/comm/single_device_communicator/single_device_communicator_impl.hpp +++ b/src/common/comm/single_device_communicator/single_device_communicator_impl.hpp @@ -93,7 +93,9 @@ ccl::event single_device_communicator::allgatherv_impl(const buffer_type& send_b ccl::native_type_info<buffer_type>::dtype, attr, comm_impl.get(), - stream.get()); + stream.get(), + deps, + true); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } template <class buffer_type> @@ -143,7 +145,9 @@ ccl::event single_device_communicator::allreduce_impl(const buffer_type& send_bu reduction, attr, comm_impl.get(), - stream.get()); + stream.get(), + deps, + true); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } @@ -184,7 +188,9 @@ ccl::event single_device_communicator::alltoall_impl(const buffer_type& send_buf ccl::native_type_info<buffer_type>::dtype, attr, comm_impl.get(), - stream.get()); + stream.get(), + deps, + true); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } @@ -247,7 +253,9 @@ ccl::event single_device_communicator::alltoallv_impl(const buffer_type& send_bu ccl::native_type_info<buffer_type>::dtype, attr, comm_impl.get(), - 
stream.get()); + stream.get(), + deps, + true); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } @@ -289,7 +297,9 @@ ccl::event single_device_communicator::broadcast_impl(buffer_type& buf, root, attr, comm_impl.get(), - stream.get()); + stream.get(), + deps, + true); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } @@ -333,7 +343,9 @@ ccl::event single_device_communicator::reduce_impl(const buffer_type& send_buf, root, attr, comm_impl.get(), - stream_ptr); + stream_ptr, + deps, + true); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } @@ -374,7 +386,9 @@ ccl::event single_device_communicator::reduce_scatter_impl( reduction, attr, comm_impl.get(), - stream_ptr); + stream_ptr, + deps, + true); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } @@ -439,6 +453,8 @@ ccl::event single_device_communicator::sparse_allreduce_impl( reduction, attr, comm_impl.get(), - stream_ptr); + stream_ptr, + deps, + true); return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req)); } diff --git a/src/common/comm/usm_visitor/allreduce_usm_visitor.hpp b/src/common/comm/usm_visitor/allreduce_usm_visitor.hpp index ab16a1679..523abb41b 100644 --- a/src/common/comm/usm_visitor/allreduce_usm_visitor.hpp +++ b/src/common/comm/usm_visitor/allreduce_usm_visitor.hpp @@ -44,128 +44,11 @@ struct allreduce_usm_visitor { ccl::to_string(dtype), " , handle: ", utils::enum_to_underlying(dtype)); + req = get_self()->template allreduce_impl<uint8_t>((const uint8_t*)(const void*)send_buf, + (uint8_t*)(void*)recv_buf, + count, + std::forward<Args>(args)...); - switch (dtype) { - case ccl::datatype::int8: { - using type = int8_t; - req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf), - static_cast<type*>(recv_buf), - count, - std::forward<Args>(args)...); - processed = true; - break; - } - case ccl::datatype::uint8: { - using type = uint8_t; - req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf), - static_cast<type*>(recv_buf), - count, - std::forward<Args>(args)...); - processed = true; - break; - } - case ccl::datatype::int16: { - using type = int16_t; - req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf), - static_cast<type*>(recv_buf), - count, - std::forward<Args>(args)...); - processed = true; - break; - } - case ccl::datatype::uint16: { - using type = uint16_t; - req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf), - static_cast<type*>(recv_buf), - count, - std::forward<Args>(args)...); - processed = true; - break; - } - case ccl::datatype::int32: { - using type = int32_t; - req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf), - static_cast<type*>(recv_buf), - count, - std::forward<Args>(args)...); - processed = true; - break; - } - case ccl::datatype::uint32: { - using type = uint32_t; - req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf), - static_cast<type*>(recv_buf), - count, - std::forward<Args>(args)...); - processed = true; - break; - } - case ccl::datatype::int64: { - using type = int64_t; - req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf), - static_cast<type*>(recv_buf), - count, - std::forward<Args>(args)...); - processed = true; - break; - } - case ccl::datatype::uint64: { - using type = uint64_t; - req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf), - 
static_cast<type*>(recv_buf), - count, - std::forward<Args>(args)...); - processed = true; - break; - } - case ccl::datatype::float16: { - using type = ccl::float16; - req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf), - static_cast<type*>(recv_buf), - count, - std::forward<Args>(args)...); - processed = true; - break; - } - case ccl::datatype::float32: { - using type = float; - req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf), - static_cast<type*>(recv_buf), - count, - std::forward<Args>(args)...); - processed = true; - break; - } - case ccl::datatype::float64: { - using type = double; - req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf), - static_cast<type*>(recv_buf), - count, - std::forward<Args>(args)...); - processed = true; - break; - } - case ccl::datatype::bfloat16: { - using type = ccl::bfloat16; - req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf), - static_cast<type*>(recv_buf), - count, - std::forward<Args>(args)...); - processed = true; - break; - } - default: { - CCL_THROW("unknown datatype ", dtype); - LOG_DEBUG("comm: ", - /*get_self()->to_string(),*/ - " - no found visitor for datatype: ", - ccl::to_string(dtype), - " , handle: ", - utils::enum_to_underlying(dtype), - ", use RAW types"); - break; - } - } return processed; } }; diff --git a/src/common/datatype/datatype.cpp b/src/common/datatype/datatype.cpp index c311870aa..849b363ca 100644 --- a/src/common/datatype/datatype.cpp +++ b/src/common/datatype/datatype.cpp @@ -90,18 +90,18 @@ ccl_datatype_storage::ccl_datatype_storage() { : 0; CCL_ASSERT(size > 0, "Unexpected data type size: ", size, ", for idx: ", idx); - name_str = (idx == ccl::datatype::int8) ? "INT8" - : (idx == ccl::datatype::uint8) ? "UINT8" - : (idx == ccl::datatype::int16) ? "INT16" - : (idx == ccl::datatype::uint16) ? "UINT16" - : (idx == ccl::datatype::int32) ? "INT32" - : (idx == ccl::datatype::uint32) ? "UINT32" - : (idx == ccl::datatype::int64) ? "INT64" - : (idx == ccl::datatype::uint64) ? "UINT64" - : (idx == ccl::datatype::float16) ? "FP16" - : (idx == ccl::datatype::float32) ? "FP32" - : (idx == ccl::datatype::float64) ? "FP64" - : (idx == ccl::datatype::bfloat16) ? "BF16" + name_str = (idx == ccl::datatype::int8) ? "int8" + : (idx == ccl::datatype::uint8) ? "uint8" + : (idx == ccl::datatype::int16) ? "int16" + : (idx == ccl::datatype::uint16) ? "uint16" + : (idx == ccl::datatype::int32) ? "int32" + : (idx == ccl::datatype::uint32) ? "uint32" + : (idx == ccl::datatype::int64) ? "int64" + : (idx == ccl::datatype::uint64) ? "uint64" + : (idx == ccl::datatype::float16) ? "float16" + : (idx == ccl::datatype::float32) ? "float32" + : (idx == ccl::datatype::float64) ? "float64" + : (idx == ccl::datatype::bfloat16) ? "bfloat16" : 0; create_internal(predefined_table, idx, size, name_str); diff --git a/src/common/env/env.cpp b/src/common/env/env.cpp index f612aeb5d..e9ffa8f88 100644 --- a/src/common/env/env.cpp +++ b/src/common/env/env.cpp @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ +#include <climits> #include <dlfcn.h> #include <iterator> +#include <memory> #include <sstream> #include <unistd.h> @@ -44,6 +46,12 @@ std::map<ccl_staging_buffer, std::string> env_data::staging_buffer_names = { std::make_pair(ccl_staging_usm, "usm") }; +std::map<atl_mnic_t, std::string> env_data::mnic_type_names = { + std::make_pair(ATL_MNIC_NONE, "none"), + std::make_pair(ATL_MNIC_LOCAL, "local"), + std::make_pair(ATL_MNIC_GLOBAL, "global") +}; + env_data::env_data() : was_printed(false), @@ -58,8 +66,13 @@ env_data::env_data() atl_transport(ccl_atl_mpi), enable_shm(0), - sync_coll(0), - extra_ep(0), + enable_rma(0), + enable_device_buf(0), + enable_sync_coll(0), + enable_extra_ep(0), + + mnic_type(ATL_MNIC_NONE), + mnic_count(CCL_ENV_SIZET_NOT_SPECIFIED), enable_unordered_coll(0), @@ -69,14 +82,13 @@ env_data::env_data() fusion_check_urgent(1), fusion_cycle_ms(0.2), - enable_rma(0), priority_mode(ccl_priority_none), spin_count(100), yield_type(ccl_yield_pause), max_short_size(0), bcast_part_count(CCL_ENV_SIZET_NOT_SPECIFIED), cache_key_type(ccl_cache_key_match_id), - enable_cache_flush(1), + enable_cache_flush(0), enable_strict_order(0), staging_buffer(ccl_staging_usm), @@ -95,6 +107,7 @@ env_data::env_data() enable_comm_kernels(0), comm_kernels_path(), + comm_kernels_debug(0), gpu_thread_count(CCL_ENV_SIZET_NOT_SPECIFIED), bf16_impl_type(ccl_bf16_no_compiler_support), @@ -123,8 +136,8 @@ void env_data::parse() { if (fw_type == ccl_framework_horovod) { worker_wait = 1; - sync_coll = 1; - extra_ep = 1; + enable_sync_coll = 1; + enable_extra_ep = 1; yield_type = ccl_yield_sched_yield; } @@ -135,8 +148,16 @@ void env_data::parse() { env_2_atl_transport(); env_2_type(CCL_ATL_SHM, enable_shm); - env_2_type(CCL_ATL_SYNC_COLL, sync_coll); - env_2_type(CCL_ATL_EXTRA_EP, extra_ep); + env_2_type(CCL_ATL_RMA, enable_rma); + env_2_type(CCL_ATL_DEVICE_BUF, enable_device_buf); + env_2_type(CCL_ATL_SYNC_COLL, enable_sync_coll); + env_2_type(CCL_ATL_EXTRA_EP, enable_extra_ep); + + env_2_enum(CCL_MNIC, mnic_type_names, mnic_type); + env_2_type(CCL_MNIC_COUNT, mnic_count); + if (mnic_count == CCL_ENV_SIZET_NOT_SPECIFIED) { + mnic_count = worker_count; + } env_2_type(CCL_ALLGATHERV, allgatherv_algo_raw); env_2_type(CCL_ALLREDUCE, allreduce_algo_raw); @@ -176,7 +197,6 @@ void env_data::parse() { if (worker_wait) spin_count = 1000; - env_2_type(CCL_RMA, enable_rma); env_2_enum(CCL_PRIORITY, priority_mode_names, priority_mode); env_2_type(CCL_SPIN_COUNT, spin_count); env_2_enum(CCL_YIELD, ccl_yield_type_names, yield_type); @@ -219,29 +239,7 @@ void env_data::parse() { CCL_THROW_IF_NOT(!ccl_root.empty(), "incorrect comm kernels path, CCL_ROOT not found!"); comm_kernels_path = ccl_root + "/lib/kernels/"; } - - // TODO remove IPC workaround knobs - if (!getenv("DisableStatelessToStatefulOptimization")) { - setenv("DisableStatelessToStatefulOptimization", "1", 1); - LOG_WARN( - "environment variable 'DisableStatelessToStatefulOptimization' is not set, will be used DisableStatelessToStatefulOptimization=1"); - } - if (!getenv("CFESingleSliceDispatchCCSMode")) { - setenv("CFESingleSliceDispatchCCSMode", "1", 1); - LOG_WARN( - "environment variable 'CFESingleSliceDispatchCCSMode' is not set, will be used CFESingleSliceDispatchCCSMode=1"); - } - if (!getenv("OverrideStatelessMocsIndex")) { - setenv("OverrideStatelessMocsIndex", "2", 1); - LOG_WARN( - "environment variable 'OverrideStatelessMocsIndex' is not set, will be used OverrideStatelessMocsIndex=2"); - } - - if (!getenv("CCL_KVS_GET_TIMEOUT")) { - 
setenv("CCL_KVS_GET_TIMEOUT", "10", 1); - LOG_WARN( - "environment variable 'CCL_KVS_GET_TIMEOUT' is not set, will be used CCL_KVS_GET_TIMEOUT=10"); - } + env_2_type(CCL_COMM_KERNELS_DEBUG, comm_kernels_debug); } env_2_type(CCL_GPU_THREAD_COUNT, gpu_thread_count); @@ -278,8 +276,25 @@ void env_data::print(int rank) { else was_printed = true; - auto& global_data = ccl::global_data::get(); + if (rank == 0) { + auto version = utils::get_library_version(); + LOG_INFO("library version: ", version.full); + LOG_INFO("specification version: ", ONECCL_SPEC_VERSION); +#ifdef CCL_ENABLE_SYCL + LOG_INFO("compute backend: ", version.cl_backend_name); +#endif /* CCL_ENABLE_SYCL */ +#ifdef ENABLE_DEBUG + const char* build_mode = "debug"; +#else /* ENABLE_DEBUG */ + const char* build_mode = "release"; +#endif /* ENABLE_DEBUG */ + LOG_INFO("build mode: ", build_mode); + LOG_INFO("C compiler: ", CCL_C_COMPILER); + LOG_INFO("C++ compiler: ", CCL_CXX_COMPILER); + } + + auto& global_data = ccl::global_data::get(); auto local_proc_idx = global_data.executor->get_local_proc_idx(); auto local_proc_count = global_data.executor->get_local_proc_count(); @@ -311,8 +326,13 @@ void env_data::print(int rank) { LOG_INFO(CCL_ATL_TRANSPORT, ": ", str_by_enum(atl_transport_names, atl_transport)); LOG_INFO(CCL_ATL_SHM, ": ", enable_shm); - LOG_DEBUG(CCL_ATL_SYNC_COLL, ": ", sync_coll); - LOG_DEBUG(CCL_ATL_EXTRA_EP, ": ", extra_ep); + LOG_INFO(CCL_ATL_RMA, ": ", enable_rma); + LOG_INFO(CCL_ATL_DEVICE_BUF, ": ", enable_device_buf); + LOG_DEBUG(CCL_ATL_SYNC_COLL, ": ", enable_sync_coll); + LOG_DEBUG(CCL_ATL_EXTRA_EP, ": ", enable_extra_ep); + + LOG_INFO(CCL_MNIC, ": ", str_by_enum(mnic_type_names, mnic_type)); + LOG_INFO(CCL_MNIC_COUNT, ": ", mnic_count); LOG_INFO(CCL_ALLGATHERV, ": ", @@ -349,7 +369,6 @@ void env_data::print(int rank) { LOG_INFO(CCL_FUSION_CHECK_URGENT, ": ", fusion_check_urgent); LOG_INFO(CCL_FUSION_CYCLE_MS, ": ", fusion_cycle_ms); - LOG_INFO(CCL_RMA, ": ", enable_rma); LOG_INFO(CCL_PRIORITY, ": ", str_by_enum(priority_mode_names, priority_mode)); LOG_INFO(CCL_SPIN_COUNT, ": ", spin_count); LOG_INFO(CCL_YIELD, ": ", str_by_enum(ccl_yield_type_names, yield_type)); @@ -384,33 +403,21 @@ void env_data::print(int rank) { : CCL_ENV_STR_NOT_SPECIFIED); LOG_INFO(CCL_ALLTOALL_SCATTER_PLAIN, ": ", alltoall_scatter_plain); +#ifdef CCL_ENABLE_SYCL LOG_INFO(CCL_COMM_KERNELS, ": ", enable_comm_kernels); LOG_INFO(CCL_COMM_KERNELS_PATH, ": ", (!comm_kernels_path.empty()) ? comm_kernels_path : CCL_ENV_STR_NOT_SPECIFIED); + LOG_INFO(CCL_COMM_KERNELS_DEBUG, ": ", comm_kernels_debug); LOG_INFO(CCL_GPU_THREAD_COUNT, ": ", (gpu_thread_count != CCL_ENV_SIZET_NOT_SPECIFIED) ? std::to_string(gpu_thread_count) : CCL_ENV_STR_NOT_SPECIFIED); +#endif /* CCL_ENABLE_SYCL */ LOG_INFO(CCL_BF16, ": ", str_by_enum(bf16_impl_names, bf16_impl_type)); LOG_INFO(CCL_FP16, ": ", str_by_enum(fp16_impl_names, fp16_impl_type)); -#ifdef ENABLE_DEBUG - const char* build_mode = "debug"; -#else - const char* build_mode = "release"; -#endif - LOG_INFO("build mode: ", build_mode); - - LOG_INFO("C compiler: ", CCL_C_COMPILER); - LOG_INFO("C++ compiler: ", CCL_CXX_COMPILER); - - auto version = utils::get_library_version(); - LOG_INFO("library version: ", version.full); - - LOG_INFO("specification version: ", ONECCL_SPEC_VERSION); - char* ccl_root = getenv("CCL_ROOT"); LOG_INFO("CCL_ROOT: ", (ccl_root) ? 
ccl_root : CCL_ENV_STR_NOT_SPECIFIED); @@ -430,6 +437,9 @@ void env_data::print(int rank) { void env_data::set_internal_env() { auto attr = ccl_executor::generate_atl_attr(*this); atl_wrapper::set_internal_env(attr); + if (log_level >= ccl_log_level::info) { + setenv("I_MPI_DEBUG", "4", 0); + } } int env_data::env_2_worker_affinity_auto(size_t local_proc_idx, size_t workers_per_process) { @@ -487,35 +497,59 @@ int env_data::env_2_worker_affinity_auto(size_t local_proc_idx, size_t workers_p return 1; } +int env_data::parse_core_id(const std::string& core_id_str, size_t& result) { + char* end_ptr; + const char* core_id_str_ptr = core_id_str.c_str(); + + errno = 0; + auto core_id = std::strtol(core_id_str_ptr, &end_ptr, 10); + + if ((errno == ERANGE && (core_id == LONG_MAX || core_id == LONG_MIN)) || + (errno != 0 && core_id == 0)) { + LOG_ERROR("core id value is invalid in string: ", core_id_str); + return 0; + } + if (end_ptr == core_id_str_ptr) { + LOG_ERROR("no digits were found in string: ", core_id_str); + return 0; + } + if (core_id < 0) { + LOG_ERROR( + "core id cannot be less than zero but got ", core_id, " in string: ", core_id_str); + return 0; + } + result = core_id; + return 1; +} + int env_data::env_2_worker_affinity(size_t local_proc_idx, size_t local_proc_count) { CCL_THROW_IF_NOT(local_proc_count > 0); - int read_env = 0; - size_t w_idx, read_count = 0; - char* affinity_copy = nullptr; + size_t idx; + std::unique_ptr<char> affinity_copy; char* affinity_to_parse = getenv(CCL_WORKER_AFFINITY); - char* proc_id_str; + char* core_range_str; char* tmp; - size_t proccessor_count; + size_t system_core_count; size_t affinity_size = local_proc_count * worker_count; - worker_affinity.assign(affinity_size, 0); if (!affinity_to_parse || (strlen(affinity_to_parse) == 0) || (strcmp(affinity_to_parse, "auto") == 0)) { + worker_affinity.assign(affinity_size, 0); if (std::getenv(I_MPI_AVAILABLE_CORES_ENV)) { /* generate auto affinity based on IMPI process pinning */ return env_2_worker_affinity_auto(local_proc_idx, worker_count); } else { /* generate auto affinity as last N cores */ - proccessor_count = sysconf(_SC_NPROCESSORS_ONLN); - for (w_idx = 0; w_idx < affinity_size; w_idx++) { - if (w_idx < proccessor_count) { - worker_affinity[w_idx] = proccessor_count - w_idx - 1; + system_core_count = sysconf(_SC_NPROCESSORS_ONLN); + for (idx = 0; idx < affinity_size; idx++) { + if (idx < system_core_count) { + worker_affinity[idx] = system_core_count - idx - 1; } else { - worker_affinity[w_idx] = worker_affinity[w_idx % proccessor_count]; + worker_affinity[idx] = worker_affinity[idx % system_core_count]; } } return 1; @@ -524,47 +558,58 @@ int env_data::env_2_worker_affinity(size_t local_proc_idx, size_t local_proc_cou /* create copy of original buffer because it will be modified in strsep */ size_t affinity_len = strlen(affinity_to_parse); - affinity_copy = static_cast<char*>(CCL_CALLOC(affinity_len + 1, "affinity_copy")); - CCL_MEMCPY(affinity_copy, affinity_to_parse, affinity_len); - tmp = affinity_copy; - - for (w_idx = 0; w_idx < affinity_size; w_idx++) { - proc_id_str = strsep(&tmp, ","); - if (proc_id_str != NULL) { - if (atoi(proc_id_str) < 0) { - LOG_ERROR( - "unexpected proc_id ", proc_id_str, ", affinity string ", affinity_to_parse); - read_env = 0; - CCL_FREE(affinity_copy); - return read_env; - } - worker_affinity[w_idx] = std::strtoul(proc_id_str, nullptr, 10); - read_count++; + affinity_copy = + std::unique_ptr<char>(static_cast<char*>(CCL_CALLOC(affinity_len + 1, "affinity_copy"))); 
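+    // strsep() parses the copy in place: it overwrites each ',' with '\0' and
+    // advances 'tmp', so the original string stays intact for error messages.
+    // e.g. CCL_WORKER_AFFINITY="3,4,10-13" with 2 local processes x 3 workers
+    // each expands to cores {3,4,10,11,12,13}, one core per worker thread.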
+ CCL_MEMCPY(affinity_copy.get(), affinity_to_parse, affinity_len); + tmp = affinity_copy.get(); + + while (tmp) { + core_range_str = strsep(&tmp, ","); + if (!core_range_str) { + break; } - else { - LOG_ERROR("unexpected end of affinity string, expected ", - affinity_size, - " numbers, read ", - read_count, - ", affinity string ", - affinity_to_parse); - read_env = 0; - CCL_FREE(affinity_copy); - return read_env; + + auto core_range = tokenize<std::vector<std::string>>(std::string(core_range_str), '-'); + + if ((core_range.size() != 2) && (core_range.size() != 1)) { + LOG_ERROR( + "unexpected format in affinity: ", + affinity_to_parse, + ", specify core range using <first_core>-<last_core> or single core using <core>"); + return 0; + } + + if (core_range.size() == 1) { + /* to unify logic below */ + core_range.push_back(*core_range.begin()); + } + + CCL_ASSERT(core_range.size() == 2, "unexpected number of cores in range"); + + size_t first_core, last_core; + if (!parse_core_id(core_range[0], first_core) || !parse_core_id(core_range[1], last_core)) { + return 0; + } + + if (first_core > last_core) { + LOG_ERROR("unexpected first and last cores in range: ", + core_range_str, + ", first core should be less or equal to last core"); + return 0; + } + + for (idx = first_core; idx <= last_core; idx++) { + worker_affinity.push_back(idx); } } - if (read_count < affinity_size) { - LOG_ERROR( - "unexpected number of processors (specify 1 logical processor per 1 worker thread), affinity string ", - affinity_to_parse); - read_env = 0; - CCL_FREE(affinity_copy); - return read_env; - } - read_env = 1; - CCL_FREE(affinity_copy); - return read_env; + if (worker_affinity.size() < affinity_size) { + LOG_ERROR("unexpected number of cores in affinity: ", + affinity_to_parse, + ", specify 1 core per 1 worker thread"); + return 0; + } + return 1; } void env_data::env_2_atl_transport() { diff --git a/src/common/env/env.hpp b/src/common/env/env.hpp index 102000187..00e708de2 100644 --- a/src/common/env/env.hpp +++ b/src/common/env/env.hpp @@ -49,9 +49,14 @@ constexpr const char* I_MPI_AVAILABLE_CORES_DELIMS = ",x"; constexpr const char* CCL_ATL_TRANSPORT = "CCL_ATL_TRANSPORT"; constexpr const char* CCL_ATL_SHM = "CCL_ATL_SHM"; +constexpr const char* CCL_ATL_RMA = "CCL_ATL_RMA"; +constexpr const char* CCL_ATL_DEVICE_BUF = "CCL_ATL_DEVICE_BUF"; constexpr const char* CCL_ATL_SYNC_COLL = "CCL_ATL_SYNC_COLL"; constexpr const char* CCL_ATL_EXTRA_EP = "CCL_ATL_EXTRA_EP"; +constexpr const char* CCL_MNIC = "CCL_MNIC"; +constexpr const char* CCL_MNIC_COUNT = "CCL_MNIC_COUNT"; + constexpr const char* CCL_ALLGATHERV = "CCL_ALLGATHERV"; constexpr const char* CCL_ALLREDUCE = "CCL_ALLREDUCE"; constexpr const char* CCL_ALLTOALL = "CCL_ALLTOALL"; @@ -69,7 +74,6 @@ constexpr const char* CCL_FUSION_COUNT_THRESHOLD = "CCL_FUSION_COUNT_THRESHOLD"; constexpr const char* CCL_FUSION_CHECK_URGENT = "CCL_FUSION_CHECK_URGENT"; constexpr const char* CCL_FUSION_CYCLE_MS = "CCL_FUSION_CYCLE_MS"; -constexpr const char* CCL_RMA = "CCL_RMA"; constexpr const char* CCL_PRIORITY = "CCL_PRIORITY"; constexpr const char* CCL_SPIN_COUNT = "CCL_SPIN_COUNT"; constexpr const char* CCL_YIELD = "CCL_YIELD"; @@ -95,32 +99,17 @@ constexpr const char* CCL_ALLTOALL_SCATTER_PLAIN = "CCL_ALLTOALL_SCATTER_PLAIN"; constexpr const char* CCL_COMM_KERNELS = "CCL_COMM_KERNELS"; constexpr const char* CCL_COMM_KERNELS_PATH = "CCL_COMM_KERNELS_PATH"; +constexpr const char* CCL_COMM_KERNELS_DEBUG = "CCL_COMM_KERNELS_DEBUG"; constexpr const char* CCL_GPU_THREAD_COUNT = 
"CCL_GPU_THREAD_COUNT"; constexpr const char* CCL_BF16 = "CCL_BF16"; constexpr const char* CCL_FP16 = "CCL_FP16"; -enum ccl_priority_mode { - ccl_priority_none, - ccl_priority_direct, - ccl_priority_lifo, - - ccl_priority_last_value -}; - -enum ccl_atl_transport { - ccl_atl_ofi, - ccl_atl_mpi, - - ccl_atl_last_value -}; +enum ccl_priority_mode { ccl_priority_none, ccl_priority_direct, ccl_priority_lifo }; -enum ccl_staging_buffer { - ccl_staging_regular, - ccl_staging_usm, +enum ccl_atl_transport { ccl_atl_ofi, ccl_atl_mpi }; - ccl_staging_last_value -}; +enum ccl_staging_buffer { ccl_staging_regular, ccl_staging_usm }; namespace ccl { @@ -154,8 +143,13 @@ class env_data { ccl_atl_transport atl_transport; int enable_shm; - int sync_coll; - int extra_ep; + int enable_rma; + int enable_device_buf; + int enable_sync_coll; + int enable_extra_ep; + + atl_mnic_t mnic_type; + ssize_t mnic_count; /* parsing logic can be quite complex @@ -179,7 +173,6 @@ class env_data { int fusion_check_urgent; float fusion_cycle_ms; - int enable_rma; ccl_priority_mode priority_mode; size_t spin_count; ccl_yield_type yield_type; @@ -205,6 +198,7 @@ class env_data { int enable_comm_kernels; std::string comm_kernels_path; + int comm_kernels_debug; ssize_t gpu_thread_count; ccl_bf16_impl_type bf16_impl_type; @@ -280,12 +274,14 @@ class env_data { static std::map<ccl_priority_mode, std::string> priority_mode_names; static std::map<ccl_atl_transport, std::string> atl_transport_names; static std::map<ccl_staging_buffer, std::string> staging_buffer_names; + static std::map<atl_mnic_t, std::string> mnic_type_names; int env_2_worker_affinity(size_t local_proc_idx, size_t local_proc_count); void env_2_atl_transport(); private: int env_2_worker_affinity_auto(size_t local_proc_idx, size_t workers_per_process); + int parse_core_id(const std::string& core_id_str, size_t& result); }; } /* namespace ccl */ diff --git a/src/common/event/ccl_event.cpp b/src/common/event/ccl_event.cpp index af63d787b..855536d07 100644 --- a/src/common/event/ccl_event.cpp +++ b/src/common/event/ccl_event.cpp @@ -23,39 +23,6 @@ ccl_event::ccl_event(event_native_t& event, const ccl::library_version& version) command_type_val(), command_execution_status_val() {} -ccl_event::ccl_event(event_native_handle_t event, - event_native_context_t context, - const ccl::library_version& version) - : version(version), - command_type_val(), - command_execution_status_val() { -#ifdef CCL_ENABLE_SYCL - native_event = event_native_t{ event, context }; -#else - //TODO - throw; -#endif -} - -void ccl_event::build_from_params() { - if (!creation_is_postponed) { - throw ccl::exception("error"); - } -#ifdef CCL_ENABLE_SYCL - /* TODO unavailbale?? 
- event_native_t event_candidate{native_context}; - std::swap(event_candidate, native_event); //TODO USE attributes fro sycl queue construction - */ - - throw ccl::exception("build_from_attr is not availbale for sycl::event"); -#else - - //TODO use attributes - -#endif - creation_is_postponed = false; -} - //Export Attributes typename ccl_event::version_traits_t::type ccl_event::set_attribute_value( typename version_traits_t::type val, @@ -75,11 +42,6 @@ typename ccl_event::native_handle_traits_t::return_type& ccl_event::get_attribut return native_event; } -typename ccl_event::context_traits_t::return_type& ccl_event::get_attribute_value( - const context_traits_t& id) { - return native_context; -} - typename ccl_event::command_type_traits_t::type ccl_event::set_attribute_value( typename command_type_traits_t::type val, const command_type_traits_t& t) { diff --git a/src/common/event/ccl_event.hpp b/src/common/event/ccl_event.hpp index 9185e8912..281fc2263 100644 --- a/src/common/event/ccl_event.hpp +++ b/src/common/event/ccl_event.hpp @@ -33,17 +33,11 @@ class alignas(CACHELINE_SIZE) ccl_event { using event_native_handle_t = typename ccl::unified_event_type::handle_t; using event_native_t = typename ccl::unified_event_type::ccl_native_t; - using event_native_context_handle_t = typename ccl::unified_context_type::handle_t; - using event_native_context_t = typename ccl::unified_context_type::ccl_native_t; - ccl_event() = delete; ccl_event(const ccl_event& other) = delete; ccl_event& operator=(const ccl_event& other) = delete; ccl_event(event_native_t& event, const ccl::library_version& version); - ccl_event(event_native_handle_t event, - event_native_context_t context, - const ccl::library_version& version); ~ccl_event() = default; //Export Attributes @@ -61,10 +55,6 @@ class alignas(CACHELINE_SIZE) ccl_event { typename native_handle_traits_t::return_type& get_attribute_value( const native_handle_traits_t& id); - using context_traits_t = - ccl::detail::ccl_api_type_attr_traits<ccl::event_attr_id, ccl::event_attr_id::context>; - typename context_traits_t::return_type& get_attribute_value(const context_traits_t& id); - using command_type_traits_t = ccl::detail::ccl_api_type_attr_traits<ccl::event_attr_id, ccl::event_attr_id::command_type>; typename command_type_traits_t::return_type set_attribute_value( @@ -89,7 +79,6 @@ class alignas(CACHELINE_SIZE) ccl_event { private: const ccl::library_version version; event_native_t native_event; - event_native_context_t native_context; bool creation_is_postponed{ false }; typename command_type_traits_t::return_type command_type_val; diff --git a/src/common/framework/framework.hpp b/src/common/framework/framework.hpp index 7dddd4cda..8b00d88ea 100644 --- a/src/common/framework/framework.hpp +++ b/src/common/framework/framework.hpp @@ -16,6 +16,7 @@ #pragma once #include <map> +#include <string> typedef int (*ccl_horovod_init_function)(const int*, int); extern ccl_horovod_init_function horovod_init_function; diff --git a/src/common/request/request.cpp b/src/common/request/request.cpp index b460fd3c9..d34f46fa6 100644 --- a/src/common/request/request.cpp +++ b/src/common/request/request.cpp @@ -27,7 +27,7 @@ ccl_request::~ccl_request() { auto counter = completion_counter.load(std::memory_order_acquire); LOG_DEBUG("delete req ", this, " with counter ", counter); if (counter != 0 && !ccl::global_data::get().is_ft_enabled) { - LOG_ERROR("unexpected completion_counter ", counter); + LOG_WARN("unexpected completion_counter ", counter); } } diff --git 
a/src/common/utils/tuple.hpp b/src/common/utils/tuple.hpp index 327c274ca..2159324bc 100644 --- a/src/common/utils/tuple.hpp +++ b/src/common/utils/tuple.hpp @@ -122,12 +122,12 @@ void ccl_tuple_for_each_args(specific_tuple&& t, functor&& f, args_t&&... args) } template <typename specific_tuple, size_t cur_index, typename functor, class... FunctionArgs> -void ccl_tuple_for_each_indexed_impl(functor, +void ccl_tuple_for_each_indexed_impl(functor&, std::true_type tuple_finished, const FunctionArgs&... args) {} template <typename specific_tuple, size_t cur_index, typename functor, class... FunctionArgs> -void ccl_tuple_for_each_indexed_impl(functor f, +void ccl_tuple_for_each_indexed_impl(functor& f, std::false_type tuple_not_finished, const FunctionArgs&... args) { using tuple_element_t = typename std::tuple_element<cur_index, specific_tuple>::type; @@ -144,7 +144,7 @@ void ccl_tuple_for_each_indexed_impl(functor f, } template <typename specific_tuple, typename functor, class... FunctionArgs> -void ccl_tuple_for_each_indexed(functor f, const FunctionArgs&... args) { +void ccl_tuple_for_each_indexed(functor& f, const FunctionArgs&... args) { constexpr std::size_t tuple_size = std::tuple_size<typename std::remove_reference<specific_tuple>::type>::value; static_assert(tuple_size != 0, "Nothing to do, tuple is empty"); diff --git a/src/common/utils/utils.hpp b/src/common/utils/utils.hpp index 4be5d3c3d..ab3412570 100644 --- a/src/common/utils/utils.hpp +++ b/src/common/utils/utils.hpp @@ -57,8 +57,10 @@ #endif #define CACHELINE_SIZE 64 -#define ONE_MB 1048576 -#define TWO_MB 2097152 + +#define CCL_REG_MSG_ALIGNMENT (4096) +#define CCL_LARGE_MSG_ALIGNMENT (2 * 1024 * 1024) +#define CCL_LARGE_MSG_THRESHOLD (1 * 1024 * 1024) #define CCL_MEMCPY(dest, src, n) std::copy((char*)(src), (char*)(src) + (n), (char*)(dest)) @@ -103,7 +105,10 @@ #define CCL_MALLOC_WRAPPER(size, name) \ ({ \ - void* ptr = CCL_MEMALIGN_IMPL(size, (size < TWO_MB) ? 
CACHELINE_SIZE : TWO_MB); \ + size_t alignment = CCL_REG_MSG_ALIGNMENT; \ + if (size >= CCL_LARGE_MSG_THRESHOLD) \ + alignment = CCL_LARGE_MSG_ALIGNMENT; \ + void* ptr = CCL_MEMALIGN_IMPL(size, alignment); \ CCL_THROW_IF_NOT(ptr, "CCL cannot allocate bytes: ", size, ", out of memory, ", name); \ ptr; \ }) diff --git a/src/comp/bf16/bf16.cpp b/src/comp/bf16/bf16.cpp index 3f46792cc..116e8ec14 100644 --- a/src/comp/bf16/bf16.cpp +++ b/src/comp/bf16/bf16.cpp @@ -54,7 +54,7 @@ void ccl_bf16_reduce(const void* in_buf, void ccl_convert_fp32_to_bf16(const void* src, void* dst) { #ifdef CCL_BF16_AVX512BF_COMPILER if (ccl::global_data::env().bf16_impl_type == ccl_bf16_avx512bf) { - _mm256_storeu_si256((__m256i*)(dst), _mm512_cvtneps_pbh(_mm512_loadu_ps(src))); + _mm256_storeu_si256((__m256i*)(dst), (__m256i)_mm512_cvtneps_pbh(_mm512_loadu_ps(src))); } else #endif @@ -120,15 +120,15 @@ void ccl_bf16_reduce(const void* in_buf, void* inout_buf, size_t* out_cnt, ccl::reduction reduction_op) { - CCL_FATAL("BF16 reduction is requested but CCL was compiled w/o BF16 support"); + CCL_FATAL("BF16 reduction was requested but CCL was compiled w/o BF16 support"); } void ccl_convert_fp32_to_bf16_arrays(void* fp32_buf, void* bf16_buf, size_t count) { - CCL_FATAL("BF16 reduction is requested but CCL was compiled w/o BF16 support"); + CCL_FATAL("FP32->BF16 conversion was requested but CCL was compiled w/o BF16 support"); } void ccl_convert_bf16_to_fp32_arrays(void* bf16_buf, float* fp32_buf, size_t count) { - CCL_FATAL("BF16 reduction is requested but CCL was compiled w/o BF16 support"); + CCL_FATAL("BF16->FP32 conversion was requested but CCL was compiled w/o BF16 support"); } #endif /* CCL_BF16_COMPILER */ diff --git a/src/comp/bf16/bf16.hpp b/src/comp/bf16/bf16.hpp index 229841d28..18d6c797b 100644 --- a/src/comp/bf16/bf16.hpp +++ b/src/comp/bf16/bf16.hpp @@ -26,13 +26,13 @@ __attribute__((target("avx512bw,avx512vl"))) void ccl_bf16_reduce(const void* in_buf, size_t in_cnt, void* inout_buf, size_t* out_cnt, ccl::reduction reduction_op); -#else +#else /* CCL_BF16_TARGET_ATTRIBUTES */ void ccl_bf16_reduce(const void* in_buf, size_t in_cnt, void* inout_buf, size_t* out_cnt, ccl::reduction reduction_op); -#endif +#endif /* CCL_BF16_TARGET_ATTRIBUTES */ void ccl_convert_fp32_to_bf16_arrays(void*, void*, size_t); void ccl_convert_bf16_to_fp32_arrays(void*, float*, size_t); @@ -46,7 +46,7 @@ void ccl_convert_fp32_to_bf16(const void* src, void* dst) #else void ccl_convert_fp32_to_bf16(const void* src, void* dst) __attribute__((target("avx512bw"))); #endif -#endif +#endif /* CCL_BF16_TARGET_ATTRIBUTES */ #ifdef CCL_BF16_TARGET_ATTRIBUTES #ifdef CCL_BF16_AVX512BF_COMPILER @@ -55,6 +55,6 @@ void ccl_convert_bf16_to_fp32(const void* src, void* dst) #else void ccl_convert_bf16_to_fp32(const void* src, void* dst) __attribute__((target("avx512bw"))); #endif -#endif +#endif /* CCL_BF16_TARGET_ATTRIBUTES */ #endif /* CCL_BF16_COMPILER */ diff --git a/src/comp/bf16/bf16_intrisics.hpp b/src/comp/bf16/bf16_intrisics.hpp index 4455113c8..e452aab9c 100644 --- a/src/comp/bf16/bf16_intrisics.hpp +++ b/src/comp/bf16/bf16_intrisics.hpp @@ -71,7 +71,7 @@ BF16_INLINE_TARGET_ATTRIBUTE_BW void ccl_fp32_store_as_bf16_avx512f(const void* #ifdef CCL_BF16_AVX512BF_COMPILER BF16_INLINE_TARGET_ATTRIBUTE void ccl_fp32_store_as_bf16_avx512bf(const void* src, void* dst) { - _mm256_storeu_si256((__m256i*)(dst), _mm512_cvtneps_pbh(_mm512_loadu_ps(src))); + _mm256_storeu_si256((__m256i*)(dst), (__m256i)_mm512_cvtneps_pbh(_mm512_loadu_ps(src))); } #endif 
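For reference, the fp32-to-bf16 store that the hunk above patches can be read in isolation as the following sketch (the helper name is illustrative; assumes a compiler with AVX512F and AVX512BF16 support, i.e. the CCL_BF16_AVX512BF_COMPILER path):

    #include <immintrin.h>

    // Round 16 fp32 values to bf16 (round-to-nearest-even) and store 32 raw bytes.
    // _mm512_cvtneps_pbh returns __m256bh, while _mm256_storeu_si256 expects
    // __m256i, hence the explicit (__m256i) cast these hunks add.
    static inline void fp32_to_bf16_x16(const float* src, void* dst) {
        __m512 v = _mm512_loadu_ps(src);
        _mm256_storeu_si256((__m256i*)dst, (__m256i)_mm512_cvtneps_pbh(v));
    }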
@@ -96,9 +96,11 @@ BF16_INLINE_TARGET_ATTRIBUTE void ccl_fp32_store_as_bf16_avx512bf(const void* sr if (len == 0) \ return; \ uint16_t mask = ((uint16_t)0xFFFF) >> (CCL_BF16_IN_M256 - len); \ - __m256i vbf16_out; \ - ccl_bf16_reduce_inputs_##impl_type(in, inout, (void*)&vbf16_out, op); \ - _mm256_mask_storeu_epi16(inout, (__mmask16)mask, vbf16_out); \ + __m256i a = _mm256_maskz_loadu_epi16(mask, in); \ + __m256i b = _mm256_maskz_loadu_epi16(mask, inout); \ + __m256i res; \ + ccl_bf16_reduce_inputs_##impl_type(&a, &b, &res, op); \ + _mm256_mask_storeu_epi16(inout, (__mmask16)mask, res); \ } \ \ BF16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_bf16_reduce_impl_##impl_type( \ diff --git a/src/comp/comp.cpp b/src/comp/comp.cpp index f9eb9905a..8bee9f353 100644 --- a/src/comp/comp.cpp +++ b/src/comp/comp.cpp @@ -20,6 +20,11 @@ #include "common/log/log.hpp" #include "common/global/global.hpp" #include "common/utils/enums.hpp" +#include "sched/queue/queue.hpp" + +#ifdef CCL_ENABLE_SYCL +#include <CL/sycl.hpp> +#endif /* CCL_ENABLE_SYCL */ #define CCL_REDUCE(type) \ do { \ @@ -60,14 +65,14 @@ ccl::status ccl_comp_copy(const void* in_buf, return ccl::status::success; } -ccl::status ccl_comp_reduce(const void* in_buf, - size_t in_count, - void* inout_buf, - size_t* out_count, - const ccl_datatype& dtype, - ccl::reduction reduction, - ccl::reduction_fn reduction_fn, - const ccl::fn_context* context) { +ccl::status ccl_comp_reduce_regular(const void* in_buf, + size_t in_count, + void* inout_buf, + size_t* out_count, + const ccl_datatype& dtype, + ccl::reduction reduction, + ccl::reduction_fn reduction_fn, + const ccl::fn_context* context) { if (reduction == ccl::reduction::custom) { CCL_THROW_IF_NOT(reduction_fn, "custom reduction requires user callback"); reduction_fn(in_buf, in_count, inout_buf, out_count, dtype.idx(), context); @@ -97,6 +102,76 @@ ccl::status ccl_comp_reduce(const void* in_buf, return ccl::status::success; } +ccl::status ccl_comp_reduce(ccl_sched* sched, + const void* in_buf, + size_t in_count, + void* inout_buf, + size_t* out_count, + const ccl_datatype& dtype, + ccl::reduction reduction, + ccl::reduction_fn reduction_fn, + const ccl::fn_context* context) { +#ifdef CCL_ENABLE_SYCL + ccl_stream* stream = (ccl_stream*)sched->coll_param.stream; + + if (!stream) { + return ccl_comp_reduce_regular( + in_buf, in_count, inout_buf, out_count, dtype, reduction, reduction_fn, context); + } + + sycl::queue* q = stream->get_native_stream(sched->queue->get_idx()); + CCL_THROW_IF_NOT(q, "null sycl queue"); + auto in_ptr_type = sycl::get_pointer_type(in_buf, q->get_context()); + auto inout_ptr_type = sycl::get_pointer_type(inout_buf, q->get_context()); + + LOG_DEBUG("in_ptr_type: ", + native::detail::usm_to_string(in_ptr_type), + ", inout_ptr_type: ", + native::detail::usm_to_string(inout_ptr_type), + ", native_stream: ", + stream->to_string(), + ", in_count: ", + in_count) + + if ((in_ptr_type != sycl::usm::alloc::device) && (inout_ptr_type != sycl::usm::alloc::device)) { + return ccl_comp_reduce_regular( + in_buf, in_count, inout_buf, out_count, dtype, reduction, reduction_fn, context); + } + + void* host_in_buf = (void*)in_buf; + void* host_inout_buf = inout_buf; + size_t bytes = in_count * dtype.size(); + + if (in_ptr_type == sycl::usm::alloc::device) { + host_in_buf = CCL_MALLOC(bytes, "host_in_buf"); + q->memcpy(host_in_buf, in_buf, bytes).wait(); + } + + if (inout_ptr_type == sycl::usm::alloc::device) { + host_inout_buf = CCL_MALLOC(bytes, "host_inout_buf"); + q->memcpy(host_inout_buf, 
inout_buf, bytes).wait(); + } + + ccl_comp_reduce_regular( + host_in_buf, in_count, host_inout_buf, out_count, dtype, reduction, reduction_fn, context); + + if (host_in_buf != in_buf) { + CCL_FREE(host_in_buf); + } + + if (host_inout_buf != inout_buf) { + q->memcpy(inout_buf, host_inout_buf, bytes).wait(); + CCL_FREE(host_inout_buf); + } + + return ccl::status::success; + +#else /* CCL_ENABLE_SYCL */ + return ccl_comp_reduce_regular( + in_buf, in_count, inout_buf, out_count, dtype, reduction, reduction_fn, context); +#endif /* CCL_ENABLE_SYCL */ +} + ccl::status ccl_comp_batch_reduce(const void* in_buf, const std::vector<size_t>& offsets, size_t in_count, @@ -118,28 +193,28 @@ ccl::status ccl_comp_batch_reduce(const void* in_buf, for (size_t i = 1; i < offsets.size(); i++) { ccl_convert_bf16_to_fp32_arrays( (char*)in_buf + dtype.size() * offsets[i], tmp, in_count); - ccl_comp_reduce(tmp, - in_count, - acc, - out_count, - ccl::global_data::get().dtypes->get(ccl::datatype::float32), - reduction, - reduction_fn, - context); + ccl_comp_reduce_regular(tmp, + in_count, + acc, + out_count, + ccl::global_data::get().dtypes->get(ccl::datatype::float32), + reduction, + reduction_fn, + context); } ccl_convert_fp32_to_bf16_arrays(acc, inout_buf, in_count); } else { for (size_t i = 1; i < offsets.size(); i++) { - ccl_comp_reduce((char*)in_buf + dtype.size() * offsets[i], - in_count, - inout_buf, - out_count, - dtype, - reduction, - reduction_fn, - context); + ccl_comp_reduce_regular((char*)in_buf + dtype.size() * offsets[i], + in_count, + inout_buf, + out_count, + dtype, + reduction, + reduction_fn, + context); } } diff --git a/src/comp/comp.hpp b/src/comp/comp.hpp index 10170717d..fd9124d51 100644 --- a/src/comp/comp.hpp +++ b/src/comp/comp.hpp @@ -16,14 +16,16 @@ #pragma once #include "common/datatype/datatype.hpp" -#include "oneapi/ccl/types.hpp" #include "internal_types.hpp" +#include "oneapi/ccl/types.hpp" +#include "sched/sched.hpp" ccl::status ccl_comp_copy(const void* in_buf, void* out_buf, size_t count, const ccl_datatype& dtype); -ccl::status ccl_comp_reduce(const void* in_buf, +ccl::status ccl_comp_reduce(ccl_sched* sched, + const void* in_buf, size_t in_count, void* inout_buf, size_t* out_count, diff --git a/src/comp/fp16/fp16.cpp b/src/comp/fp16/fp16.cpp index 74ccf0cea..f9ac7641f 100644 --- a/src/comp/fp16/fp16.cpp +++ b/src/comp/fp16/fp16.cpp @@ -65,7 +65,15 @@ void ccl_fp16_reduce(const void* in_buf, void* inout_buf, size_t* out_cnt, ccl::reduction op) { - CCL_FATAL("FP16 reduction is requested but CCL was compiled w/o FP16 support"); + CCL_FATAL("FP16 reduction was requested but CCL was compiled w/o FP16 support"); +} + +void ccl_convert_fp32_to_fp16(const void* src, void* dst) { + CCL_FATAL("FP32->FP16 conversion was requested but CCL was compiled w/o FP16 support"); +} + +void ccl_convert_fp16_to_fp32(const void* src, void* dst) { + CCL_FATAL("FP16->FP32 conversion was requested but CCL was compiled w/o FP16 support"); } #endif /* CCL_FP16_COMPILER */ diff --git a/src/comp/fp16/fp16.hpp b/src/comp/fp16/fp16.hpp index e5cd37d9b..62e66ac3d 100644 --- a/src/comp/fp16/fp16.hpp +++ b/src/comp/fp16/fp16.hpp @@ -18,22 +18,19 @@ #include "oneapi/ccl/types.hpp" #ifdef CCL_FP16_TARGET_ATTRIBUTES -__attribute__((target("avx512f,f16c"))) void ccl_fp16_reduce(const void* in_buf, - size_t in_cnt, - void* inout_buf, - size_t* out_cnt, - ccl::reduction reduction_op); +__attribute__((target("avx512bw,avx512vl,f16c"))) void ccl_fp16_reduce(const void* in_buf, + size_t in_cnt, + void* inout_buf, + 
size_t* out_cnt, + ccl::reduction reduction_op); +__attribute__((target("f16c"))) void ccl_convert_fp32_to_fp16(const void* src, void* dst); +__attribute__((target("f16c"))) void ccl_convert_fp16_to_fp32(const void* src, void* dst); #else /* CCL_FP16_TARGET_ATTRIBUTES */ void ccl_fp16_reduce(const void* in_buf, size_t in_cnt, void* inout_buf, size_t* out_cnt, ccl::reduction reduction_op); +void ccl_convert_fp32_to_fp16(const void* src, void* dst); +void ccl_convert_fp16_to_fp32(const void* src, void* dst); #endif /* CCL_FP16_TARGET_ATTRIBUTES */ - -#ifdef CCL_FP16_COMPILER -#ifdef CCL_FP16_TARGET_ATTRIBUTES -void ccl_convert_fp32_to_fp16(const void* src, void* dst) __attribute__((target("f16c"))); -void ccl_convert_fp16_to_fp32(const void* src, void* dst) __attribute__((target("f16c"))); -#endif /* CCL_FP16_TARGET_ATTRIBUTES */ -#endif /* CCL_FP16_COMPILER */ diff --git a/src/comp/fp16/fp16_intrisics.hpp b/src/comp/fp16/fp16_intrisics.hpp index 3b6ad8faf..4b2f4b28e 100644 --- a/src/comp/fp16/fp16_intrisics.hpp +++ b/src/comp/fp16/fp16_intrisics.hpp @@ -19,6 +19,7 @@ #include <immintrin.h> #include <inttypes.h> +#include <string.h> #include "common/global/global.hpp" #include "comp/fp16/fp16_utils.hpp" @@ -28,7 +29,7 @@ #define CCL_FP16_STEP_256 8 #ifdef CCL_FP16_TARGET_ATTRIBUTES -#define FP16_ALL_ATTRS "f16c,avx512f" +#define FP16_ALL_ATTRS "f16c,avx512f,avx512bw,avx512vl" #define FP16_TARGET_ATTRIBUTE_F16C __attribute__((target("f16c"))) #define FP16_TARGET_ATTRIBUTE_AVX512 __attribute__((target("avx512f"))) #define FP16_TARGET_ATTRIBUTE_ALL __attribute__((target(FP16_ALL_ATTRS))) @@ -91,6 +92,21 @@ FP16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_fp16_reduce_inputs_256( _mm_storeu_si128((__m128i*)(res), _mm256_cvtps_ph(vfp32_out, 0)); } +FP16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_fp16_reduce_tile_256(const void* in, + void* inout, + uint8_t len, + ccl_fp16_reduction_func_ptr_256 op) { + if (len == 0) + return; + uint16_t a[CCL_FP16_STEP_256]; + uint16_t b[CCL_FP16_STEP_256]; + uint16_t res[CCL_FP16_STEP_256]; + memcpy(a, in, len * sizeof(uint16_t)); + memcpy(b, inout, len * sizeof(uint16_t)); + ccl_fp16_reduce_inputs_256(a, b, res, op); + memcpy(inout, res, len * sizeof(uint16_t)); +} + FP16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_fp16_reduce_inputs_512( const void* a, const void* b, @@ -103,24 +119,26 @@ FP16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_fp16_reduce_inputs_512( _mm256_storeu_si256((__m256i*)(res), _mm512_cvtps_ph(vfp32_out, 0)); } +FP16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_fp16_reduce_tile_512(const void* in, + void* inout, + uint8_t len, + ccl_fp16_reduction_func_ptr_512 op) { + if (len == 0) + return; + uint16_t mask = ((uint16_t)0xFFFF) >> (CCL_FP16_STEP_512 - len); + __m256i a = _mm256_maskz_loadu_epi16(mask, in); + __m256i b = _mm256_maskz_loadu_epi16(mask, inout); + __m256i res; + ccl_fp16_reduce_inputs_512(&a, &b, &res, op); + _mm256_mask_storeu_epi16(inout, (__mmask16)mask, res); +} + #define CCL_FP16_DEFINE_REDUCE_FUNC(VLEN) \ \ FP16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_fp16_reduce_main_##VLEN( \ const void* in, const void* inout, ccl_fp16_reduction_func_ptr_##VLEN op) { \ ccl_fp16_reduce_inputs_##VLEN(in, inout, (void*)inout, op); \ } \ -\ - FP16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_fp16_reduce_tile_##VLEN( \ - const void* in, void* inout, uint8_t len, ccl_fp16_reduction_func_ptr_##VLEN op) { \ - if (len == 0) \ - return; \ - uint16_t fp16_res[CCL_FP16_STEP_##VLEN]; \ - ccl_fp16_reduce_inputs_##VLEN(in, inout, fp16_res, op); \ - uint16_t* inout_ptr = (uint16_t*)inout; \ - for 
(int i = 0; i < len; i++) { \ - inout_ptr[i] = fp16_res[i]; \ - } \ - } \ \ FP16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_fp16_reduce_impl_##VLEN( \ const void* in_buf, \ diff --git a/src/comp/fp16/fp16_utils.hpp b/src/comp/fp16/fp16_utils.hpp index d24270fd7..315060c80 100644 --- a/src/comp/fp16/fp16_utils.hpp +++ b/src/comp/fp16/fp16_utils.hpp @@ -48,10 +48,13 @@ __attribute__((__always_inline__)) inline std::set<ccl_fp16_impl_type> ccl_fp16_ /* AVX512 capabilities for FP16 implementation */ /* CPUID.(EAX=07H, ECX=0):EBX.AVX512F [bit 16] */ + /* CPUID.(EAX=07H, ECX=0):EBX.AVX512BW [bit 30] */ + /* CPUID.(EAX=07H, ECX=0):EBX.AVX512VL [bit 31] */ __asm__ __volatile__("cpuid" : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) : "a"(7), "c"(0)); - is_avx512f_enabled = ((reg[1] & (1 << 16)) >> 16); + is_avx512f_enabled = + ((reg[1] & (1 << 16)) >> 16) & ((reg[1] & (1 << 30)) >> 30) & ((reg[1] & (1 << 31)) >> 31); if (is_avx512f_enabled) result.insert(ccl_fp16_avx512f); diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp index e48540ebb..2d86bfe6e 100644 --- a/src/exec/exec.cpp +++ b/src/exec/exec.cpp @@ -42,18 +42,22 @@ size_t ccl_executor::calculate_atl_ep_count(size_t worker_count) { atl_attr_t ccl_executor::generate_atl_attr(const ccl::env_data& env) { atl_attr_t attr; - - attr.ep_count = calculate_atl_ep_count(env.worker_count); - attr.enable_shm = env.enable_shm; + attr.in.enable_shm = env.enable_shm; /* TODO: executor may be destroyed before cached rma-based schedule made memory deregistration need to refactor global objects dependencies don't use ring_rma till that */ - attr.enable_rma = 0; // env.enable_rma; - attr.sync_coll = env.sync_coll; - attr.extra_ep = env.extra_ep; + attr.in.enable_rma = 0; // env.enable_rma; + attr.in.enable_device_buf = env.enable_device_buf; + attr.in.enable_sync_coll = env.enable_sync_coll; + attr.in.enable_extra_ep = env.enable_extra_ep; + attr.in.ep_count = calculate_atl_ep_count(env.worker_count); + attr.in.mnic_type = env.mnic_type; + attr.in.mnic_count = env.mnic_count; + + memset(&attr.out, 0, sizeof(attr.out)); return attr; } @@ -147,9 +151,13 @@ ccl_executor::~ccl_executor() { } else LOG_DEBUG("stopped worker # ", idx); + } - workers[idx].reset(); + while (!workers[idx]->can_reset()) { + ccl_yield(ccl::global_data::env().yield_type); } + + workers[idx].reset(); } } diff --git a/src/exec/thread/base_thread.hpp b/src/exec/thread/base_thread.hpp index 097ee8a22..85e4c7f65 100644 --- a/src/exec/thread/base_thread.hpp +++ b/src/exec/thread/base_thread.hpp @@ -47,6 +47,10 @@ class ccl_base_thread { ccl::status start(int affinity); ccl::status stop(); + virtual bool can_reset() { + return true; + } + size_t get_idx() { return idx; } diff --git a/src/exec/thread/service_worker.cpp b/src/exec/thread/service_worker.cpp index 5e35c9bd3..4d6aae70d 100644 --- a/src/exec/thread/service_worker.cpp +++ b/src/exec/thread/service_worker.cpp @@ -21,7 +21,21 @@ ccl_service_worker::ccl_service_worker(size_t idx, : ccl_worker(idx, std::move(data_queue)), fusion_manager(fusion_manager) {} +ccl_service_worker::~ccl_service_worker() { + fusion_manager.reset(); +} + ccl::status ccl_service_worker::do_work(size_t& processed_count) { fusion_manager.execute(); return ccl_worker::do_work(processed_count); } + +bool ccl_service_worker::can_reset() { + /* skip ATL processing since it may be already destroyed */ + /* make only local processing */ + process_atl = false; + + size_t processed_count; + do_work(processed_count); + return fusion_manager.can_reset(); +} diff --git 
a/src/exec/thread/service_worker.hpp b/src/exec/thread/service_worker.hpp index 3006e470c..a9267df29 100644 --- a/src/exec/thread/service_worker.hpp +++ b/src/exec/thread/service_worker.hpp @@ -24,9 +24,11 @@ class ccl_service_worker : public ccl_worker { ccl_service_worker(size_t idx, std::unique_ptr<ccl_sched_queue> data_queue, ccl_fusion_manager& fusion_manager); - ~ccl_service_worker() = default; + ~ccl_service_worker(); - ccl::status do_work(size_t& processed_count); + ccl::status do_work(size_t& processed_count) override; + + bool can_reset() override; private: ccl_fusion_manager& fusion_manager; diff --git a/src/exec/thread/worker.cpp b/src/exec/thread/worker.cpp index c90ef9aaa..25eea9ea1 100644 --- a/src/exec/thread/worker.cpp +++ b/src/exec/thread/worker.cpp @@ -28,6 +28,7 @@ ccl_worker::ccl_worker(size_t idx, std::unique_ptr<ccl_sched_queue> queue) : ccl_base_thread(idx, ccl_worker_func), should_lock(false), is_locked(false), + process_atl(true), strict_sched_queue(std::unique_ptr<ccl_strict_sched_queue>(new ccl_strict_sched_queue())), sched_queue(std::move(queue)) {} @@ -167,11 +168,13 @@ ccl::status ccl_worker::process_sched_bin(ccl_sched_bin* bin, size_t& completed_ /* ensure communication progress */ - for (size_t sched_idx = 0; sched_idx < 1 /*bin_size*/; sched_idx++) { - ccl_sched* sched = bin->get(sched_idx); - ccl_comm* comm = sched->coll_param.comm; - atl_status_t atl_status = comm->atl->atl_ep_poll(bin->get_atl_ep()); - CCL_THROW_IF_NOT(atl_status == ATL_STATUS_SUCCESS, "bad status ", atl_status); + if (process_atl) { + for (size_t sched_idx = 0; sched_idx < 1; sched_idx++) { + ccl_sched* sched = bin->get(sched_idx); + ccl_comm* comm = sched->coll_param.comm; + atl_status_t atl_status = comm->atl->atl_ep_poll(bin->get_atl_ep()); + CCL_THROW_IF_NOT(atl_status == ATL_STATUS_SUCCESS, "bad status ", atl_status); + } } // if (ccl::global_data::get().is_ft_enabled) { diff --git a/src/exec/thread/worker.hpp b/src/exec/thread/worker.hpp index 79c96532d..168f691ae 100644 --- a/src/exec/thread/worker.hpp +++ b/src/exec/thread/worker.hpp @@ -55,6 +55,7 @@ class ccl_worker : public ccl_base_thread { std::atomic<bool> should_lock; std::atomic<bool> is_locked; + bool process_atl; void update_wait_condition(ccl_base_thread::wait_data::update_type type, size_t delta); diff --git a/src/fusion/fusion.cpp b/src/fusion/fusion.cpp index 669b24085..7385f9982 100644 --- a/src/fusion/fusion.cpp +++ b/src/fusion/fusion.cpp @@ -123,8 +123,7 @@ ccl_fusion_manager::~ccl_fusion_manager() { ", overlapped_exec_calls ", stat_overlapped_exec_calls); - while (!tracked_scheds.empty()) - check_tracked_scheds(true); + reset(); CCL_ASSERT(postponed_queue.empty() && exec_queue.empty() && tracked_scheds.empty(), "queues are not empty, ", @@ -135,8 +134,24 @@ ccl_fusion_manager::~ccl_fusion_manager() { tracked_scheds.size()); } +bool ccl_fusion_manager::can_reset() { + check_tracked_scheds(true); + return tracked_scheds.empty(); +} + +void ccl_fusion_manager::reset() { + while (tracked_scheds.size()) + check_tracked_scheds(true); +} + bool ccl_fusion_manager::can_fuse(ccl_master_sched* sched) { + if (atl_wrapper::attr.out.enable_device_buf) { + /* TODO: implement fusion with D2D copies */ + return false; + } + size_t bytes = sched->coll_param.count * sched->coll_param.dtype.size(); + if (bytes >= bytes_threshold) { LOG_DEBUG("can't fuse due to size ", bytes, ", max ", bytes_threshold); return false; @@ -147,6 +162,11 @@ bool ccl_fusion_manager::can_fuse(ccl_master_sched* sched) { return false; } + if 
(sched->coll_param.deps.size()) { + LOG_DEBUG("can't fuse due to deps size ", sched->coll_param.deps.size()); + return false; + } + if (sched->coll_attr.prologue_fn || sched->coll_attr.epilogue_fn || sched->coll_attr.reduction_fn || sched->coll_attr.synchronous) { LOG_DEBUG("can't fuse due to unexpected fields in coll_attr"); @@ -165,8 +185,11 @@ bool ccl_fusion_manager::add(ccl_master_sched* sched) { CCL_THROW_IF_NOT(sched->is_completed(), "incorrect completion counter"); sched->set_counter(1); - std::lock_guard<ccl_fusion_lock_t> lock{ guard }; - postponed_queue.push_back(sched); + { + std::lock_guard<ccl_fusion_lock_t> lock{ guard }; + postponed_queue.push_back(sched); + } + return true; } @@ -223,6 +246,7 @@ ccl_master_sched* ccl_fusion_manager::build_sched() { coll_param.dtype = dtype; coll_param.reduction = reduction; coll_param.comm = comm; + coll_param.stream = nullptr; sched = new ccl_master_sched(coll_param); sched->internal_type = ccl_sched_internal_fusion; } break; @@ -269,7 +293,11 @@ ccl_master_sched* ccl_fusion_manager::build_sched() { CCL_THROW_IF_NOT(sched); - tracked_scheds.push_back(sched); + { + std::lock_guard<ccl_fusion_lock_t> lock{ guard }; + tracked_scheds.push_back(sched); + } + sched->coll_attr.priority = max_priority; sched->coll_attr.to_cache = use_cache; @@ -312,9 +340,10 @@ ccl_master_sched* ccl_fusion_manager::build_sched() { size_t global_copy_idx = idx * copies_per_part + copy_idx; #ifdef CCL_ENABLE_SYCL if (stream && stream->is_sycl_device_stream()) - entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>( + entry_factory::make_entry<sycl_copy_entry>( part_scheds[idx].get(), - ccl_buffer(&(exec_queue[global_copy_idx]->coll_param.sycl_send_buf), + copy_direction::d2h, + ccl_buffer(&(exec_queue[global_copy_idx]->coll_param.device_send_buf), exec_queue[global_copy_idx]->coll_param.count * dtype_size, ccl_buffer_type::INDIRECT), ccl_buffer(fusion_buf, buf_cache.get_buf_size(), offset), @@ -349,10 +378,11 @@ ccl_master_sched* ccl_fusion_manager::build_sched() { size_t global_copy_idx = idx * copies_per_part + copy_idx; #ifdef CCL_ENABLE_SYCL if (stream && stream->is_sycl_device_stream()) - entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>( + entry_factory::make_entry<sycl_copy_entry>( part_scheds[idx].get(), + copy_direction::h2d, ccl_buffer(fusion_buf, buf_cache.get_buf_size(), offset), - ccl_buffer(&(exec_queue[global_copy_idx]->coll_param.sycl_recv_buf), + ccl_buffer(&(exec_queue[global_copy_idx]->coll_param.device_recv_buf), exec_queue[global_copy_idx]->coll_param.count * dtype_size, ccl_buffer_type::INDIRECT), exec_queue[global_copy_idx]->coll_param.count, @@ -369,6 +399,8 @@ ccl_master_sched* ccl_fusion_manager::build_sched() { exec_queue[global_copy_idx]->coll_param.count, dtype); + part_scheds[idx]->add_barrier(); + offset += exec_queue[global_copy_idx]->coll_param.count * dtype_size; entry_factory::make_entry<function_entry>( part_scheds[idx].get(), complete_user_request, exec_queue[global_copy_idx]); @@ -377,11 +409,12 @@ ccl_master_sched* ccl_fusion_manager::build_sched() { } } + sched->sync_partial_scheds(); + if (use_cache) { part_scheds[0]->set_finalize_fn(release_fusion_buf_for_cached_sched, fusion_buf); } else { - sched->sync_partial_scheds(); entry_factory::make_entry<function_entry>( part_scheds[0].get(), release_fusion_buf, fusion_buf); } @@ -412,6 +445,7 @@ void ccl_fusion_manager::execute() { } } } + /* separate block to reduce lock scope */ { std::lock_guard<ccl_fusion_lock_t> lock{ guard }; @@ -490,6 
+524,7 @@ void ccl_fusion_manager::clear_exec_queue() { } void ccl_fusion_manager::check_tracked_scheds(bool force_release) { + std::lock_guard<ccl_fusion_lock_t> lock{ guard }; for (auto it = tracked_scheds.begin(); it != tracked_scheds.end();) { ccl_master_sched* sched = *it; if (sched->is_completed() && (!sched->coll_attr.to_cache || force_release)) { diff --git a/src/fusion/fusion.hpp b/src/fusion/fusion.hpp index ed5dccb24..ff102ee5b 100644 --- a/src/fusion/fusion.hpp +++ b/src/fusion/fusion.hpp @@ -60,6 +60,8 @@ class ccl_fusion_manager { ccl_fusion_manager(const ccl_fusion_manager& other) = delete; ccl_fusion_manager& operator=(const ccl_fusion_manager& other) = delete; + void reset(); + bool can_reset(); bool can_fuse(ccl_master_sched* sched); bool add(ccl_master_sched* sched); void execute(); diff --git a/src/hwloc/hwloc_wrapper.c b/src/hwloc/hwloc_wrapper.c new file mode 100644 index 000000000..5b0601743 --- /dev/null +++ b/src/hwloc/hwloc_wrapper.c @@ -0,0 +1,93 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#include "hwloc_wrapper.h" + +static hwloc_info_t hwloc_info = { .initialized = 0 }; + +hwloc_status_t hwloc_init() { + hwloc_status_t ret = HWLOC_SUCCESS; + + hwloc_info.initialized = 0; + hwloc_info.bindset = hwloc_bitmap_alloc(); + + if (hwloc_topology_init(&hwloc_info.topology) < 0) { + printf("hwloc_topology_init failed (%s)\n", strerror(errno)); + goto err; + } + + hwloc_topology_set_io_types_filter(hwloc_info.topology, HWLOC_TYPE_FILTER_KEEP_ALL); + + if (hwloc_topology_load(hwloc_info.topology) < 0) { + printf("hwloc_topology_load failed (%s)\n", strerror(errno)); + goto err; + } + + if (hwloc_get_proc_cpubind( + hwloc_info.topology, getpid(), hwloc_info.bindset, HWLOC_CPUBIND_PROCESS) < 0) { + printf("hwloc_get_proc_cpubind failed (%s)\n", strerror(errno)); + goto err; + } + + hwloc_info.initialized = 1; + + return ret; + +err: + return HWLOC_FAILURE; +} + +hwloc_status_t hwloc_finalize() { + hwloc_status_t ret = HWLOC_SUCCESS; + + hwloc_topology_destroy(hwloc_info.topology); + hwloc_bitmap_free(hwloc_info.bindset); + hwloc_info.initialized = 0; + + return ret; +} + +int hwloc_is_initialized() { + return hwloc_info.initialized; +} + +static hwloc_obj_t hwloc_get_first_non_io_obj_by_pci(int domain, int bus, int dev, int func) { + hwloc_obj_t io_device = hwloc_get_pcidev_by_busid(hwloc_info.topology, domain, bus, dev, func); + HWLOC_ASSERT(io_device, + "failed to get PCI device with domain %d, bus %d, dev %d, func %d", + domain, + bus, + dev, + func); + hwloc_obj_t first_non_io = hwloc_get_non_io_ancestor_obj(hwloc_info.topology, io_device); + HWLOC_ASSERT(first_non_io, "failed to get ancestor of PCI device"); + return first_non_io; +} + +int hwloc_is_dev_close_by_pci(int domain, int bus, int dev, int func) { + int is_close = 0; + + if (!hwloc_is_initialized()) + return is_close; + + hwloc_obj_t first_non_io = hwloc_get_first_non_io_obj_by_pci(domain, bus, dev, func); + + /* determine if PCI device is "close" to process by checking 
if process's affinity is included + * in PCI device's affinity or if PCI device's affinity is included in process's affinity */ + is_close = (hwloc_bitmap_isincluded(hwloc_info.bindset, first_non_io->cpuset) || + hwloc_bitmap_isincluded(first_non_io->cpuset, hwloc_info.bindset)); + + return is_close; +} diff --git a/src/hwloc/hwloc_wrapper.h b/src/hwloc/hwloc_wrapper.h new file mode 100644 index 000000000..7b7ff7b9d --- /dev/null +++ b/src/hwloc/hwloc_wrapper.h @@ -0,0 +1,73 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef HWLOC_WRAPPER_H +#define HWLOC_WRAPPER_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "hwloc.h" +#include <sys/syscall.h> + +#define GETTID() syscall(SYS_gettid) + +#define HWLOC_ASSERT(cond, fmt, ...) \ + do { \ + if (!(cond)) { \ + fprintf(stderr, \ + "(%ld): %s:%s:%d: ASSERT '%s' FAILED: " fmt "\n", \ + GETTID(), \ + __FILE__, \ + __FUNCTION__, \ + __LINE__, \ + #cond, \ + ##__VA_ARGS__); \ + fflush(stderr); \ + } \ + } while (0) + +typedef enum { HWLOC_SUCCESS, HWLOC_FAILURE, HWLOC_UNSUPPORTED } hwloc_status_t; + +inline const char* hwloc_status_to_str(hwloc_status_t status) { + switch (status) { + case HWLOC_SUCCESS: return "SUCCESS"; + case HWLOC_FAILURE: return "FAILURE"; + case HWLOC_UNSUPPORTED: return "UNSUPPORTED"; + default: return "UNKNOWN"; + } +} + +typedef struct { + hwloc_topology_t topology; + hwloc_cpuset_t bindset; + int initialized; +} hwloc_info_t; + +hwloc_status_t hwloc_init(); +hwloc_status_t hwloc_finalize(); +int hwloc_is_initialized(); + +/* + * return true if pci device is close to this process + */ +int hwloc_is_dev_close_by_pci(int domain, int bus, int dev, int func); + +#ifdef __cplusplus +} +#endif + +#endif /* HWLOC_WRAPPER_H */ diff --git a/src/kernels/a2a_helpers.h b/src/kernels/a2a_helpers.h new file mode 100644 index 000000000..10c44e398 --- /dev/null +++ b/src/kernels/a2a_helpers.h @@ -0,0 +1,38 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#pragma once + +#include "common.h" + +#define DEFINE_A2A_COMM_DATA(NAME, T) \ + typedef struct __attribute__((packed)) a2a_gpu_comm_data_##NAME { \ + __global T* recv_buf; \ + __global sync_flag_type* ready_to_receive_flag; \ + __global sync_flag_type* data_sent_flag; \ + } a2a_gpu_comm_data_##NAME; + +DEFINE_A2A_COMM_DATA(int8, int8_t) +DEFINE_A2A_COMM_DATA(uint8, uint8_t) +DEFINE_A2A_COMM_DATA(int16, int16_t) +DEFINE_A2A_COMM_DATA(uint16, uint16_t) +DEFINE_A2A_COMM_DATA(int32, int32_t) +DEFINE_A2A_COMM_DATA(uint32, uint32_t) +DEFINE_A2A_COMM_DATA(int64, int64_t) +DEFINE_A2A_COMM_DATA(uint64, uint64_t) +//DEFINE_A2A_COMM_DATA(float16, half) +DEFINE_A2A_COMM_DATA(float32, float) +DEFINE_A2A_COMM_DATA(float64, double) +DEFINE_A2A_COMM_DATA(bfloat16, uint16_t) diff --git a/src/kernels/common.h b/src/kernels/common.h new file mode 100644 index 000000000..493e2c5c1 --- /dev/null +++ b/src/kernels/common.h @@ -0,0 +1,287 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#pragma once + +#ifdef HOST_CTX + +#define __global +using namespace ccl; +#include <cstdint> + +#ifdef ENABLE_KERNEL_ATOMICS +// type for sync flags for atomics support +typedef atomic_int sync_flag_type; +#else +// default type for sync flags +typedef volatile int sync_flag_type; +#endif /* ENABLE_KERNEL_ATOMICS */ + +#else /* HOST_CTX */ + +#pragma OPENCL EXTENSION cl_intel_subgroups : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable + +#include "lp.h" + +#define FORMAT_int8_t "%hhd" +#define FORMAT_int16_t "%d" +#define FORMAT_int32_t "%d" +#define FORMAT_int64_t "%ld" + +#define FORMAT_uint8_t "%hhu" +#define FORMAT_uint16_t "%u" +#define FORMAT_uint32_t "%u" +#define FORMAT_uint64_t "%lu" + +#define FORMAT_float "%f" +#define FORMAT_double "%f" + +#define FORMAT_ushort "%u" +#define FORMAT_half "%f" + +#define FORMAT_4(format) #format ", " #format ", " #format ", " #format +#define FORMAT_char4 FORMAT_4(% hhd) +#define FORMAT_uchar4 FORMAT_4(% hhu) +#define FORMAT_short4 FORMAT_4(% d) +#define FORMAT_ushort4 FORMAT_4(% u) +#define FORMAT_int4 FORMAT_4(% d) +#define FORMAT_uint4 FORMAT_4(% u) +#define FORMAT_long4 FORMAT_4(% ld) +#define FORMAT_ulong4 FORMAT_4(% lu) +#define FORMAT_float4 FORMAT_4(% f) +#define FORMAT_double4 FORMAT_4(% f) + +#define ELEMENTS_1(X) X +#define ELEMENTS_4(X) (X)[0], (X)[1], (X)[2], (X)[3] + +// define aliases for OpenCL types +typedef char int8_t; +typedef uchar uint8_t; +typedef short int16_t; +typedef ushort uint16_t; +typedef int int32_t; +typedef uint uint32_t; +typedef long int64_t; +typedef ulong uint64_t; +typedef half float16_t; +typedef float float32_t; +typedef double float64_t; +typedef ushort bfloat16; + +#define DEFINE_SUM_OP(T) \ + T __sum_##T(T lhs, T rhs) { \ + return lhs + rhs; \ + } + +#define DEFINE_PROD_OP(T) \ + T __prod_##T(T lhs, T rhs) { \ + return lhs * rhs; \ + } + +#define DEFINE_MIN_OP(T) \ + T __min_##T(T lhs, T rhs) { \ + return min(lhs, rhs); \ + } + +#define DEFINE_MAX_OP(T) \ + T __max_##T(T lhs, T rhs) { \ + 
return max(lhs, rhs); \ + } + +#ifdef ENABLE_KERNEL_DEBUG +#define DEBUG_BLOCK(block) block +#else +#define DEBUG_BLOCK(block) +#endif + +#ifdef ENABLE_KERNEL_DEBUG +#define LOG_INPUT_DATA_START(rank) printf("kernel %d, wait income data\n", rank) +#define LOG_INPUT_DATA_END(rank) printf("kernel %d, received data\n", rank) +#define LOG_OUTGOING_DATA_START(rank) printf("kernel %d, wait signal to send\n", rank) +#define LOG_OUTGOING_DATA_END(rank) printf("kernel %d, received signal to send\n", rank) +#define LOG_SEND_PROGRESS(rank, thread_id, flag, desired) \ + printf("kernel %d.%d, send %d/%d\n", rank, thread_id, flag, desired) +#define LOG_BARRIER_PASSED(rank, thread_id) \ + printf("kernel %d.%d barrier passed\n", rank, thread_id); +#define LOG_IN_BARRIER(rank, thread_id, flag, desired) \ + printf("kernel %d.%d barrier %d/%d\n", rank, thread_id, flag, desired); +#else /* ENABLE_KERNEL_DEBUG */ +#define LOG_INPUT_DATA_START(rank) +#define LOG_INPUT_DATA_END(rank) +#define LOG_OUTGOING_DATA_START(rank) +#define LOG_OUTGOING_DATA_END(rank) +#define LOG_BARRIER_PASSED(rank, thread_id) +#define LOG_IN_BARRIER(rank, thread_id, flag, desired) +#endif /* ENABLE_KERNEL_DEBUG */ + +#define SWAP_VARIABLES(var1, var2, type) \ + do { \ + type tmp; \ + tmp = var1; \ + var1 = var2; \ + var2 = tmp; \ + } while (0); + +int get_left_rank(int rank, int comm_size) { + return rank == 0 ? comm_size - 1 : rank - 1; +} + +int get_right_rank(int rank, int comm_size) { + return rank == (comm_size - 1) ? 0 : rank + 1; +} + +#ifdef ENABLE_KERNEL_ATOMICS + +// type for sync flags for atomics support +typedef atomic_int sync_flag_type; + +#define PUT_READY_TO_RECEIVE(_sync_flag) \ + if (thread_id == 0) { \ + atomic_fetch_add_explicit( \ + _sync_flag, 1, memory_order_seq_cst, memory_scope_all_svm_devices); \ + } + +#define I_SENT(_sync_flag) \ + if (thread_id == 0) { \ + atomic_fetch_add_explicit( \ + _sync_flag, 1, memory_order_seq_cst, memory_scope_all_svm_devices); \ + } + +#define WAIT_INPUT_DATA(_sync_flag, _desired) \ + if (thread_id == 0) { \ + LOG_INPUT_DATA_START(my_rank); \ + while (1) { \ + int _old_value = atomic_load_explicit( \ + _sync_flag, memory_order_seq_cst, memory_scope_all_svm_devices); \ + if (_old_value == _desired) { \ + LOG_INPUT_DATA_END(my_rank); \ + ++_desired; \ + break; \ + } \ + } \ + } + +#define WAIT_SIGNAL_TO_SEND(_sync_flag, _desired) \ + if (thread_id == 0) { \ + LOG_OUTGOING_DATA_START(my_rank); \ + while (_desired != atomic_load_explicit( \ + _sync_flag, memory_order_seq_cst, memory_scope_all_svm_devices)) { \ + } \ + LOG_OUTGOING_DATA_END(my_rank); \ + ++_desired; \ + } + +#define SET_PROXY_SIZE(_sync_flag, size) \ + if (thread_id == 0) { \ + atomic_store_explicit( \ + _sync_flag, size, memory_order_seq_cst, memory_scope_all_svm_devices); \ + } + +#define GET_PROXY_SIZE(_sync_flag, size) \ + size = atomic_load_explicit(_sync_flag, memory_order_seq_cst, memory_scope_all_svm_devices); + +#else /* ENABLE_KERNEL_ATOMICS */ + +// default type for sync flags +typedef volatile int sync_flag_type; + +#define PUT_READY_TO_RECEIVE(_sync_flag) \ + if (thread_id == 0) { \ + (*_sync_flag)++; \ + } + +#define I_SENT(_sync_flag) \ + if (thread_id == 0) { \ + (*_sync_flag)++; \ + } + +#define WAIT_INPUT_DATA(_sync_flag, _desired) \ + if (thread_id == 0) { \ + LOG_INPUT_DATA_START(my_rank); \ + while (1) { \ + if (*_sync_flag == _desired) { \ + LOG_INPUT_DATA_END(my_rank); \ + ++_desired; \ + break; \ + } \ + } \ + } + +#define WAIT_SIGNAL_TO_SEND(_sync_flag, _desired) \ + if (thread_id == 0) { \ + 
LOG_OUTGOING_DATA_START(my_rank); \ + while (_desired != *_sync_flag) { \ + }; \ + LOG_OUTGOING_DATA_END(my_rank); \ + ++_desired; \ + } + +#define SET_PROXY_SIZE(_sync_flag, size) \ + if (thread_id == 0) { \ + *_sync_flag = size; \ + } + +#define GET_PROXY_SIZE(_sync_flag, size) size = *_sync_flag; + +#endif /* ENABLE_KERNEL_ATOMICS */ + +/* +#define KERNEL_BARRIER(_barrier_flag, _desired, _increment) \ + do \ + { \ + int _barrier_value = atomic_add(_barrier_flag, 0); \ + atomic_inc(_barrier_flag); \ + int _old_value = _barrier_value; \ + while(1) \ + { \ + / *thread that last reached the barrier will reset it \ + other threads may expect to receive _desired value while it can be 0 \ + check if received value is less than initially received* / \ + if(_old_value == _desired || _old_value < _barrier_value) \ + { \ + BARRIER_PASSED(my_rank, thread_id); \ + break; \ + } \ + IN_BARRIER(my_rank, thread_id, _old_value, _desired); \ + _old_value = atomic_add(_barrier_flag, 0); \ + } \ + } while (0); +*/ + +/* for A2A */ +/*#define WAIT_INPUT_DATA(_sync_flag, _desired) \ + if (local_thread_id == 0) { \ + LOG_INPUT_DATA_START(rank_id); \ + while (1) { \ + int _old_value = atomic_cmpxchg(_sync_flag, _desired, _desired); \ + if (_old_value == _desired) { \ + LOG_INPUT_DATA_END(rank_id); \ + _desired += 1 + comm_size; \ + break; \ + } \ + } \ + } + +#define WAIT_SIGNAL_TO_SEND(_sync_flag, _desired) \ + if (local_thread_id == 0) { \ + LOG_OUTGOING_DATA_START(rank_id); \ + while (_desired != atomic_cmpxchg(_sync_flag, _desired, _desired)) { \ + }; \ + LOG_OUTGOING_DATA_END(rank_id); \ + _desired += comm_size; \ + }*/ + +#endif /* HOST_CTX */ diff --git a/src/kernels/event_declaration.h b/src/kernels/event_declaration.h new file mode 100644 index 000000000..b468d8a76 --- /dev/null +++ b/src/kernels/event_declaration.h @@ -0,0 +1,41 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifdef HOST_CTX +#define __global + +#include <memory> +using namespace ccl; + +template <class native_type> +struct shared_event_traits {}; + +#else +typedef ushort bfloat16; +#endif + +typedef struct __attribute__((packed)) shared_event_float { + __global int* produced_bytes; + __global float* mem_chunk; +} shared_event_float; + +#ifdef HOST_CTX + +template <> +struct shared_event_traits<float> { + using impl_t = shared_event_float; +}; + +#endif diff --git a/src/kernels/lp.h b/src/kernels/lp.h new file mode 100644 index 000000000..e28ea1d13 --- /dev/null +++ b/src/kernels/lp.h @@ -0,0 +1,162 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef CCL_BF16_GPU_TRUNCATE
+float __bf16_to_fp32(ushort V) {
+    uint temp = convert_uint(V) << 16;
+    return as_float(temp);
+}
+
+ushort __fp32_to_bf16(float V) {
+    ushort2 temp = as_ushort2(V);
+    return temp.s1;
+}
+#else /* CCL_BF16_GPU_TRUNCATE */
+#include "rne.h"
+#endif /* CCL_BF16_GPU_TRUNCATE */
+
+#define DEFINE_BF16SUM_OP(T) \
+    T __bf16_sum_##T(T lhs, T rhs) { \
+        return __fp32_to_bf16(__bf16_to_fp32(lhs) + __bf16_to_fp32(rhs)); \
+    }
+
+#define DEFINE_BF16PROD_OP(T) \
+    T __bf16_prod_##T(T lhs, T rhs) { \
+        return __fp32_to_bf16(__bf16_to_fp32(lhs) * __bf16_to_fp32(rhs)); \
+    }
+
+#define DEFINE_BF16MIN_OP(T) \
+    T __bf16_min_##T(T lhs, T rhs) { \
+        return __fp32_to_bf16(min(__bf16_to_fp32(lhs), __bf16_to_fp32(rhs))); \
+    }
+
+#define DEFINE_BF16MAX_OP(T) \
+    T __bf16_max_##T(T lhs, T rhs) { \
+        return __fp32_to_bf16(max(__bf16_to_fp32(lhs), __bf16_to_fp32(rhs))); \
+    }
+
+#ifdef CCL_FP16_GPU_TRUNCATE
+/*
+Truncation routines for converting fp32 <-> fp16
+
+fp16 has 1 sign bit, 5 exponent bits and 10 significand bits with exponent
+offset 15 - https://en.wikipedia.org/wiki/Half-precision_floating-point_format
+
+For fp16 -> fp32
+
+The sign & significand bits are unchanged, but the exponent must be properly
+re-offset (i.e. convert the fp16 offset -> fp32 offset). Care must also be taken
+to saturate the fp32 result if the fp16 result is saturated. Denormals must be
+flushed to 0.
+
+For fp32 -> fp16
+
+Similar to fp16 -> fp32 except that the exponent must be checked for saturation
+since the range of the exponent is significantly smaller than that of fp32.
+*/
+float __fp16_to_fp32(half V) {
+    uint ans_bits = 0;
+    uint exp_bits = as_ushort(V) & 0x7C00;
+    uint significand_bits = as_ushort(V) & 0x03FF;
+    if (exp_bits == 0x7C00) {
+        ans_bits = ((as_ushort(V) & 0x8000) << 16) | 0x7F800000 | (significand_bits << 13);
+    }
+    else if (exp_bits == 0x0000) {
+        if (significand_bits != 0x00000000) {
+            ans_bits = ((as_ushort(V) & 0x8000) << 16);
+        }
+        else {
+            ans_bits = ((as_ushort(V) & 0x8000) << 16) | (significand_bits << 13);
+        }
+    }
+    else {
+        ans_bits = ((as_ushort(V) & 0x8000) << 16) | ((exp_bits + 0x1C000) << 13) |
+                   (significand_bits << 13);
+    }
+    return as_float(ans_bits);
+}
+
+half __fp32_to_fp16(float V) {
+    ushort ans;
+    uint exp_bits = (as_uint(V) & 0x7F800000);
+    uint significand_bits = (as_uint(V) & 0x007FFFFF);
+    if (exp_bits == 0x00000000) {
+        ans = (as_uint(V) & 0x80000000) >> 16;
+    }
+    else if (exp_bits == 0x7F800000) {
+        if (significand_bits != 0) {
+            ans = ((as_uint(V) & 0x80000000) >> 16) | 0x00007C01;
+        }
+        else {
+            ans = ((as_uint(V) & 0x80000000) >> 16) | 0x00007C00;
+        }
+    }
+    else if (exp_bits < 0x38800000) {
+        ans = 0xFC00;
+    }
+    else if (exp_bits > 0x47000000) {
+        ans = 0x7C00;
+    }
+    else {
+        ans = ((as_uint(V) & 0x80000000) >> 16) |
+              ((((as_uint(V) & 0x7F800000) >> 23) - 112) << 10) | ((as_uint(V) & 0x007FFFFF) >> 13);
+    }
+    return as_half(ans);
+}
+
+#define DEFINE_FP16SUM_OP(T) \
+    T __sum_##T(T lhs, T rhs) { \
+        return __fp32_to_fp16(__fp16_to_fp32(lhs) + __fp16_to_fp32(rhs)); \
+    }
+
+#define DEFINE_FP16PROD_OP(T) \
+    T __prod_##T(T lhs, T rhs) { \
+        return __fp32_to_fp16(__fp16_to_fp32(lhs) * __fp16_to_fp32(rhs)); \
+    }
+
+#define DEFINE_FP16MIN_OP(T) \
+    T __min_##T(T lhs, T rhs) { \
+        return __fp32_to_fp16(min(__fp16_to_fp32(lhs), __fp16_to_fp32(rhs))); \
+    }
+
+#define
DEFINE_FP16MAX_OP(T) \ + T __max_##T(T lhs, T rhs) { \ + return __fp32_to_fp16(max(__fp16_to_fp32(lhs), __fp16_to_fp32(rhs))); \ + } +#else /* CCL_FP16_GPU_TRUNCATE */ +#define DEFINE_FP16SUM_OP(T) \ + T __sum_##T(T lhs, T rhs) { \ + return lhs + rhs; \ + } + +#define DEFINE_FP16PROD_OP(T) \ + T __prod_##T(T lhs, T rhs) { \ + return lhs * rhs; \ + } + +#define DEFINE_FP16MIN_OP(T) \ + T __min_##T(T lhs, T rhs) { \ + return min(lhs, rhs); \ + } + +#define DEFINE_FP16MAX_OP(T) \ + T __max_##T(T lhs, T rhs) { \ + return max(lhs, rhs); \ + } +#endif /* CCL_FP16_GPU_TRUNCATE */ diff --git a/src/kernels/rne.h b/src/kernels/rne.h new file mode 100644 index 000000000..47ca9bf78 --- /dev/null +++ b/src/kernels/rne.h @@ -0,0 +1,51 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef RNE_H +#define RNE_H + +// bf <--> float conversion +// bf : no igc type for bf yet. Use short as *opaque* type for it. +// +// float -> bf conversion builtins (rte rounding mode) +short __builtin_IB_ftobf_1(float a) __attribute__((const)); +short2 __builtin_IB_ftobf_2(float2 a) __attribute__((const)); +short4 __builtin_IB_ftobf_4(float4 a) __attribute__((const)); +short8 __builtin_IB_ftobf_8(float8 a) __attribute__((const)); +short16 __builtin_IB_ftobf_16(float16 a) __attribute__((const)); + +// bf -> float conversion builtins (precise conversion) +float __builtin_IB_bftof_1(short a) __attribute__((const)); +float2 __builtin_IB_bftof_2(short2 a) __attribute__((const)); +float4 __builtin_IB_bftof_4(short4 a) __attribute__((const)); +float8 __builtin_IB_bftof_8(short8 a) __attribute__((const)); +float16 __builtin_IB_bftof_16(short16 a) __attribute__((const)); + +// 2 floats --> packed 2 bf (rte rounding mode) +int __builtin_IB_2fto2bf_1(float a, float b) __attribute__((const)); +int2 __builtin_IB_2fto2bf_2(float2 a, float2 b) __attribute__((const)); +int4 __builtin_IB_2fto2bf_4(float4 a, float4 b) __attribute__((const)); +int8 __builtin_IB_2fto2bf_8(float8 a, float8 b) __attribute__((const)); +int16 __builtin_IB_2fto2bf_16(float16 a, float16 b) __attribute__((const)); + +float __bf16_to_fp32(ushort V) { + return __builtin_IB_bftof_1(as_short(V)); +} + +ushort __fp32_to_bf16(float V) { + return as_ushort(__builtin_IB_ftobf_1(V)); +} + +#endif /* RNE_H */ diff --git a/src/kernels/shared.h b/src/kernels/shared.h new file mode 100644 index 000000000..3dce51e12 --- /dev/null +++ b/src/kernels/shared.h @@ -0,0 +1,71 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#ifndef SHARED_H
+#define SHARED_H
+
+// Defines values and functions shared by both host and device
+// The VEC_SIZE constants are defined as macros since the values are used by the kernel code at compile time
+
+// Allgatherv
+
+#define RING_ALLGATHERV_VEC_SIZE 1
+
+// Allreduce
+#define RING_ALLREDUCE_VEC_SIZE 1
+
+static inline size_t ring_allreduce_get_segment_size(size_t elems_count, size_t comm_size) {
+    elems_count /= RING_ALLREDUCE_VEC_SIZE;
+    return (elems_count + comm_size - 1) / comm_size;
+}
+
+static inline size_t ring_allreduce_get_tmp_buffer_size(size_t elems_count, size_t comm_size) {
+    // The algorithm uses at most 2 * segment_size elements of tmp_buffer in order to store
+    // temporary data
+    return 2 * ring_allreduce_get_segment_size(elems_count, comm_size);
+}
+
+// Bcast
+
+#define RING_BCAST_VEC_SIZE 1
+
+// Reduce
+
+#define RING_REDUCE_VEC_SIZE 1
+
+static inline size_t ring_reduce_tmp_buffer_size(size_t elems_count, size_t comm_size) {
+    (void)comm_size;
+    return elems_count;
+}
+
+// Reduce-scatter
+
+#define RING_REDUCE_SCATTER_VEC_SIZE 1
+
+static inline size_t ring_reduce_scatter_get_segment_size(size_t recv_count, size_t comm_size) {
+    (void)comm_size; // C disallows unnamed parameters in a function signature, so use a named one and simply suppress it
+    // Our segment size is simply the recv_count parameter, adjusted for the vector size
+    recv_count /= RING_REDUCE_SCATTER_VEC_SIZE;
+    return recv_count;
+}
+
+static inline size_t ring_reduce_scatter_tmp_buffer_size(size_t elems_count, size_t comm_size) {
+    // The algorithm uses at most 2 * segment_size elements of tmp_buffer in order to store
+    // temporary data
+    return 2 * ring_reduce_scatter_get_segment_size(elems_count, comm_size);
+}
+
+#endif /* SHARED_H */
diff --git a/src/native_device_api/interop_utils.cpp b/src/native_device_api/interop_utils.cpp
index 350494f2a..f76230dee 100644
--- a/src/native_device_api/interop_utils.cpp
+++ b/src/native_device_api/interop_utils.cpp
@@ -138,10 +138,10 @@ assoc_result check_assoc_device_memory(const void* mem,
 #ifdef CCL_ENABLE_SYCL
-    cl::sycl::usm::alloc pointer_type = cl::sycl::get_pointer_type(mem, ctx);
+    sycl::usm::alloc pointer_type = sycl::get_pointer_type(mem, ctx);
     using usm_truth_table =
-        std::array<usm_support_mode, utils::enum_to_underlying(cl::sycl::usm::alloc::unknown) + 1>;
+        std::array<usm_support_mode, utils::enum_to_underlying(sycl::usm::alloc::unknown) + 1>;
     constexpr int platform_config_count = 4; /*host, cpu, gpu, accel*/
     constexpr std::array<usm_truth_table, platform_config_count> usm_target_table{ {
@@ -177,8 +177,8 @@ assoc_result check_assoc_device_memory(const void* mem,
     if (std::get<assoc_result_index::SUPPORT_MODE>(ret) == usm_support_mode::prohibited) {
         std::stringstream ss;
-        ss << "Incompatible USM type requested: " << usm_to_string(pointer_type)
-           << ", for ccl_device: " << std::to_string(platform_type_index);
+        ss << "incompatible usm type requested: " << usm_to_string(pointer_type)
+           << " for device: " << std::to_string(platform_type_index);
         std::get<assoc_result_index::ERROR_CAUSE>(ret) = ss.str();
     }
 #else
diff --git a/src/native_device_api/l0/context.cpp b/src/native_device_api/l0/context.cpp
index b0f03d419..d2f38b85f 100644
--- a/src/native_device_api/l0/context.cpp
+++ b/src/native_device_api/l0/context.cpp
@@ -17,6 +17,7 @@
 #include "oneapi/ccl/native_device_api/l0/context.hpp"
 #include "oneapi/ccl/native_device_api/l0/base_impl.hpp"
 #include "oneapi/ccl/native_device_api/l0/device.hpp"
+#include "oneapi/ccl/native_device_api/l0/event_pool.hpp"
 #include "oneapi/ccl/native_device_api/l0/primitives_impl.hpp"
 #include "oneapi/ccl/native_device_api/l0/driver.hpp"
 #include "oneapi/ccl/native_device_api/l0/platform.hpp"
@@ -27,6 +28,8 @@ namespace native {
 ccl_context::ccl_context(handle_t h, owner_ptr_t&& platform)
         : base(h, std::move(platform), std::weak_ptr<ccl_context>{}) {}
 
+ccl_context::~ccl_context() {}
+
 CCL_BE_API const ze_host_mem_alloc_desc_t& ccl_context::get_default_host_alloc_desc() {
     static const ze_host_mem_alloc_desc_t common{
         .stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
@@ -57,6 +60,52 @@ CCL_BE_API void ccl_context::host_free_memory(void* mem_handle) {
     }
 }
 
+CCL_BE_API ccl_context::ccl_event_pool_ptr ccl_context::create_event_pool(
+    std::initializer_list<ccl_device*> devices,
+    const ze_event_pool_desc_t& descr) {
+    if (!pool_holder) {
+        pool_holder.reset(new ccl_event_pool_holder);
+    }
+
+    ze_event_pool_handle_t pool = nullptr;
+
+    // build the handle list with push_back into an empty vector; size-constructing it
+    // first would leave devices.size() null handles in front of the real ones
+    std::vector<ccl_device::handle_t> device_handles;
+    device_handles.reserve(devices.size());
+    for (ccl_device* d : devices) {
+        device_handles.push_back(d->get());
+    }
+    ze_result_t status =
+        zeEventPoolCreate(get(),
+                          &descr,
+                          devices.size(),
+                          (device_handles.empty() ? nullptr : device_handles.data()),
+                          &pool);
+    if (status != ZE_RESULT_SUCCESS) {
+        CCL_THROW("zeEventPoolCreate, error: " + native::to_string(status));
+    }
+
+    std::shared_ptr<ccl_event_pool> pool_ptr =
+        std::make_shared<ccl_event_pool>(descr, pool, pool_holder, get_ptr());
+    return pool_holder->emplace(devices, pool_ptr);
+}
+
+CCL_BE_API std::vector<std::shared_ptr<ccl_event_pool>> ccl_context::get_shared_event_pool(
+    std::initializer_list<ccl_device*> devices) {
+    std::vector<std::shared_ptr<ccl_event_pool>> ret;
+    if (pool_holder) {
+        ret = pool_holder->get_event_pool_storage(devices);
+    }
+    return ret;
+}
+
+CCL_BE_API std::vector<std::shared_ptr<ccl_event_pool>> ccl_context::get_shared_event_pool(
+    std::initializer_list<ccl_device*> devices) const {
+    std::vector<std::shared_ptr<ccl_event_pool>> ret;
+    if (pool_holder) {
+        ret = pool_holder->get_event_pool_storage(devices);
+    }
+    return ret;
+}
+
 CCL_BE_API std::string ccl_context::to_string() const {
     std::stringstream ss;
     ss << handle;
diff --git a/src/native_device_api/l0/device.cpp b/src/native_device_api/l0/device.cpp
index 93cd6a6b0..201ce63e9 100644
--- a/src/native_device_api/l0/device.cpp
+++ b/src/native_device_api/l0/device.cpp
@@ -396,11 +396,8 @@ CCL_BE_API void* ccl_device::device_alloc_memory(size_t bytes_count,
         ctx = get_default_context();
     }
 
-    ze_result_t
-        ret = //zeDriverAllocSharedMem(get_owner()->handle, handle, flags, ordinal, ZE_HOST_MEM_ALLOC_FLAG_DEFAULT, bytes_count, alignment, &out_ptr);
-    //zeDriverAllocHostMem(get_owner()->handle, ZE_HOST_MEM_ALLOC_FLAG_DEFAULT, bytes_count, alignment, &out_ptr);
-    zeMemAllocDevice(
-        ctx->get(), &mem_descr, /*&host_descr, */ bytes_count, alignment, handle, &out_ptr);
+    ze_result_t ret = zeMemAllocDevice(
+        ctx->get(), &mem_descr, /*&host_descr, */ bytes_count, alignment, handle, &out_ptr);
     if (ret != ZE_RESULT_SUCCESS) {
         CCL_THROW("cannot allocate memory, error: " + std::to_string(ret));
     }
@@ -516,10 +513,14 @@ void CCL_BE_API ccl_device::on_delete(ze_ipc_mem_handle_t& ipc_mem_handle,
     */
 
     //todo thread safety
-    for (auto ipc_it = ipc_storage.begin(); ipc_it != ipc_storage.end(); ++ipc_it) {
-        if (!strncmp(ipc_it->second->handle.data, ipc_mem_handle.data,
ZE_MAX_IPC_HANDLE_SIZE)) { - ipc_storage.erase(ipc_it); + for (auto ipc_it = ipc_storage.begin(); ipc_it != ipc_storage.end();) { + if (ipc_it->second) { + if (!memcmp(ipc_it->second->handle.data, ipc_mem_handle.data, ZE_MAX_IPC_HANDLE_SIZE)) { + ipc_it = ipc_storage.erase(ipc_it); + continue; + } } + ++ipc_it; } } @@ -530,7 +531,7 @@ CCL_BE_API ccl_device::device_ipc_memory ccl_device::get_ipc_memory( //, this, // ", expected device: ", ipc_handle.get_owner()); - ze_ipc_memory_flag_t flag = ZE_IPC_MEMORY_FLAG_TBD; + ze_ipc_memory_flags_t flag = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_UNCACHED; ip_memory_elem_t ipc_memory{}; if (!ctx) { @@ -558,7 +559,7 @@ CCL_BE_API std::shared_ptr<ccl_device::device_ipc_memory> ccl_device::restore_sh std::shared_ptr<device_ipc_memory_handle>&& ipc_handle, std::shared_ptr<ccl_context> ctx) { assert(ipc_handle->get_owner().lock().get() == this && "IPC handle doesn't belong to device: "); - ze_ipc_memory_flag_t flag = ZE_IPC_MEMORY_FLAG_TBD; + ze_ipc_memory_flags_t flag = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_UNCACHED; ip_memory_elem_t ipc_memory{}; if (!ctx) { diff --git a/src/native_device_api/l0/event_pool.cpp b/src/native_device_api/l0/event_pool.cpp new file mode 100644 index 000000000..c86c065f3 --- /dev/null +++ b/src/native_device_api/l0/event_pool.cpp @@ -0,0 +1,167 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#include "oneapi/ccl/native_device_api/export_api.hpp" +#include "oneapi/ccl/native_device_api/l0/context.hpp" +#include "oneapi/ccl/native_device_api/l0/base_impl.hpp" +#include "oneapi/ccl/native_device_api/l0/device.hpp" +#include "oneapi/ccl/native_device_api/l0/event_pool.hpp" +#include "oneapi/ccl/native_device_api/l0/primitives_impl.hpp" +#include "oneapi/ccl/native_device_api/l0/driver.hpp" +#include "oneapi/ccl/native_device_api/l0/platform.hpp" +#include "common/log/log.hpp" + +namespace native { + +// event pool +ccl_event_pool::ccl_event_pool(const ze_event_pool_desc_t& descr, + handle_t h, + owner_ptr_t&& holder, + context_ptr_t&& ctx) + : base(h, std::move(holder), std::move(ctx)), + pool_description(descr), + allocated_event_count(0) {} + +ccl_event_pool::~ccl_event_pool() { + CCL_ASSERT(allocated_event_count.load() == 0, + "there are in-use event objects during ccl_event_pool destruction"); +} + +const ze_event_pool_desc_t& ccl_event_pool::get_pool_description() const { + return pool_description; +} + +size_t ccl_event_pool::get_allocated_events() const { + return allocated_event_count.load(); +} + +ccl_event_pool::event_ptr ccl_event_pool::create_event(const ze_event_desc_t& descr) { + ze_event_handle_t event_handle; + ze_result_t ret = zeEventCreate(get(), &descr, &event_handle); + if (ret != ZE_RESULT_SUCCESS) { + CCL_THROW("cannot execute zeEventCreate, error: " + native::to_string(ret)); + } + event_ptr event_ret(new event(event_handle, get_ptr(), get_ctx())); + allocated_event_count.fetch_add(1); + return event_ret; +} + +void ccl_event_pool::on_delete(ze_event_handle_t event_handle, ze_context_handle_t& ctx) { + (void)ctx; + ze_result_t ret = zeEventDestroy(event_handle); + if (ret != ZE_RESULT_SUCCESS) { + CCL_THROW("cannot execute zeEventDestroy, error: " + native::to_string(ret)); + } + + allocated_event_count.fetch_sub(1); +} + +// Thread safe array +CCL_BE_API event_pool_array_t::context_array_accessor event_pool_array_t::access() { + return context_array_accessor(m, event_pools); +} + +CCL_BE_API event_pool_array_t::const_context_array_accessor event_pool_array_t::access() const { + return const_context_array_accessor(m, event_pools); +} + +// Thread safe context storage holder +ze_event_pool_handle_t ccl_event_pool_holder::get() { + return nullptr; +} + +std::shared_ptr<ccl_event_pool> ccl_event_pool_holder::emplace( + const std::initializer_list<ccl_device*>& devices, + std::shared_ptr<ccl_event_pool> pool) { + std::unique_lock<std::mutex> lock(m); //TODO use shared lock + + if (devices.size() != 0) { + for (ccl_device* d : devices) { + event_pool_array_t& cont = contexts_pool[d]; + auto acc = cont.access(); + acc.get().push_back(pool); + } + } + else { + event_pool_array_t& cont = contexts_pool[nullptr]; + auto acc = cont.access(); + acc.get().push_back(pool); + } + return pool; +} + +CCL_BE_API std::vector<std::shared_ptr<ccl_event_pool>> +ccl_event_pool_holder::get_event_pool_storage(std::initializer_list<ccl_device*> devices) { + return static_cast<const ccl_event_pool_holder*>(this)->get_event_pool_storage(devices); +} + +CCL_BE_API std::vector<std::shared_ptr<ccl_event_pool>> +ccl_event_pool_holder::get_event_pool_storage(std::initializer_list<ccl_device*> devices) const { + using pool_array = std::vector<std::shared_ptr<ccl_event_pool>>; + pool_array shared_pool; + + std::unique_lock<std::mutex> lock(m); //TODO use simple shared lock + + if (devices.size() == 0) { + auto it = contexts_pool.find(nullptr); + if (it != contexts_pool.end()) { + 
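+            // empty device list: read the nullptr bucket, where emplace() above stores
+            // pools that were created without binding to specific devices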
shared_pool = it->second.access().get(); + } + } + else { + for (ccl_device* d : devices) { + auto it = contexts_pool.find(d); + if (it == contexts_pool.end()) { + CCL_THROW("cannot find event_pool for device: " + d->to_string() + + "\nTotal contexts_pool count: " + std::to_string(contexts_pool.size())); + } + + auto acc = it->second.access(); + auto& event_pools = acc.get(); + + //find common pools for devices + if (shared_pool.empty()) { + // copy + shared_pool = event_pools; + continue; + } + + //find intersection + pool_array intersected; + std::set_intersection(event_pools.begin(), + event_pools.end(), + shared_pool.begin(), + shared_pool.end(), + std::back_inserter(intersected)); + shared_pool.swap(intersected); + + // nothing to do + if (shared_pool.empty()) { + break; + } + } + } + return shared_pool; +} + +void ccl_event_pool_holder::on_delete(ze_event_pool_handle_t pool_handle, + ze_context_handle_t& ctx) { + (void)ctx; + ze_result_t ret = zeEventPoolDestroy(pool_handle); + if (ret != ZE_RESULT_SUCCESS) { + CCL_THROW("cannot execute zeEventPoolDestroy, error: " + native::to_string(ret)); + } +} +} // namespace native diff --git a/src/native_device_api/l0/primitives.cpp b/src/native_device_api/l0/primitives.cpp index c9a7a09c4..3387ed48e 100644 --- a/src/native_device_api/l0/primitives.cpp +++ b/src/native_device_api/l0/primitives.cpp @@ -30,6 +30,27 @@ #include "common/log/log.hpp" namespace native { + +bool event::wait(uint64_t nanosec) const { + (void)nanosec; + ze_result_t ret = zeEventHostSynchronize(handle, nanosec); + if (ret != ZE_RESULT_SUCCESS && ret != ZE_RESULT_NOT_READY) { + CCL_THROW("zeEventHostSynchronize, error: " + native::to_string(ret)); + } + return ret == ZE_RESULT_SUCCESS; +} + +ze_result_t event::status() const { + return zeEventQueryStatus(handle); +} + +void event::signal() { + ze_result_t ret = zeEventHostSignal(handle); + if (ret != ZE_RESULT_SUCCESS) { + CCL_THROW("zeEventHostSignal, error: " + native::to_string(ret)); + } +} + namespace detail { CCL_BE_API void copy_memory_sync_unsafe(void* dst, const void* src, @@ -46,7 +67,7 @@ CCL_BE_API void copy_memory_sync_unsafe(void* dst, ze_command_queue_desc_t queue_description = device->get_default_queue_desc(); //queue_description.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS; TODO may be &= for flags??? 
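 // assumption: get_cmd_queue returns the device's stored per-descriptor queue, so
 // repeated copies reuse one command queue instead of constructing a new one per call,
 // which is what the removed create_cmd_queue line did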
- auto queue = device->create_cmd_queue(ctx, queue_description); + auto& queue = device->get_cmd_queue(queue_description, ctx); //create list ze_command_list_desc_t list_description = device->get_default_list_desc(); @@ -96,6 +117,28 @@ CCL_BE_API void copy_memory_sync_unsafe(void* dst, memcpy(dst, src, size); } +/* + +event<ccl_device, ccl_context> +copy_memory_async_unsafe(void* dst, + const void* src, + size_t size, + std::weak_ptr<ccl_device> device_weak, + std::shared_ptr<ccl_context> ctx, + queue<ccl_device, ccl_context>& q); +event<ccl_device, ccl_context> +copy_memory_async_unsafe(void* dst, + const void* src, + size_t size, + std::weak_ptr<ccl_context> ctx_weak, + std::shared_ptr<ccl_context> ctx, + queue<ccl_device, ccl_context>& q) +{ + (void)q; + copy_memory_sync_unsafe(dst, src, size, ctx_weak, ctx); + event<ccl_device, ccl_context> e(h, get_ptr(), ctx); +} +*/ } // namespace detail std::string get_build_log_string(const ze_module_build_log_handle_t& build_log) { diff --git a/src/parallelizer/parallelizer.cpp b/src/parallelizer/parallelizer.cpp index 7361df9a6..afb0b5bd9 100644 --- a/src/parallelizer/parallelizer.cpp +++ b/src/parallelizer/parallelizer.cpp @@ -14,6 +14,7 @@ limitations under the License. */ #include <algorithm> +#include <numeric> #include "coll/selection/selection.hpp" #include "common/global/global.hpp" @@ -82,6 +83,159 @@ ccl::status ccl_parallelizer_prologue_get_dtype(const void* ctx, void* field_ptr } ccl::status ccl_parallelizer::process(ccl_master_sched* sched) { + process_base(sched); + +#ifdef CCL_ENABLE_SYCL + ccl_coll_param& coll_param = sched->coll_param; + if (coll_param.stream && coll_param.stream->is_sycl_device_stream() && + (coll_param.device_send_buf || coll_param.device_recv_buf)) { + process_pre_post_copies(sched); + } +#endif /* CCL_ENABLE_SYCL */ + + /* should be the last call in the sequence of process_* calls + because it sets dependencies for all partial schedules + which already should be filled */ + process_deps(sched); + + return ccl::status::success; +} + +ccl::status ccl_parallelizer::process_deps(ccl_master_sched* sched) { + auto& part_scheds = sched->partial_scheds; + ccl_sched* deps_sched = part_scheds[0].get(); + size_t part_count = part_scheds.size(); + + for (size_t idx = 0; idx < part_count; idx++) { + part_scheds[idx]->set_add_mode(ccl_sched_add_front); + } + sched->sync_partial_scheds(); + + entry_factory::make_entry<deps_entry>(deps_sched); + deps_sched->add_barrier(); + + return ccl::status::success; +} + +#ifdef CCL_ENABLE_SYCL +ccl::status ccl_parallelizer::process_pre_post_copies(ccl_master_sched* sched) { + auto& part_scheds = sched->partial_scheds; + ccl_sched* copy_sched = part_scheds[0].get(); + size_t part_count = part_scheds.size(); + + ccl_coll_param& coll_param = sched->coll_param; + ccl_comm* comm = coll_param.comm; + int comm_size = comm->size(); + int my_rank = comm->rank(); + + const ccl_datatype& dtype = coll_param.dtype; + size_t dtype_size = dtype.size(); + + ccl_coll_type coll_type = coll_param.ctype; + + size_t d2h_bytes = 0, h2d_bytes = 0; + size_t d2h_count = 0, h2d_count = 0; + + void* device_in_buf = nullptr; + void* device_out_buf = nullptr; + void* host_in_buf = nullptr; + void* host_out_buf = nullptr; + + size_t device_in_buf_offset = 0; + + switch (coll_type) { + case ccl_coll_bcast: + if (comm->rank() == coll_param.root) + d2h_count = coll_param.count; + else + d2h_count = 0; + h2d_count = coll_param.count; + break; + + case ccl_coll_reduce: + d2h_count = coll_param.count; + if (my_rank == 
coll_param.root) + h2d_count = coll_param.count; + else + h2d_count = 0; + break; + + case ccl_coll_reduce_scatter: + d2h_count = coll_param.count * comm_size; + h2d_count = coll_param.count; + break; + + case ccl_coll_allreduce: d2h_count = h2d_count = coll_param.count; break; + + case ccl_coll_allgatherv: + if (coll_param.device_send_buf == coll_param.device_recv_buf) { + device_in_buf_offset = + std::accumulate(coll_param.recv_counts, coll_param.recv_counts + my_rank, 0); + LOG_TRACE("device_in_buf_offset = ", device_in_buf_offset); + } + d2h_count = coll_param.send_count; + h2d_count = + std::accumulate(coll_param.recv_counts, coll_param.recv_counts + comm_size, 0); + break; + + case ccl_coll_alltoall: d2h_count = h2d_count = coll_param.count * comm_size; break; + case ccl_coll_alltoallv: + d2h_count = + std::accumulate(coll_param.send_counts, coll_param.send_counts + comm_size, 0); + h2d_count = + std::accumulate(coll_param.recv_counts, coll_param.recv_counts + comm_size, 0); + break; + + default: CCL_FATAL("unexpected coll_type ", coll_type); break; + } + + device_in_buf = &(coll_param.device_send_buf); + host_in_buf = (void*)coll_param.send_buf; + d2h_bytes = d2h_count * dtype_size; + + host_out_buf = coll_param.recv_buf; + device_out_buf = &(coll_param.device_recv_buf); + h2d_bytes = h2d_count * dtype_size; + + if (d2h_bytes) { + for (size_t idx = 0; idx < part_count; idx++) { + part_scheds[idx]->set_add_mode(ccl_sched_add_front); + } + sched->sync_partial_scheds(); + + entry_factory::make_entry<sycl_copy_entry>( + copy_sched, + copy_direction::d2h, + ccl_buffer(device_in_buf, d2h_bytes, ccl_buffer_type::INDIRECT), + ccl_buffer(host_in_buf, d2h_bytes), + d2h_count, + dtype, + coll_param.stream, + device_in_buf_offset); + } + + if (h2d_bytes) { + for (size_t idx = 0; idx < part_count; idx++) { + part_scheds[idx]->set_add_mode(ccl_sched_add_back); + } + sched->sync_partial_scheds(); + + entry_factory::make_entry<sycl_copy_entry>( + copy_sched, + copy_direction::h2d, + ccl_buffer(host_out_buf, h2d_bytes), + ccl_buffer(device_out_buf, h2d_bytes, ccl_buffer_type::INDIRECT), + h2d_count, + dtype, + coll_param.stream); + part_scheds[0]->add_barrier(); + } + + return ccl::status::success; +} +#endif /* CCL_ENABLE_SYCL */ + +ccl::status ccl_parallelizer::process_base(ccl_master_sched* sched) { /* TODO: split on per-collective classes */ CCL_ASSERT(sched); @@ -238,6 +392,7 @@ ccl::status ccl_parallelizer::process(ccl_master_sched* sched) { ccl_coll_param part_coll_param{}; part_coll_param.ctype = sched->coll_param.ctype; part_coll_param.dtype = sched->coll_param.dtype; + part_coll_param.stream = sched->coll_param.stream; part_coll_param.comm = comm; sched->add_partial_sched(part_coll_param); } @@ -273,10 +428,10 @@ ccl::status ccl_parallelizer::process(ccl_master_sched* sched) { if (coll_type == ccl_coll_alltoallv) { CCL_ASSERT(coll_param.send_counts); CCL_ASSERT(coll_param.recv_counts); - for (idx = 0; idx < comm_size; idx++) { - a2av_send_count += coll_param.send_counts[idx]; - a2av_recv_count += coll_param.recv_counts[idx]; - } + a2av_send_count = + std::accumulate(coll_param.send_counts, coll_param.send_counts + comm_size, 0); + a2av_recv_count = + std::accumulate(coll_param.recv_counts, coll_param.recv_counts + comm_size, 0); } else { a2av_send_count = coll_param.count * comm_size; @@ -292,19 +447,16 @@ ccl::status ccl_parallelizer::process(ccl_master_sched* sched) { offsets[0] = 0; if (ag_algo == ccl_coll_allgatherv_direct || ag_algo == ccl_coll_allgatherv_naive || ag_algo == 
ccl_coll_allgatherv_ring) { - for (idx = 0; idx < comm_size; idx++) - ag_recv_count += recv_counts[idx]; - ag_recv_bytes = ag_recv_count * dtype_size; } else { - ag_recv_count = counts[0]; for (idx = 1; idx < comm_size; idx++) { counts[idx] = recv_counts[idx]; offsets[idx] = offsets[idx - 1] + counts[idx - 1] * dtype_size; - ag_recv_count += counts[idx]; } - ag_recv_bytes = ag_recv_count * dtype_size; } + ag_recv_count = + std::accumulate(coll_param.recv_counts, coll_param.recv_counts + comm_size, 0); + ag_recv_bytes = ag_recv_count * dtype_size; break; default: CCL_FATAL("unexpected coll_type ", coll_type); break; } @@ -322,71 +474,23 @@ ccl::status ccl_parallelizer::process(ccl_master_sched* sched) { sched->sync_partial_scheds(); break; case ccl_coll_bcast: -#ifdef CCL_ENABLE_SYCL - /* convert sycl buffer */ - if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) { - if (comm->rank() == coll_param.root) { - entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>( - part_scheds[0].get(), - ccl_buffer(&(coll_param.sycl_buf), - coll_param.count * dtype_size, - ccl_buffer_type::INDIRECT), - ccl_buffer(coll_param.buf, coll_param.count * dtype_size), - coll_param.count, - dtype, - coll_param.stream); - } - sched->sync_partial_scheds(); - } -#endif /* CCL_ENABLE_SYCL */ for (idx = 0; idx < part_count; idx++) { ccl_coll_entry_param param{}; param.ctype = ccl_coll_bcast; - param.buf = ccl_buffer(&(coll_param.buf), - coll_param.count * dtype_size, - offsets[idx], - ccl_buffer_type::INDIRECT); + param.recv_buf = ccl_buffer(&(coll_param.recv_buf), + coll_param.count * dtype_size, + offsets[idx], + ccl_buffer_type::INDIRECT); param.count = counts[idx]; param.dtype = dtype; param.root = coll_param.root; param.comm = comm; coll_entry_helper::add_coll_entry<ccl_coll_bcast>(part_scheds[idx].get(), param); } -#ifdef CCL_ENABLE_SYCL - /* convert sycl buffer */ - if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) { - sched->sync_partial_scheds(); - entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>( - part_scheds[0].get(), - ccl_buffer(coll_param.buf, coll_param.count * dtype_size), - ccl_buffer(&(coll_param.sycl_buf), - coll_param.count * dtype_size, - ccl_buffer_type::INDIRECT), - coll_param.count, - dtype, - coll_param.stream); - } -#endif /* CCL_ENABLE_SYCL */ break; case ccl_coll_reduce: for (idx = 0; idx < part_count; idx++) { -#ifdef CCL_ENABLE_SYCL - /* convert sycl buffer */ - if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) { - entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>( - part_scheds[0].get(), - ccl_buffer(&(coll_param.sycl_send_buf), - coll_param.count * dtype_size, - ccl_buffer_type::INDIRECT), - ccl_buffer((void*)coll_param.send_buf, coll_param.count * dtype_size), - coll_param.count, - dtype, - coll_param.stream); - sched->sync_partial_scheds(); - } -#endif /* CCL_ENABLE_SYCL */ - ccl_coll_entry_param param{}; param.ctype = ccl_coll_reduce; param.send_buf = ccl_buffer(&(coll_param.send_buf), @@ -404,44 +508,10 @@ ccl::status ccl_parallelizer::process(ccl_master_sched* sched) { param.comm = comm; coll_entry_helper::add_coll_entry<ccl_coll_reduce>(part_scheds[idx].get(), param); } -#ifdef CCL_ENABLE_SYCL - /* convert sycl buffer */ - if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) { - sched->sync_partial_scheds(); - if (comm->rank() == coll_param.root) { - entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>( - part_scheds[0].get(), - 
ccl_buffer(coll_param.recv_buf, coll_param.count * dtype_size), - ccl_buffer(&(coll_param.sycl_recv_buf), - coll_param.count * dtype_size, - ccl_buffer_type::INDIRECT), - coll_param.count, - dtype, - coll_param.stream); - } - } -#endif /* CCL_ENABLE_SYCL */ break; case ccl_coll_reduce_scatter: for (idx = 0; idx < part_count; idx++) { -#ifdef CCL_ENABLE_SYCL - /* convert sycl buffer */ - if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) { - entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>( - part_scheds[0].get(), - ccl_buffer(&(coll_param.sycl_send_buf), - coll_param.count * comm_size * dtype_size, - ccl_buffer_type::INDIRECT), - ccl_buffer((void*)coll_param.send_buf, - coll_param.count * comm_size * dtype_size), - coll_param.count * comm_size, - dtype, - coll_param.stream); - sched->sync_partial_scheds(); - } -#endif /* CCL_ENABLE_SYCL */ - ccl_coll_entry_param param{}; param.ctype = ccl_coll_reduce_scatter; @@ -463,42 +533,10 @@ ccl::status ccl_parallelizer::process(ccl_master_sched* sched) { coll_entry_helper::add_coll_entry<ccl_coll_reduce_scatter>(part_scheds[idx].get(), param); } -#ifdef CCL_ENABLE_SYCL - /* convert sycl buffer */ - if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) { - sched->sync_partial_scheds(); - entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>( - part_scheds[0].get(), - ccl_buffer(coll_param.recv_buf, coll_param.count * dtype_size), - ccl_buffer(&(coll_param.sycl_recv_buf), - coll_param.count * dtype_size, - ccl_buffer_type::INDIRECT), - coll_param.count, - dtype, - coll_param.stream); - } -#endif /* CCL_ENABLE_SYCL */ break; case ccl_coll_allreduce: { ccl_parallelizer_prologue_ctx* main_ctx = nullptr; - -#ifdef CCL_ENABLE_SYCL - /* convert sycl buffer */ - if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) { - entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>( - part_scheds[0].get(), - ccl_buffer(&(coll_param.sycl_send_buf), - coll_param.count * dtype_size, - ccl_buffer_type::INDIRECT), - ccl_buffer((void*)coll_param.send_buf, coll_param.count * dtype_size), - coll_param.count, - dtype, - coll_param.stream); - sched->sync_partial_scheds(); - } -#endif /* CCL_ENABLE_SYCL */ - if (coll_attr->prologue_fn) { part_ctxs.reserve(part_count); @@ -622,49 +660,10 @@ ccl::status ccl_parallelizer::process(ccl_master_sched* sched) { ccl_parallelizer_prologue_get_dtype, main_ctx, false); } } - -#ifdef CCL_ENABLE_SYCL - /* convert sycl buffer */ - if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) { - sched->sync_partial_scheds(); - entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>( - part_scheds[0].get(), - ccl_buffer(coll_param.recv_buf, coll_param.count * dtype_size), - ccl_buffer(&(coll_param.sycl_recv_buf), - coll_param.count * dtype_size, - ccl_buffer_type::INDIRECT), - coll_param.count, - dtype, - coll_param.stream); - } -#endif /* CCL_ENABLE_SYCL */ - break; } + case ccl_coll_allgatherv: { -#ifdef CCL_ENABLE_SYCL - /* convert sycl buffer */ - if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) { - size_t sycl_buf_offset = 0; - if (coll_param.sycl_send_buf == coll_param.sycl_recv_buf) { - for (int i = 0; i < my_rank; i++) { - sycl_buf_offset += coll_param.recv_counts[i]; - } - LOG_TRACE("sycl_buf_offset = ", sycl_buf_offset); - } - entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>( - part_scheds[0].get(), - ccl_buffer(&(coll_param.sycl_send_buf), - coll_param.send_count * dtype_size, - 
ccl_buffer_type::INDIRECT), - ccl_buffer((void*)coll_param.send_buf, coll_param.send_count * dtype_size), - coll_param.send_count, - dtype, - coll_param.stream, - sycl_buf_offset); - sched->sync_partial_scheds(); - } -#endif /* CCL_ENABLE_SYCL */ if (ag_algo == ccl_coll_allgatherv_direct || ag_algo == ccl_coll_allgatherv_naive || ag_algo == ccl_coll_allgatherv_ring) { ccl_coll_entry_param param{}; @@ -776,7 +775,7 @@ ccl::status ccl_parallelizer::process(ccl_master_sched* sched) { for (idx = 0; idx < comm_size; idx++) { ccl_coll_entry_param param{}; param.ctype = ccl_coll_bcast; - param.buf = ag_recv_bufs[idx]; + param.recv_buf = ag_recv_bufs[idx]; param.count = counts[idx]; param.dtype = dtype; param.root = idx; @@ -786,39 +785,11 @@ ccl::status ccl_parallelizer::process(ccl_master_sched* sched) { } } } -#ifdef CCL_ENABLE_SYCL - /* convert sycl buffer */ - if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) { - sched->sync_partial_scheds(); - entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>( - part_scheds[0].get(), - ccl_buffer(coll_param.recv_buf, ag_recv_bytes), - ccl_buffer( - &(coll_param.sycl_recv_buf), ag_recv_bytes, ccl_buffer_type::INDIRECT), - ag_recv_count, - dtype, - coll_param.stream); - } -#endif /* CCL_ENABLE_SYCL */ break; } + case ccl_coll_alltoall: case ccl_coll_alltoallv: { -#ifdef CCL_ENABLE_SYCL - /* convert sycl buffer */ - if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) { - entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>( - part_scheds[0].get(), - ccl_buffer( - &(coll_param.sycl_send_buf), a2av_send_bytes, ccl_buffer_type::INDIRECT), - ccl_buffer((void*)coll_param.send_buf, a2av_send_bytes), - a2av_send_count, - dtype, - coll_param.stream); - sched->sync_partial_scheds(); - } -#endif /* CCL_ENABLE_SYCL */ - if (a2a_algo == ccl_coll_alltoall_naive || a2av_algo == ccl_coll_alltoallv_naive) { ccl_coll_build_naive_alltoallv(sched, part_scheds_vector, coll_param); } @@ -852,22 +823,9 @@ ccl::status ccl_parallelizer::process(ccl_master_sched* sched) { param); } } -#ifdef CCL_ENABLE_SYCL - /* convert sycl buffer */ - if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) { - sched->sync_partial_scheds(); - entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>( - part_scheds[0].get(), - ccl_buffer(coll_param.recv_buf, a2av_recv_bytes), - ccl_buffer( - &(coll_param.sycl_recv_buf), a2av_recv_bytes, ccl_buffer_type::INDIRECT), - a2av_recv_count, - dtype, - coll_param.stream); - } -#endif /* CCL_ENABLE_SYCL */ break; } + case ccl_coll_sparse_allreduce: { ccl_parallelizer_sparse_callback_ctx* i_ctx = (ccl_parallelizer_sparse_callback_ctx*)part_scheds[0] diff --git a/src/parallelizer/parallelizer.hpp b/src/parallelizer/parallelizer.hpp index 008bccd42..6c3d30e44 100644 --- a/src/parallelizer/parallelizer.hpp +++ b/src/parallelizer/parallelizer.hpp @@ -34,5 +34,13 @@ class ccl_parallelizer { ccl::status process(ccl_master_sched* sched); private: + ccl::status process_deps(ccl_master_sched* sched); + +#ifdef CCL_ENABLE_SYCL + ccl::status process_pre_post_copies(ccl_master_sched* sched); +#endif /* CCL_ENABLE_SYCL */ + + ccl::status process_base(ccl_master_sched* sched); + size_t max_data_partition_count; }; diff --git a/src/sched/entry/coll/coll_entry.cpp b/src/sched/entry/coll/coll_entry.cpp index 0fc91107c..7a97b75c0 100644 --- a/src/sched/entry/coll/coll_entry.cpp +++ b/src/sched/entry/coll/coll_entry.cpp @@ -24,8 +24,9 @@ void coll_entry::start() { if (!coll_sched) { 
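// Context for the added lines below (explanatory comment, not part of the patch):
// the stream is now copied into the nested schedule's coll_param alongside comm.
// Entries created under this extra schedule (e.g. the new copy_entry) read
// sched->coll_param.stream to choose between the SYCL device-copy path and the
// plain host memcpy, so without forwarding the stream here the device pre/post
// copies would silently never fire.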
ccl_coll_param coll_param{}; - coll_param.comm = sched->coll_param.comm; coll_param.ctype = param.ctype; + coll_param.comm = sched->coll_param.comm; + coll_param.stream = sched->coll_param.stream; coll_sched.reset(new ccl_extra_sched(coll_param, sched->sched_id)); coll_sched->set_op_id(coll_sched_op_id); diff --git a/src/sched/entry/coll/coll_entry.hpp b/src/sched/entry/coll/coll_entry.hpp index 6b8fdfa10..d3fa6bdb5 100644 --- a/src/sched/entry/coll/coll_entry.hpp +++ b/src/sched/entry/coll/coll_entry.hpp @@ -15,12 +15,13 @@ */ #pragma once +#include "common/global/global.hpp" +#include "comp/comp.hpp" #include "sched/entry/coll/coll_entry_param.hpp" #include "sched/entry/entry.hpp" class coll_entry : public sched_entry, public postponed_fields<coll_entry, - ccl_sched_entry_field_buf, ccl_sched_entry_field_send_buf, ccl_sched_entry_field_recv_buf, ccl_sched_entry_field_cnt, @@ -53,10 +54,6 @@ class coll_entry : public sched_entry, return class_name(); } - ccl_buffer& get_field_ref(field_id_t<ccl_sched_entry_field_buf> id) { - return param.buf; - } - ccl_buffer& get_field_ref(field_id_t<ccl_sched_entry_field_send_buf> id) { return param.send_buf; } @@ -84,8 +81,6 @@ class coll_entry : public sched_entry, ccl::global_data::get().dtypes->name(param.dtype), ", coll_type ", ccl_coll_type_to_str(param.ctype), - ", buf ", - param.buf, ", send_buf ", param.send_buf, ", recv_buf ", diff --git a/src/sched/entry/coll/coll_entry_helper.cpp b/src/sched/entry/coll/coll_entry_helper.cpp index 426e3a1aa..344f73b92 100644 --- a/src/sched/entry/coll/coll_entry_helper.cpp +++ b/src/sched/entry/coll/coll_entry_helper.cpp @@ -71,7 +71,7 @@ ccl::status coll_entry_helper::build_schedule(ccl_sched* sched, } case ccl_coll_bcast: { res = ccl_coll_build_bcast( - sched, param.buf, param.count, param.dtype, param.root, param.comm); + sched, param.recv_buf, param.count, param.dtype, param.root, param.comm); break; } case ccl_coll_reduce: { diff --git a/src/sched/entry/coll/coll_entry_param.hpp b/src/sched/entry/coll/coll_entry_param.hpp index 64ce1d97d..431bf0997 100644 --- a/src/sched/entry/coll/coll_entry_param.hpp +++ b/src/sched/entry/coll/coll_entry_param.hpp @@ -19,7 +19,6 @@ struct ccl_coll_entry_param { ccl_coll_type ctype; - ccl_buffer buf; ccl_buffer send_buf; ccl_buffer recv_buf; size_t count; @@ -30,20 +29,22 @@ struct ccl_coll_entry_param { ccl::reduction reduction; int root; ccl_comm* comm; + ccl_stream* stream; ccl_coll_param to_coll_param() const { ccl_coll_param param; param.ctype = ctype; - param.buf = buf.get_ptr(); param.send_buf = send_buf.get_ptr(); param.recv_buf = recv_buf.get_ptr(); param.count = count; param.send_count = send_count; + param.send_counts = send_counts; param.recv_counts = recv_counts; param.dtype = dtype; param.reduction = reduction; param.root = root; param.comm = comm; + param.stream = stream; return param; } }; diff --git a/src/sched/entry/coll/direct/alltoallv_entry.hpp b/src/sched/entry/coll/direct/alltoallv_entry.hpp index 8e13f7a20..882a38e4b 100644 --- a/src/sched/entry/coll/direct/alltoallv_entry.hpp +++ b/src/sched/entry/coll/direct/alltoallv_entry.hpp @@ -15,6 +15,7 @@ */ #pragma once +#include "common/comm/comm.hpp" #include "sched/entry/coll/direct/base_coll_entry.hpp" class alltoallv_entry : public base_coll_entry { diff --git a/src/sched/entry/coll/direct/base_coll_entry.hpp b/src/sched/entry/coll/direct/base_coll_entry.hpp index 7c8284e12..96648371c 100644 --- a/src/sched/entry/coll/direct/base_coll_entry.hpp +++ b/src/sched/entry/coll/direct/base_coll_entry.hpp 
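For context on the param.buf removal above: bcast is now parameterized entirely through recv_buf, and the stream travels inside ccl_coll_entry_param so partial schedules can reach it. A minimal usage sketch under those assumptions (the surrounding variables are hypothetical; the call shape follows the bcast hunk earlier in this patch):

    ccl_coll_entry_param param{};
    param.ctype = ccl_coll_bcast;
    param.recv_buf = recv_buf; // single in/out buffer; the old param.buf is gone
    param.count = count;
    param.dtype = dtype;
    param.root = root;
    param.comm = comm;
    param.stream = stream; // new field, forwarded by to_coll_param()
    coll_entry_helper::add_coll_entry<ccl_coll_bcast>(sched, param);

Separately, note that the count-summing loops replaced with std::accumulate in the parallelizer hunks above pass a literal 0 as the initial value; std::accumulate accumulates in the type of that initial value, so those sums run in int even though the counts are size_t. Passing size_t{ 0 } would keep the arithmetic in size_t for very large counts.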
@@ -16,6 +16,7 @@ #pragma once #include "sched/entry/entry.hpp" +#include "sched/queue/queue.hpp" class base_coll_entry : public sched_entry { public: diff --git a/src/sched/entry/copy/copy_entry.hpp b/src/sched/entry/copy/copy_entry.hpp new file mode 100644 index 000000000..d7a1e95e2 --- /dev/null +++ b/src/sched/entry/copy/copy_entry.hpp @@ -0,0 +1,162 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#pragma once + +#include "sched/entry/copy/copy_helper.hpp" +#include "sched/entry/entry.hpp" + +#ifdef CCL_ENABLE_SYCL +#include <CL/sycl.hpp> +#endif /* CCL_ENABLE_SYCL */ + +class copy_entry : public sched_entry, + public postponed_fields<copy_entry, + ccl_sched_entry_field_in_buf, + ccl_sched_entry_field_cnt, + ccl_sched_entry_field_dtype> { +public: + static constexpr const char* class_name() noexcept { + return "COPY"; + } + + copy_entry() = delete; + copy_entry(ccl_sched* sched, + const ccl_buffer in_buf, + ccl_buffer out_buf, + size_t count, + const ccl_datatype& dtype, + size_t in_buf_offset = 0) + : sched_entry(sched), + in_buf(in_buf), + out_buf(out_buf), + count(count), + dtype(dtype), + in_buf_offset(in_buf_offset) {} + + void start() override { + update_fields(); + +#ifdef CCL_ENABLE_SYCL + ccl_stream* stream = (ccl_stream*)sched->coll_param.stream; + + if (!stream) { + do_regular_copy(); + return; + } + + sycl::queue* q = stream->get_native_stream(sched->queue->get_idx()); + CCL_THROW_IF_NOT(q, "null sycl queue"); + auto in_ptr_type = sycl::get_pointer_type(in_buf.get_ptr(), q->get_context()); + auto out_ptr_type = sycl::get_pointer_type(out_buf.get_ptr(), q->get_context()); + + LOG_DEBUG("in_ptr_type: ", + native::detail::usm_to_string(in_ptr_type), + ", out_ptr_type: ", + native::detail::usm_to_string(out_ptr_type), + ", native_stream: ", + stream->to_string(), + ", count: ", + count) + + if ((in_ptr_type != sycl::usm::alloc::device) && + (out_ptr_type != sycl::usm::alloc::device)) { + do_regular_copy(); + return; + } + + copy_direction direction; + + if ((in_ptr_type == sycl::usm::alloc::device) && + (out_ptr_type == sycl::usm::alloc::device)) { + direction = copy_direction::d2d; + } + + if ((in_ptr_type == sycl::usm::alloc::host) && (out_ptr_type == sycl::usm::alloc::device)) { + direction = copy_direction::h2d; + } + + if ((in_ptr_type == sycl::usm::alloc::device) && (out_ptr_type == sycl::usm::alloc::host)) { + direction = copy_direction::d2h; + } + + copier = sycl_copier(direction, in_buf, out_buf, count, dtype, 0); + copier.set_queue(q); + ccl_tuple_for_each_indexed<ccl_sycl_buffer_one_dim_types>(copier); + status = ccl_sched_entry_status_started; +#else /* CCL_ENABLE_SYCL */ + do_regular_copy(); +#endif /* CCL_ENABLE_SYCL */ + } + + void update() override { +#ifdef CCL_ENABLE_SYCL + if (copier.is_completed()) { + status = ccl_sched_entry_status_complete; + } +#endif /* CCL_ENABLE_SYCL */ + } + + void do_regular_copy() { + size_t bytes = count * dtype.size(); + auto comp_status = + ccl_comp_copy(in_buf.get_ptr(bytes), 
out_buf.get_ptr(bytes), count, dtype); + CCL_ASSERT(comp_status == ccl::status::success, "bad status ", comp_status); + status = ccl_sched_entry_status_complete; + } + + const char* name() const override { + return class_name(); + } + + ccl_buffer& get_field_ref(field_id_t<ccl_sched_entry_field_in_buf> id) { + return in_buf; + } + + size_t& get_field_ref(field_id_t<ccl_sched_entry_field_cnt> id) { + return count; + } + + ccl_datatype& get_field_ref(field_id_t<ccl_sched_entry_field_dtype> id) { + return dtype; + } + +protected: + void dump_detail(std::stringstream& str) const override { + ccl_logger::format(str, + "dt ", + ccl::global_data::get().dtypes->name(dtype), + ", count ", + count, + ", in_buf ", + in_buf, + ", out_buf ", + out_buf, + ", in_buf_offset ", + in_buf_offset, + "\n"); + } + +private: + ccl_buffer in_buf; + ccl_buffer out_buf; + size_t count; + ccl_datatype dtype; + size_t in_buf_offset; + +#ifdef CCL_ENABLE_SYCL + sycl_copier copier; +#endif /* CCL_ENABLE_SYCL */ +}; diff --git a/src/sched/entry/sycl_entry_helper.cpp b/src/sched/entry/copy/copy_helper.cpp similarity index 66% rename from src/sched/entry/sycl_entry_helper.cpp rename to src/sched/entry/copy/copy_helper.cpp index d5beb2552..8854d22de 100644 --- a/src/sched/entry/sycl_entry_helper.cpp +++ b/src/sched/entry/copy/copy_helper.cpp @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "sched/entry/sycl_entry_helper.hpp" +#include "sched/entry/copy/copy_helper.hpp" -using sycl_copy_direction_str_enum = - utils::enum_to_str<utils::enum_to_underlying(sycl_copy_direction::h2d) + 1>; -std::string to_string(sycl_copy_direction val) { - return sycl_copy_direction_str_enum({ "D2H", "H2D" }).choose(val, "UNKNOWN"); +using copy_direction_str_enum = + utils::enum_to_str<utils::enum_to_underlying(copy_direction::d2d) + 1>; +std::string to_string(copy_direction val) { + return copy_direction_str_enum({ "D2H", "H2D", "D2D" }).choose(val, "UNKNOWN"); } diff --git a/src/sched/entry/sycl_entry_helper.hpp b/src/sched/entry/copy/copy_helper.hpp similarity index 76% rename from src/sched/entry/sycl_entry_helper.hpp rename to src/sched/entry/copy/copy_helper.hpp index 1a273198f..ec8a44e33 100644 --- a/src/sched/entry/sycl_entry_helper.hpp +++ b/src/sched/entry/copy/copy_helper.hpp @@ -22,20 +22,21 @@ #include "common/utils/tuple.hpp" #include "oneapi/ccl/native_device_api/interop_utils.hpp" -enum class sycl_copy_direction { d2h, h2d }; - -std::string to_string(sycl_copy_direction val); +enum class copy_direction { d2h, h2d, d2d }; +std::string to_string(copy_direction val); #ifdef CCL_ENABLE_SYCL -template <sycl_copy_direction direction> struct sycl_copier { - sycl_copier(ccl_buffer in_buf, + sycl_copier() = default; + sycl_copier(copy_direction direction, + ccl_buffer in_buf, ccl_buffer out_buf, size_t count, const ccl_datatype& dtype, size_t in_buf_offset) - : in_buf(in_buf), + : direction(direction), + in_buf(in_buf), out_buf(out_buf), count(count), dtype(dtype), @@ -68,8 +69,19 @@ struct sycl_copier { void* in_buf_ptr = in_buf.get_ptr(bytes); void* out_buf_ptr = out_buf.get_ptr(bytes); - void* void_device_ptr = - (direction == sycl_copy_direction::h2d) ? 
out_buf_ptr : in_buf_ptr; + size_t offset = in_buf_offset; + + if (direction == copy_direction::d2d) { + e = q->submit([&](sycl::handler& h) { + h.memcpy(out_buf_ptr, + static_cast<typename specific_sycl_buffer::value_type*>(in_buf_ptr) + + offset, + bytes); + }); + return; + } + + void* void_device_ptr = (direction == copy_direction::h2d) ? out_buf_ptr : in_buf_ptr; /* don't print this pointer through CCL logger @@ -83,6 +95,7 @@ struct sycl_copier { auto device_ptr_type = sycl::get_pointer_type(device_ptr, q->get_context()); CCL_THROW_IF_NOT((device_ptr_type == sycl::usm::alloc::device || + device_ptr_type == sycl::usm::alloc::shared || device_ptr_type == sycl::usm::alloc::unknown), "unexpected USM type ", native::detail::usm_to_string(device_ptr_type), @@ -91,13 +104,13 @@ struct sycl_copier { specific_sycl_buffer* device_buf_ptr = nullptr; - if (device_ptr_type == sycl::usm::alloc::device) { - /* do nothing, provided device USM pointer can be used as is in copy kernel */ - } - else { + if (device_ptr_type == sycl::usm::alloc::unknown) { /* cast pointer into SYCL buffer */ device_buf_ptr = static_cast<specific_sycl_buffer*>(void_device_ptr); } + else { + /* do nothing, provided USM pointer can be used as is in copy kernel */ + } LOG_DEBUG("count: ", count, @@ -120,20 +133,16 @@ struct sycl_copier { ", device_ptr usm_type: ", native::detail::usm_to_string(device_ptr_type)); - size_t offset = in_buf_offset; - if (device_buf_ptr) { specific_sycl_buffer host_buf( static_cast<typename specific_sycl_buffer::value_type*>( - (direction == sycl_copy_direction::h2d) ? in_buf_ptr : out_buf_ptr), + (direction == copy_direction::h2d) ? in_buf_ptr : out_buf_ptr), count, sycl::property::buffer::use_host_ptr{}); e = q->submit([&](sycl::handler& h) { - auto& src_buf = - (direction == sycl_copy_direction::h2d) ? host_buf : *device_buf_ptr; - auto& dst_buf = - (direction == sycl_copy_direction::h2d) ? *device_buf_ptr : host_buf; + auto& src_buf = (direction == copy_direction::h2d) ? host_buf : *device_buf_ptr; + auto& dst_buf = (direction == copy_direction::h2d) ? 
*device_buf_ptr : host_buf; auto src_buf_acc = src_buf.template get_access<sycl::access::mode::read>(h, count, offset); auto dst_buf_acc = dst_buf.template get_access<sycl::access::mode::write>(h); @@ -141,13 +150,12 @@ struct sycl_copier { }); } else { - e = q->memcpy( - out_buf_ptr, - static_cast<typename specific_sycl_buffer::value_type*>(in_buf_ptr) + offset, - count * dtype.size()); - - /* TODO: remove explicit wait */ - e.wait(); + e = q->submit([&](sycl::handler& h) { + h.memcpy(out_buf_ptr, + static_cast<typename specific_sycl_buffer::value_type*>(in_buf_ptr) + + offset, + bytes); + }); } } else { @@ -160,10 +168,11 @@ struct sycl_copier { } } + copy_direction direction; ccl_buffer in_buf; ccl_buffer out_buf; size_t count; - const ccl_datatype& dtype; + ccl_datatype dtype; sycl::queue* q; size_t in_buf_offset; sycl::event e; diff --git a/src/sched/entry/sycl_copy_entry.hpp b/src/sched/entry/copy/sycl_copy_entry.hpp similarity index 82% rename from src/sched/entry/sycl_copy_entry.hpp rename to src/sched/entry/copy/sycl_copy_entry.hpp index 8852e8a56..2ae7a2729 100644 --- a/src/sched/entry/sycl_copy_entry.hpp +++ b/src/sched/entry/copy/sycl_copy_entry.hpp @@ -17,18 +17,20 @@ #ifdef CCL_ENABLE_SYCL +#include "sched/entry/copy/copy_helper.hpp" #include "sched/entry/entry.hpp" -#include "sched/entry/sycl_entry_helper.hpp" #include <CL/sycl.hpp> -template <sycl_copy_direction direction> class sycl_copy_entry : public sched_entry { public: - static constexpr const char* class_name() noexcept; + static constexpr const char* class_name() noexcept { + return "SYCL_COPY"; + } sycl_copy_entry() = delete; sycl_copy_entry(ccl_sched* sched, + copy_direction direction, ccl_buffer in_buf, ccl_buffer out_buf, size_t count, @@ -36,13 +38,14 @@ class sycl_copy_entry : public sched_entry { const ccl_stream* stream, size_t offset = 0) : sched_entry(sched), + direction(direction), in_buf(in_buf), out_buf(out_buf), count(count), dtype(dtype), stream(stream), offset(offset), - copier(sycl_copier<direction>(in_buf, out_buf, count, dtype, offset)) {} + copier(sycl_copier(direction, in_buf, out_buf, count, dtype, offset)) {} void start() override { LOG_DEBUG(class_name(), ": in_buf ", in_buf, ", out_buf ", out_buf, ", count ", count); @@ -65,7 +68,9 @@ class sycl_copy_entry : public sched_entry { protected: void dump_detail(std::stringstream& str) const override { ccl_logger::format(str, - " dtype ", + "direction ", + to_string(direction), + ", dtype ", ccl::global_data::get().dtypes->name(dtype), ", count ", count, @@ -81,23 +86,14 @@ class sycl_copy_entry : public sched_entry { } private: + copy_direction direction; ccl_buffer in_buf; ccl_buffer out_buf; size_t count; ccl_datatype dtype; const ccl_stream* stream; size_t offset; - sycl_copier<direction> copier; + sycl_copier copier; }; -template <> -constexpr const char* sycl_copy_entry<sycl_copy_direction::d2h>::class_name() noexcept { - return "SYCL_COPY_D2H"; -} - -template <> -constexpr const char* sycl_copy_entry<sycl_copy_direction::h2d>::class_name() noexcept { - return "SYCL_COPY_H2D"; -} - #endif /* CCL_ENABLE_SYCL */ diff --git a/src/sched/entry/copy_entry.hpp b/src/sched/entry/copy_entry.hpp deleted file mode 100644 index 68508e08d..000000000 --- a/src/sched/entry/copy_entry.hpp +++ /dev/null @@ -1,86 +0,0 @@ -/* - Copyright 2016-2020 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ -#pragma once - -#include "sched/entry/entry.hpp" - -class copy_entry : public sched_entry, - public postponed_fields<copy_entry, - ccl_sched_entry_field_in_buf, - ccl_sched_entry_field_cnt, - ccl_sched_entry_field_dtype> { -public: - static constexpr const char* class_name() noexcept { - return "COPY"; - } - - copy_entry() = delete; - copy_entry(ccl_sched* sched, - const ccl_buffer in_buf, - ccl_buffer out_buf, - size_t cnt, - const ccl_datatype& dtype) - : sched_entry(sched), - in_buf(in_buf), - out_buf(out_buf), - cnt(cnt), - dtype(dtype) {} - - void start() override { - update_fields(); - - size_t bytes = cnt * dtype.size(); - auto comp_status = ccl_comp_copy(in_buf.get_ptr(bytes), out_buf.get_ptr(bytes), cnt, dtype); - CCL_ASSERT(comp_status == ccl::status::success, "bad status ", comp_status); - status = ccl_sched_entry_status_complete; - } - - const char* name() const override { - return class_name(); - } - - ccl_buffer& get_field_ref(field_id_t<ccl_sched_entry_field_in_buf> id) { - return in_buf; - } - - size_t& get_field_ref(field_id_t<ccl_sched_entry_field_cnt> id) { - return cnt; - } - - ccl_datatype& get_field_ref(field_id_t<ccl_sched_entry_field_dtype> id) { - return dtype; - } - -protected: - void dump_detail(std::stringstream& str) const override { - ccl_logger::format(str, - "dt ", - ccl::global_data::get().dtypes->name(dtype), - ", cnt ", - cnt, - ", in_buf ", - in_buf, - ", out_buf ", - out_buf, - "\n"); - } - -private: - ccl_buffer in_buf; - ccl_buffer out_buf; - size_t cnt; - ccl_datatype dtype; -}; diff --git a/src/sched/entry/deps_entry.hpp b/src/sched/entry/deps_entry.hpp new file mode 100644 index 000000000..81464cdac --- /dev/null +++ b/src/sched/entry/deps_entry.hpp @@ -0,0 +1,35 @@ +#pragma once + +#include "sched/entry/entry.hpp" + +class deps_entry : public sched_entry { +public: + static constexpr const char* class_name() noexcept { + return "DEPS"; + } + + deps_entry() = delete; + deps_entry(ccl_sched* sched) : sched_entry(sched) {} + + void start() override { + std::vector<ccl::event>& deps = sched->get_deps(); + for (size_t idx = 0; idx < deps.size(); idx++) { +#ifdef CCL_ENABLE_SYCL + /* TODO: detect pure sycl::event and ccl::event for device op */ + deps[idx].get_native().wait(); +#else /* CCL_ENABLE_SYCL */ + deps[idx].wait(); +#endif /* CCL_ENABLE_SYCL */ + } + status = ccl_sched_entry_status_complete; + } + + const char* name() const override { + return class_name(); + } + +protected: + void dump_detail(std::stringstream& str) const override { + ccl_logger::format(str, "deps.size ", sched->get_deps().size(), "\n"); + } +}; diff --git a/src/sched/entry/entry.cpp b/src/sched/entry/entry.cpp index 112a6f4f2..320838dc4 100644 --- a/src/sched/entry/entry.cpp +++ b/src/sched/entry/entry.cpp @@ -14,16 +14,36 @@ limitations under the License. 
*/ #include "sched/entry/entry.hpp" +#include "sched/sched.hpp" #include "common/log/log.hpp" void sched_entry::do_progress() { + if (is_completed()) + return; + + // TODO: remove this temporary workaround + // for an L0 entry take_credit & return_credit aren't needed, + // so we skip them + bool is_l0_entry = false; + const char* name_entry = this->name(); + + // guard against an empty or single-character entry name + if (strlen(name_entry) >= 2) + is_l0_entry = name_entry[0] == 'L' && name_entry[1] == '0'; + if (status < ccl_sched_entry_status_started) { CCL_ASSERT( status == ccl_sched_entry_status_not_started || status == ccl_sched_entry_status_again, "bad status ", status); - start(); - CCL_ASSERT(status >= ccl_sched_entry_status_again, "bad status ", status); + + if (is_l0_entry || sched->flow_control.take_credit()) { start(); + CCL_ASSERT(status >= ccl_sched_entry_status_again, "bad status ", status); + } + else { + status = ccl_sched_entry_status_again; + } } else if (status == ccl_sched_entry_status_started) { LOG_TRACE("update entry ", name()); @@ -31,9 +51,15 @@ void sched_entry::do_progress() { CCL_ASSERT(status >= ccl_sched_entry_status_started, "bad status ", status); } + if (status == ccl_sched_entry_status_complete && !is_l0_entry) { + sched->flow_control.return_credit(); + } + if (status == ccl_sched_entry_status_complete && exec_mode == ccl_sched_entry_exec_once) { status = ccl_sched_entry_status_complete_once; } + + // TODO: what if status is ccl_sched_entry_status_failed or ccl_sched_entry_status_invalid? } bool sched_entry::is_completed() { diff --git a/src/sched/entry/entry.hpp b/src/sched/entry/entry.hpp index 24fb2ec38..f816bd59a 100644 --- a/src/sched/entry/entry.hpp +++ b/src/sched/entry/entry.hpp @@ -20,6 +20,7 @@ #include "common/utils/utils.hpp" #include "sched/entry/postponed_fields.hpp" #include "internal_types.hpp" + #include <chrono> #include <memory> diff --git a/src/sched/entry/factory/entry_factory.hpp b/src/sched/entry/factory/entry_factory.hpp index 2d32e3b0c..83a3ab33e 100644 --- a/src/sched/entry/factory/entry_factory.hpp +++ b/src/sched/entry/factory/entry_factory.hpp @@ -15,24 +15,6 @@ */ #pragma once -#include "sched/entry/factory/entry_factory.h" - -#include "sched/entry/send_entry.hpp" -#include "sched/entry/recv_entry.hpp" -#include "sched/entry/write_entry.hpp" -#include "sched/entry/reduce_local_entry.hpp" -#include "sched/entry/recv_reduce_entry.hpp" -#include "sched/entry/copy_entry.hpp" -#include "sched/entry/sync_entry.hpp" -#include "sched/entry/prologue_entry.hpp" -#include "sched/entry/epilogue_entry.hpp" -#include "sched/entry/sparse_allreduce_completion_entry.hpp" -#include "sched/entry/wait_value_entry.hpp" -#include "sched/entry/function_entry.hpp" -#include "sched/entry/probe_entry.hpp" -#include "sched/entry/register_entry.hpp" -#include "sched/entry/deregister_entry.hpp" -#include "sched/entry/subsched_entry.hpp" #include "sched/entry/coll/coll_entry.hpp" #include "sched/entry/coll/direct/allgatherv_entry.hpp" #include "sched/entry/coll/direct/allreduce_entry.hpp" @@ -43,9 +25,28 @@ #include "sched/entry/coll/direct/reduce_entry.hpp" #include "sched/entry/coll/direct/reduce_scatter_entry.hpp" +#include "sched/entry/factory/entry_factory.h" + +#include "sched/entry/copy/copy_entry.hpp" #ifdef CCL_ENABLE_SYCL -#include "sched/entry/sycl_copy_entry.hpp" +#include "sched/entry/copy/sycl_copy_entry.hpp" #endif /* CCL_ENABLE_SYCL */ +#include "sched/entry/deps_entry.hpp" +#include "sched/entry/deregister_entry.hpp" +#include 
"sched/entry/epilogue_entry.hpp" +#include "sched/entry/function_entry.hpp" +#include "sched/entry/probe_entry.hpp" +#include "sched/entry/prologue_entry.hpp" +#include "sched/entry/recv_entry.hpp" +#include "sched/entry/recv_reduce_entry.hpp" +#include "sched/entry/reduce_local_entry.hpp" +#include "sched/entry/register_entry.hpp" +#include "sched/entry/send_entry.hpp" +#include "sched/entry/sparse_allreduce_completion_entry.hpp" +#include "sched/entry/subsched_entry.hpp" +#include "sched/entry/sync_entry.hpp" +#include "sched/entry/wait_value_entry.hpp" +#include "sched/entry/write_entry.hpp" #include "sched/sched.hpp" diff --git a/src/sched/entry/l0/l0_allgatherv_typed_entry.hpp b/src/sched/entry/l0/l0_allgatherv_typed_entry.hpp index be11e5cd1..805464937 100644 --- a/src/sched/entry/l0/l0_allgatherv_typed_entry.hpp +++ b/src/sched/entry/l0/l0_allgatherv_typed_entry.hpp @@ -22,9 +22,8 @@ //TODO L0 Workaround namespace native { -template <class kernel_params, class gpu_comm_impl, ccl::group_split_type topology> -class l0_allgatherv_typed_entry : public base_gpu_entry<kernel_params, - gpu_comm_impl, +template <class gpu_comm_impl, ccl::group_split_type topology> +class l0_allgatherv_typed_entry : public base_gpu_entry<gpu_comm_impl, topology, ccl::device_topology_type::ring, ccl_coll_allgatherv> { @@ -32,8 +31,7 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<kernel_params, friend class ccl_gpu_comm; friend class ccl_virtual_gpu_comm; - using base = base_gpu_entry<kernel_params, - gpu_comm_impl, + using base = base_gpu_entry<gpu_comm_impl, topology, ccl::device_topology_type::ring, ccl_coll_allgatherv>; @@ -45,17 +43,17 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<kernel_params, using base::kernel_router; using base::get_ctx; using base::get_local_kernel; - using kernel_main_typed = ring_allgatherv_kernel<kernel_params>; - using kernel_ipc_typed = ring_allgatherv_ipc<kernel_params>; + using kernel_main_typed = ring::allgatherv::main_kernel; + using processing_type = void; using income_data_flag_gpu_type = - typename std::remove_pointer<typename kernel_main_typed::income_data_flag_arg_type>::type; + typename std::remove_pointer<typename ring::allgatherv::income_data_flag_arg_type>::type; using ready_to_recv_flag_gpu_type = - typename std::remove_pointer<typename kernel_main_typed::ready_to_recv_flag_arg_type>::type; + typename std::remove_pointer<typename ring::allgatherv::ready_to_recv_flag_arg_type>::type; using recv_counts_typed_entry_type = typename std::remove_pointer< - typename kernel_main_typed::recv_elem_counts_buf_arg_type>::type; + typename ring::allgatherv::recv_elem_counts_buf_arg_type>::type; using recv_offsets_typed_entry_type = typename std::remove_pointer< - typename kernel_main_typed::recv_elem_offsets_buf_arg_type>::type; + typename ring::allgatherv::recv_elem_offsets_buf_arg_type>::type; static constexpr const char* class_name() noexcept { return "L0_ALLGATHERV_TYPED"; @@ -75,28 +73,24 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<kernel_params, size_t send_count, ccl_buffer recv_buf, const size_t* recv_counts, + const coll_param_gpu& params, std::shared_ptr<ccl_stream> device_stream = std::shared_ptr<ccl_stream>()) - : base(sched, - comm, - in_ctx, - send_buf, - ccl::native_type_info<typename kernel_params::native_type>::dtype, - device_stream), + : base(sched, comm, in_ctx, send_buf, params, device_stream), // left_wrote_to_me_flag income_data_flag(this->template alloc_memory_wrap( - typename 
kernel_main_typed::income_data_flag_arg{}, + typename ring::allgatherv::income_data_flag_arg{}, parent_communicator, 1, get_ctx())), // ready_to_recv_flag_arg ready_to_recv_flag(this->template alloc_memory_wrap( - typename kernel_main_typed::ready_to_recv_flag_arg{}, + typename ring::allgatherv::ready_to_recv_flag_arg{}, parent_communicator, 1, get_ctx())), recv_counts_buf(parent_communicator->get_device() .template alloc_memory<recv_counts_typed_entry_type>( - send_count, + comm_addr.size, sizeof(recv_counts_typed_entry_type), get_ctx())), @@ -126,8 +120,8 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<kernel_params, int next_rank = (comm_addr.rank + 1) % comm_addr.size; kernel_router = base::template create_kernel_router_for_rank< - l0_allgatherv_typed_entry<kernel_params, gpu_comm_impl, topology>>( - *this, next_rank, available_devices); + l0_allgatherv_typed_entry<gpu_comm_impl, topology>>( + *this, next_rank, available_devices, base::get_params()); ENTRY_LOG_DEBUG("Init phase of current entry for ext_rank:", next_rank); @@ -148,15 +142,15 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<kernel_params, auto& main_entry_function = get_local_kernel(); - auto recv_buf_ptr = - reinterpret_cast<typename kernel_params::native_type*>(recv_buf_entry.get_ptr()); + auto recv_buf_ptr = reinterpret_cast<processing_type*>(recv_buf_entry.get_ptr()); + //create implementation specified primitives main_entry_function - .template set_args<typename kernel_main_typed::income_data_flag_arg, - typename kernel_main_typed::ready_to_recv_flag_arg, - typename kernel_main_typed::recv_buf_arg, - typename kernel_main_typed::recv_elem_counts_buf_arg, - typename kernel_main_typed::recv_elem_offsets_buf_arg, + .template set_args<typename ring::allgatherv::income_data_flag_arg, + typename ring::allgatherv::ready_to_recv_flag_arg, + typename ring::allgatherv::recv_buf_arg<processing_type>, + typename ring::allgatherv::recv_elem_counts_buf_arg, + typename ring::allgatherv::recv_elem_offsets_buf_arg, typename kernel_main_typed::common_entry_buf_size_arg>( income_data_flag.get(), ready_to_recv_flag.get(), @@ -181,9 +175,11 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<kernel_params, std::vector<ccl_device::device_ipc_memory_handle> get_ipc_data() override { ccl_device& owned_device = parent_communicator->get_device(); - //TODO + auto recv_buf_ptr = reinterpret_cast<processing_type*>(recv_buf_entry.get_ptr()); + std::vector<ccl_device::device_ipc_memory_handle> ret; ret.reserve(3); + ret.push_back(owned_device.create_ipc_memory_handle(recv_buf_ptr, get_ctx())); ret.push_back(owned_device.create_ipc_memory_handle(income_data_flag.get(), get_ctx())); ret.push_back(owned_device.create_ipc_memory_handle(ready_to_recv_flag.get(), get_ctx())); return ret; @@ -204,110 +200,64 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<kernel_params, std::shared_ptr<ccl_context> ctx; public: - bool execute(kernel_main_typed& main_entry_function, kernel_main_typed& right_kernel) { - //Check argument binding in kernels for next rank + template <class left_kernel_t, class right_kernel_t> + bool execute(left_kernel_t& left_kernel, right_kernel_t& right_kernel) { bool is_right_kernel_ready = - right_kernel.template test_args<typename kernel_main_typed::recv_buf_arg, - typename kernel_main_typed::income_data_flag_arg, - typename kernel_main_typed::ready_to_recv_flag_arg>(); - if (is_right_kernel_ready) { - //TODO do not get arguments sequencially - use array version instead - typename 
kernel_main_typed::recv_buf_arg::return_t right_output_buf = - right_kernel.template get_arg<typename kernel_main_typed::recv_buf_arg>(); - typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg = - right_kernel.template get_arg<typename kernel_main_typed::income_data_flag_arg>(); - typename kernel_main_typed::ready_to_recv_flag_arg::return_t - right_ready_to_recv_flag_arg = - right_kernel - .template get_arg<typename kernel_main_typed::ready_to_recv_flag_arg>(); - - ENTRY_LOG_DEBUG("Bind final arguments for kernel: ", kernel_main_typed::name()); - ENTRY_LOG_TRACE("Args: \n{ ", - right_output_buf.first, - ", ", - right_output_buf.second, - "}\n", - "{ ", - right_income_data_flag_arg.first, - ", ", - right_income_data_flag_arg.second, - "}\n", - "{ ", - right_ready_to_recv_flag_arg.first, - ", ", - right_ready_to_recv_flag_arg.second, - "}\n"); - - //TODO register argument for current device kernel: use array-version - main_entry_function - .template set_args<typename kernel_main_typed::right_output_buf_arg, - typename kernel_main_typed::right_income_data_flag_arg, - typename kernel_main_typed::right_ready_to_recv_flag_arg>( - right_output_buf.second, - right_income_data_flag_arg.second, - right_ready_to_recv_flag_arg.second); + right_kernel + .template test_args<typename ring::allgatherv::recv_buf_arg<processing_type>, + typename ring::allgatherv::income_data_flag_arg, + typename ring::allgatherv::ready_to_recv_flag_arg>(); + + // Once we're sure that the parameters are ready, read them from the right kernel + // Note: we not only read the parameters but also reset their 'ready' flag + // (since we're using a destructive-copying policy) meaning that they must be stored + // in order to be read again. + // This protects against the case of multiple kernel launches + // (i.e. the collective is run multiple times) where we might read out-of-date + // values from the previous run. 
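The execute() rewrite above boils down to a poll/consume/bind handshake between neighboring ring kernels. A condensed sketch of that control flow, reusing the allgatherv argument types named in this hunk (processing_type is void here); bind_right_args is a hypothetical free-standing name, not the entry's real API, and get_arg() returns a pair whose .second carries the value, as used above:

    template <class left_kernel_t, class right_kernel_t>
    bool bind_right_args(left_kernel_t& left_kernel, right_kernel_t& right_kernel) {
        // 1) poll: has the right-hand rank published its kernel arguments yet?
        bool is_right_kernel_ready =
            right_kernel.template test_args<typename ring::allgatherv::recv_buf_arg<void>,
                                            typename ring::allgatherv::income_data_flag_arg,
                                            typename ring::allgatherv::ready_to_recv_flag_arg>();
        if (!is_right_kernel_ready)
            return false; // caller retries on the next progress tick

        // 2) consume: get_arg() also clears the 'ready' flag (destructive copy),
        //    so a stale value cannot be observed by the next launch
        auto buf = right_kernel.template get_arg<typename ring::allgatherv::recv_buf_arg<void>>();
        auto income = right_kernel.template get_arg<typename ring::allgatherv::income_data_flag_arg>();
        auto ready = right_kernel.template get_arg<typename ring::allgatherv::ready_to_recv_flag_arg>();

        // 3) bind: wire the right rank's pointers into this rank's kernel
        left_kernel.template set_args<typename ring::allgatherv::right_output_buf_arg<void>,
                                      typename ring::allgatherv::right_income_data_flag_arg,
                                      typename ring::allgatherv::right_ready_to_recv_flag_arg>(
            buf.second, income.second, ready.second);
        return true;
    }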
- ENTRY_LOG_DEBUG("Final Function: ", main_entry_function.to_string()); - } - return is_right_kernel_ready; - } - - bool execute(kernel_main_typed& main_entry_function, kernel_ipc_typed& right_kernel) { - //Check argument binding in kernels for next rank - bool is_right_kernel_ready = - right_kernel.template test_args< //typename kernel_ipc_typed::right_output_buf_arg, - typename kernel_ipc_typed::income_data_flag_arg, - typename kernel_ipc_typed::ready_to_recv_flag_arg>(); if (is_right_kernel_ready) { - //TODO do not get arguments sequencially - use array version instead - typename kernel_main_typed::right_output_buf_arg::return_t right_output_buf_arg = - right_kernel.template get_arg<typename kernel_ipc_typed::right_output_buf_arg>(); - typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg = - right_kernel.template get_arg<typename kernel_ipc_typed::income_data_flag_arg>(); - typename kernel_main_typed::ready_to_recv_flag_arg::return_t - right_ready_to_recv_flag_arg = - right_kernel - .template get_arg<typename kernel_ipc_typed::ready_to_recv_flag_arg>(); - - LOG_DEBUG("entry: ", - class_name(), - ", rank: ", - comm_addr.to_string(), - ", bind elapsed arguments for kernel: ", - kernel_main_typed::name()); - LOG_TRACE("Args: \n{ ", - right_output_buf_arg.first, - ", ", - right_output_buf_arg.second, - "}\n", - "{ ", - right_income_data_flag_arg.first, - ", ", - right_income_data_flag_arg.second, - "}\n", - "{ ", - right_ready_to_recv_flag_arg.first, - ", ", - right_ready_to_recv_flag_arg.second, - "}\n"); - - //TODO register argument for current device kernel: user array version - main_entry_function - .template set_args< //typename kernel_main_typed::right_output_buf_arg, - typename kernel_main_typed::right_income_data_flag_arg, - typename kernel_main_typed::right_ready_to_recv_flag_arg>( - //right_output_buf_arg.second, + auto right_recv_buf_arg = + right_kernel + .template get_arg<typename ring::allgatherv::recv_buf_arg<processing_type>>(); + auto right_income_data_flag_arg = + right_kernel.template get_arg<typename ring::allgatherv::income_data_flag_arg>(); + auto right_ready_to_recv_flag_arg = + right_kernel.template get_arg<typename ring::allgatherv::ready_to_recv_flag_arg>(); + + // ENTRY_LOG_DEBUG("Bind right arguments from ", + // right_kernel_t::name(), + // " kernel", + // " to ", + // left_kernel_t::name(), + // " kernel. " + // "Right arguments:\n{ ", + // right_recv_buf_arg.first, + // ", ", + // right_recv_buf_arg.second, + // "}\n", + // "{ ", + // right_income_data_flag_arg.first, + // ", ", + // right_income_data_flag_arg.second, + // "}\n", + // "{ ", + // right_ready_to_recv_flag_arg.first, + // ", ", + // right_ready_to_recv_flag_arg.second, + // "}\n"); + + left_kernel + .template set_args<typename ring::allgatherv::right_output_buf_arg<processing_type>, + typename ring::allgatherv::right_income_data_flag_arg, + typename ring::allgatherv::right_ready_to_recv_flag_arg>( + right_recv_buf_arg.second, right_income_data_flag_arg.second, right_ready_to_recv_flag_arg.second); - LOG_TRACE("Set right_output_buf_arg", - "Set right_income_data_flag_arg", - "Set right_ready_to_recv_flag_arg"); - LOG_DEBUG("entry: ", - class_name(), - ", rank: ", - comm_addr.to_string(), - ". Function: ", - main_entry_function.to_string()); + + ENTRY_LOG_DEBUG("Binding arguments between kernels is complete. 
", + "Arguments of the left kernel after binding:\n", + left_kernel.to_string()); } return is_right_kernel_ready; } diff --git a/src/sched/entry/l0/l0_allreduce_typed_entry.hpp b/src/sched/entry/l0/l0_allreduce_typed_entry.hpp index e03184354..f0a1395d2 100644 --- a/src/sched/entry/l0/l0_allreduce_typed_entry.hpp +++ b/src/sched/entry/l0/l0_allreduce_typed_entry.hpp @@ -19,12 +19,12 @@ #include <atomic> #include "sched/entry/l0/l0_entry.hpp" -#include "common/comm/l0/context/scaling_ctx/ipc_ctx_impl.hpp" +#include "common/comm/l0/context/scale/ipc/ipc_ctx_impl.hpp" +#include "kernels/shared.h" namespace native { -template <class kernel_params, class gpu_comm_impl, ccl::group_split_type topology> -class l0_allreduce_typed_entry : public base_gpu_entry<kernel_params, - gpu_comm_impl, +template <class gpu_comm_impl, ccl::group_split_type topology> +class l0_allreduce_typed_entry : public base_gpu_entry<gpu_comm_impl, topology, ccl::device_topology_type::ring, ccl_coll_allreduce> { @@ -32,8 +32,7 @@ class l0_allreduce_typed_entry : public base_gpu_entry<kernel_params, friend class ccl_gpu_comm; friend class ccl_virtual_gpu_comm; - using base = base_gpu_entry<kernel_params, - gpu_comm_impl, + using base = base_gpu_entry<gpu_comm_impl, topology, ccl::device_topology_type::ring, ccl_coll_allreduce>; @@ -46,16 +45,14 @@ class l0_allreduce_typed_entry : public base_gpu_entry<kernel_params, using base::get_ctx; using base::alloc_memory_wrap; using base::get_local_kernel; - using kernel_main_typed = ring_allreduce_kernel<kernel_params>; - using kernel_ipc_typed = ring_allreduce_ipc<kernel_params>; - using kernel_main_numa_typed = ring_allreduce_numa_kernel<kernel_params>; + using kernel_main_typed = ring::allreduce::main_kernel; using income_data_flag_gpu_type = - typename std::remove_pointer<typename kernel_main_typed::income_data_flag_arg_type>::type; + typename std::remove_pointer<typename ring::allreduce::income_data_flag_arg_type>::type; using ready_to_recv_flag_gpu_type = - typename std::remove_pointer<typename kernel_main_typed::ready_to_recv_flag_arg_type>::type; + typename std::remove_pointer<typename ring::allreduce::ready_to_recv_flag_arg_type>::type; using local_barrier_flag_gpu_type = - typename std::remove_pointer<typename kernel_main_typed::local_barrier_flag_arg_type>::type; + typename std::remove_pointer<typename ring::allreduce::local_barrier_flag_arg_type>::type; static constexpr const char* class_name() noexcept { return "L0_ALLREDUCE_TYPED"; @@ -74,27 +71,24 @@ class l0_allreduce_typed_entry : public base_gpu_entry<kernel_params, const ccl_buffer send_buf, ccl_buffer recv_buf, size_t cnt, - ccl::reduction op, + const coll_param_gpu& params, std::shared_ptr<ccl_stream> device_stream = std::shared_ptr<ccl_stream>()) - : base(sched, - comm, - in_ctx, - send_buf, - ccl::native_type_info<typename kernel_params::native_type>::dtype, - device_stream), - - temp_buffer( - this->template alloc_memory_wrap(typename kernel_main_typed::tmp_recv_buf_arg{}, - parent_communicator, - cnt, - get_ctx())), - income_data_flag(this->template alloc_memory_wrap( - typename kernel_main_typed::income_data_flag_arg{}, + : base(sched, comm, in_ctx, send_buf, params, device_stream), + + temp_buffer(this->template alloc_memory_wrap( + typename ring::allreduce::tmp_recv_buf_arg<int8_t>{}, parent_communicator, - 1, + ring_allreduce_get_tmp_buffer_size( + ccl::get_datatype_size(params.get_datatype()) * cnt, + base::comm_addr.size), get_ctx())), + income_data_flag( + this->template alloc_memory_wrap(typename 
ring::allreduce::income_data_flag_arg{}, + parent_communicator, + 1, + get_ctx())), ready_to_recv_flag(this->template alloc_memory_wrap( - typename kernel_main_typed::ready_to_recv_flag_arg{}, + typename ring::allreduce::ready_to_recv_flag_arg{}, parent_communicator, 1, get_ctx())), @@ -104,13 +98,12 @@ class l0_allreduce_typed_entry : public base_gpu_entry<kernel_params, sizeof(local_barrier_flag_gpu_type), get_ctx())) { recv_buf_typed_entry = recv_buf; - op_typed_entry = op; cnt_entry = cnt; int next_rank = (comm_addr.rank + 1) % comm_addr.size; kernel_router = base::template create_kernel_router_for_rank< - l0_allreduce_typed_entry<kernel_params, gpu_comm_impl, topology>>( - *this, next_rank, available_devices); + l0_allreduce_typed_entry<gpu_comm_impl, topology>>( + *this, next_rank, available_devices, base::get_params()); ENTRY_LOG_DEBUG("Init phase of current entry for ext_rank:", next_rank); @@ -131,16 +124,15 @@ class l0_allreduce_typed_entry : public base_gpu_entry<kernel_params, auto& main_entry_function = get_local_kernel(); // TODO: try to remove indirect buffer - auto recv_buf_ptr = - reinterpret_cast<typename kernel_params::native_type*>(recv_buf_typed_entry.get_ptr()); + void* recv_buf_ptr = recv_buf_typed_entry.get_ptr(); //create implementation specified primitives main_entry_function - .template set_args<typename kernel_main_typed::tmp_recv_buf_arg, - typename kernel_main_typed::income_data_flag_arg, - typename kernel_main_typed::ready_to_recv_flag_arg, - typename kernel_main_typed::local_barrier_flag_arg, - typename kernel_main_typed::recv_buf_arg, + .template set_args<typename ring::allreduce::tmp_recv_buf_arg<void>, + typename ring::allreduce::income_data_flag_arg, + typename ring::allreduce::ready_to_recv_flag_arg, + typename ring::allreduce::local_barrier_flag_arg, + typename ring::allreduce::recv_buf_arg<void>, typename kernel_main_typed::common_entry_buf_size_arg>( temp_buffer.get(), income_data_flag.get(), @@ -165,7 +157,6 @@ class l0_allreduce_typed_entry : public base_gpu_entry<kernel_params, std::vector<ccl_device::device_ipc_memory_handle> get_ipc_data() override { ccl_device& owned_device = parent_communicator->get_device(); - //TODO std::vector<ccl_device::device_ipc_memory_handle> ret; ret.reserve(3); ret.push_back(owned_device.create_ipc_memory_handle(temp_buffer.get(), get_ctx())); @@ -174,16 +165,17 @@ class l0_allreduce_typed_entry : public base_gpu_entry<kernel_params, return ret; } - observer::invoke_params<type(), kernel_params> get_numa_data() override { + observer::invoke_params<type()> get_numa_data() override { observer::producer_description in_params{ - .world_rank = comm_addr.rank, //TODO unused - .world_size = comm_addr.size, //TODO unused + .rank = comm_addr.rank, //TODO unused + .comm_size = comm_addr.size, //TODO unused .staged_buffer_elem_count = cnt_entry, .context = get_ctx(), .device = parent_communicator->get_device(), .immediate_list = parent_communicator->get_device().create_immediate_cmd_list(get_ctx()) }; - return observer::invoke_params<type(), kernel_params>{ std::move(in_params) }; + // TODO: Should get_params() be a part of in_params? 
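// Illustrative sketch (not part of the patch): base::get_params() returns the
// coll_param_gpu bundle that replaces the old <kernel_params> template argument,
// moving datatype/reduction from compile time to run time. A plausible minimal
// shape, inferred only from its uses in this patch (params.get_datatype() above);
// the members and accessors here are assumptions:
//
//     class coll_param_gpu {
//     public:
//         coll_param_gpu(ccl::datatype dtype, ccl::reduction red)
//             : dtype(dtype), red(red) {}
//         ccl::datatype get_datatype() const { return dtype; }
//         ccl::reduction get_reduction() const { return red; } // assumed accessor
//     private:
//         ccl::datatype dtype;
//         ccl::reduction red;
//     };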
+ return observer::invoke_params<type()>(std::move(in_params), base::get_params()); } protected: @@ -192,97 +184,44 @@ class l0_allreduce_typed_entry : public base_gpu_entry<kernel_params, } private: - ccl_device::device_memory<typename kernel_params::native_type> temp_buffer; + ccl_device::device_memory<int8_t> temp_buffer; ccl_device::device_memory<income_data_flag_gpu_type> income_data_flag; ccl_device::device_memory<ready_to_recv_flag_gpu_type> ready_to_recv_flag; ccl_device::device_memory<local_barrier_flag_gpu_type> local_barrier_flag; - ccl::reduction op_typed_entry; ccl_buffer recv_buf_typed_entry; size_t cnt_entry; public: - template <class left_kernel_main_typed, class right_kernel_main_typed> - bool execute(left_kernel_main_typed& main_entry_function, - right_kernel_main_typed& right_kernel) { - //Check argument binding in kernels for next rank + template <class left_kernel_t, class right_kernel_t> + bool execute(left_kernel_t& left_kernel, right_kernel_t& right_kernel) { bool is_right_kernel_ready = - right_kernel - .template test_args<typename right_kernel_main_typed::tmp_recv_buf_arg, - typename right_kernel_main_typed::income_data_flag_arg, - typename right_kernel_main_typed::ready_to_recv_flag_arg>(); - if (is_right_kernel_ready) { - // Once we're sure that the parameters ready read them from the right kernel - // Note: we not only read the parameters but also reset their 'ready' flag - // (since we're using a dedicated policy) meaning that they must be stored in order - // to be read again. - // This is a protection to a case of multiple kernel launches(i.e. the collective is ran multiple times) - // where we might read not up-to-date values from the previous run. - //TODO do not get arguments sequencially - use array version instead - typename right_kernel_main_typed::tmp_recv_buf_arg::return_t right_tmp_recv_buf_arg = - right_kernel.template get_arg<typename right_kernel_main_typed::tmp_recv_buf_arg>(); - typename right_kernel_main_typed::income_data_flag_arg::return_t - right_income_data_flag_arg = - right_kernel - .template get_arg<typename right_kernel_main_typed::income_data_flag_arg>(); - typename right_kernel_main_typed::ready_to_recv_flag_arg::return_t - right_ready_to_recv_flag_arg = right_kernel.template get_arg< - typename right_kernel_main_typed::ready_to_recv_flag_arg>(); - - ENTRY_LOG_DEBUG("Bind final arguments for kernel: ", left_kernel_main_typed::name()); - ENTRY_LOG_DEBUG("Args: \n{ ", - right_tmp_recv_buf_arg.first, - ", ", - right_tmp_recv_buf_arg.second, - "}\n", - "{ ", - right_income_data_flag_arg.first, - ", ", - right_income_data_flag_arg.second, - "}\n", - "{ ", - right_ready_to_recv_flag_arg.first, - ", ", - right_ready_to_recv_flag_arg.second, - "}\n"); - - //TODO register argument for current device kernel: use array-version - main_entry_function - .template set_args<typename left_kernel_main_typed::right_tmp_recv_buf_arg, - typename left_kernel_main_typed::right_income_data_flag_arg, - typename left_kernel_main_typed::right_ready_to_recv_flag_arg>( - right_tmp_recv_buf_arg.second, - right_income_data_flag_arg.second, - right_ready_to_recv_flag_arg.second); - ENTRY_LOG_DEBUG("Final Function: ", main_entry_function.to_string()); - } - return is_right_kernel_ready; - } + right_kernel.template test_args<typename ring::allreduce::tmp_recv_buf_arg<void>, + typename ring::allreduce::income_data_flag_arg, + typename ring::allreduce::ready_to_recv_flag_arg>(); + + // Once we're sure that the parameters ready read them from the right kernel + // Note: 
we not only read the parameters but also reset their 'ready' flag + // (since we're using a destructive-copying policy) meaning that they must be stored + // in order to be read again. + // This is a protection to a case of multiple kernel launches + // (i.e. the collective is ran multiple times) where we might read not up-to-date + // values from the previous run. - bool execute(kernel_main_typed& main_entry_function, kernel_main_typed& right_kernel) { - //Check argument binding in kernels for next rank - bool is_right_kernel_ready = - right_kernel.template test_args<typename kernel_main_typed::tmp_recv_buf_arg, - typename kernel_main_typed::income_data_flag_arg, - typename kernel_main_typed::ready_to_recv_flag_arg>(); if (is_right_kernel_ready) { - // Once we're sure that the parameters ready read them from the right kernel - // Note: we not only read the parameters but also reset their 'ready' flag - // (since we're using a dedicated policy) meaning that they must be stored in order - // to be read again. - // This is a protection to a case of multiple kernel launches(i.e. the collective is ran multiple times) - // where we might read not up-to-date values from the previous run. - //TODO do not get arguments sequencially - use array version instead - typename kernel_main_typed::tmp_recv_buf_arg::return_t right_tmp_recv_buf_arg = - right_kernel.template get_arg<typename kernel_main_typed::tmp_recv_buf_arg>(); - typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg = - right_kernel.template get_arg<typename kernel_main_typed::income_data_flag_arg>(); - typename kernel_main_typed::ready_to_recv_flag_arg::return_t - right_ready_to_recv_flag_arg = - right_kernel - .template get_arg<typename kernel_main_typed::ready_to_recv_flag_arg>(); - - ENTRY_LOG_DEBUG("Bind final arguments for kernel: ", kernel_main_typed::name()); - ENTRY_LOG_DEBUG("Args: \n{ ", + auto right_tmp_recv_buf_arg = + right_kernel.template get_arg<typename ring::allreduce::tmp_recv_buf_arg<void>>(); + auto right_income_data_flag_arg = + right_kernel.template get_arg<typename ring::allreduce::income_data_flag_arg>(); + auto right_ready_to_recv_flag_arg = + right_kernel.template get_arg<typename ring::allreduce::ready_to_recv_flag_arg>(); + + /*ENTRY_LOG_DEBUG("Bind right arguments from ", + right_kernel_t::name(), + " kernel", + " to ", + left_kernel_t::name(), + " kernel. 
" + "Right arguments:\n{ ", right_tmp_recv_buf_arg.first, ", ", right_tmp_recv_buf_arg.second, @@ -296,80 +235,18 @@ class l0_allreduce_typed_entry : public base_gpu_entry<kernel_params, right_ready_to_recv_flag_arg.first, ", ", right_ready_to_recv_flag_arg.second, - "}\n"); - - //TODO register argument for current device kernel: use array-version - main_entry_function - .template set_args<typename kernel_main_typed::right_tmp_recv_buf_arg, - typename kernel_main_typed::right_income_data_flag_arg, - typename kernel_main_typed::right_ready_to_recv_flag_arg>( - right_tmp_recv_buf_arg.second, - right_income_data_flag_arg.second, - right_ready_to_recv_flag_arg.second); - ENTRY_LOG_DEBUG("Final Function: ", main_entry_function.to_string()); - } - return is_right_kernel_ready; - } - - bool execute(kernel_main_typed& main_entry_function, kernel_ipc_typed& right_kernel) { - //Check argument binding in kernels for next rank - bool is_right_kernel_ready = - right_kernel.template test_args<typename kernel_ipc_typed::tmp_recv_buf_arg, - typename kernel_ipc_typed::income_data_flag_arg, - typename kernel_ipc_typed::ready_to_recv_flag_arg>(); - if (is_right_kernel_ready) { - //TODO do not get arguments sequencially - use array version instead - typename kernel_main_typed::tmp_recv_buf_arg::return_t right_tmp_recv_buf_arg = - right_kernel.template get_arg<typename kernel_ipc_typed::tmp_recv_buf_arg>(); - typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg = - right_kernel.template get_arg<typename kernel_ipc_typed::income_data_flag_arg>(); - typename kernel_main_typed::ready_to_recv_flag_arg::return_t - right_ready_to_recv_flag_arg = - right_kernel - .template get_arg<typename kernel_ipc_typed::ready_to_recv_flag_arg>(); - - LOG_DEBUG("entry: ", - class_name(), - ", rank: ", - comm_addr.to_string(), - ", bind elapsed arguments for kernel: ", - kernel_main_typed::name()); - LOG_TRACE("Args: \n{ ", - right_tmp_recv_buf_arg.first, - ", ", - right_tmp_recv_buf_arg.second, - "}\n", - "{ ", - right_income_data_flag_arg.first, - ", ", - right_income_data_flag_arg.second, - "}\n", - "{ ", - right_ready_to_recv_flag_arg.first, - ", ", - right_ready_to_recv_flag_arg.second, - "}\n"); - - //TODO register argument for current device kernel: user array version - - main_entry_function - .template set_args<typename kernel_main_typed::right_tmp_recv_buf_arg, - typename kernel_main_typed::right_income_data_flag_arg, - typename kernel_main_typed::right_ready_to_recv_flag_arg>( - right_tmp_recv_buf_arg.second, - right_income_data_flag_arg.second, - right_ready_to_recv_flag_arg.second); - - LOG_TRACE("Set right_tmp_recv_buf_arg", - "Set right_income_data_flag_arg", - "Set right_ready_to_recv_flag_arg"); - - LOG_DEBUG("entry: ", - class_name(), - ", rank: ", - comm_addr.to_string(), - ". Function: ", - main_entry_function.to_string()); + "}\n");*/ + + left_kernel.template set_args<typename ring::allreduce::right_tmp_recv_buf_arg<void>, + typename ring::allreduce::right_income_data_flag_arg, + typename ring::allreduce::right_ready_to_recv_flag_arg>( + right_tmp_recv_buf_arg.second, + right_income_data_flag_arg.second, + right_ready_to_recv_flag_arg.second); + + ENTRY_LOG_DEBUG("Binding arguments between kernels is complete. 
", + "Arguments of the left kernel after binding:\n", + left_kernel.to_string()); } return is_right_kernel_ready; } diff --git a/src/sched/entry/l0/l0_alltoallv_typed_entry.hpp b/src/sched/entry/l0/l0_alltoallv_typed_entry.hpp index 0b940cbf5..8970e06a8 100644 --- a/src/sched/entry/l0/l0_alltoallv_typed_entry.hpp +++ b/src/sched/entry/l0/l0_alltoallv_typed_entry.hpp @@ -22,9 +22,8 @@ //TODO L0 Workaround namespace native { -template <class kernel_params, class gpu_comm_impl, ccl::group_split_type topology> -class l0_alltoallv_typed_entry : public base_gpu_entry<kernel_params, - gpu_comm_impl, +template <class gpu_comm_impl, ccl::group_split_type topology> +class l0_alltoallv_typed_entry : public base_gpu_entry<gpu_comm_impl, topology, ccl::device_topology_type::ring, ccl_coll_alltoallv> { @@ -32,8 +31,7 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<kernel_params, friend class ccl_gpu_comm; friend class ccl_virtual_gpu_comm; - using base = base_gpu_entry<kernel_params, - gpu_comm_impl, + using base = base_gpu_entry<gpu_comm_impl, topology, ccl::device_topology_type::ring, ccl_coll_alltoallv>; @@ -45,26 +43,25 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<kernel_params, using base::kernel_router; using base::get_ctx; using base::get_local_kernel; - using kernel_main_typed = ring_alltoallv_kernel<kernel_params>; - using kernel_ipc_typed = ring_alltoallv_ipc<kernel_params>; + using kernel_main_typed = ring::alltoallv::main_kernel; using income_data_flag_gpu_type = - typename std::remove_pointer<typename kernel_main_typed::income_data_flag_arg_type>::type; + typename std::remove_pointer<typename ring::alltoallv::income_data_flag_arg_type>::type; using ready_to_recv_flag_gpu_type = - typename std::remove_pointer<typename kernel_main_typed::ready_to_recv_flag_arg_type>::type; + typename std::remove_pointer<typename ring::alltoallv::ready_to_recv_flag_arg_type>::type; - using recv_counts_typed_entry_type = typename std::remove_pointer< - typename kernel_main_typed::recv_elem_counts_buf_arg_type>::type; + using recv_counts_typed_entry_type = + typename std::remove_pointer<typename ring::alltoallv::recv_elem_counts_buf_arg_type>::type; using recv_offsets_typed_entry_type = typename std::remove_pointer< - typename kernel_main_typed::recv_elem_offsets_buf_arg_type>::type; + typename ring::alltoallv::recv_elem_offsets_buf_arg_type>::type; using proxy_size_flag_gpu_type = - typename std::remove_pointer<typename kernel_main_typed::proxy_size_flag_arg_type>::type; + typename std::remove_pointer<typename ring::alltoallv::proxy_size_flag_arg_type>::type; using send_counts_typed_entry_type = - typename std::remove_pointer<typename kernel_main_typed::send_buf_size_arg_type>::type; + typename std::remove_pointer<typename ring::alltoallv::send_buf_size_arg_type>::type; using send_offsets_typed_entry_type = typename std::remove_pointer< - typename kernel_main_typed::send_elem_offsets_buf_arg_type>::type; + typename ring::alltoallv::send_elem_offsets_buf_arg_type>::type; static constexpr const char* class_name() noexcept { return "L0_ALLTOALLV_TYPED"; @@ -82,40 +79,38 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<kernel_params, ccl_driver_context_ptr in_ctx, const ccl_buffer send_buf, const size_t* send_counts, + size_t total_send_counts, ccl_buffer recv_buf, const size_t* recv_counts, + size_t total_recv_counts, + const coll_param_gpu& params, std::shared_ptr<ccl_stream> device_stream = std::shared_ptr<ccl_stream>()) - : base(sched, - comm, - in_ctx, - send_buf, - 
ccl::native_type_info<typename kernel_params::native_type>::dtype, - device_stream), - temp_buffer( - this->template alloc_memory_wrap(typename kernel_main_typed::tmp_recv_buf_arg{}, - parent_communicator, - 512, - get_ctx())), - // left_wrote_to_me_flag - income_data_flag(this->template alloc_memory_wrap( - typename kernel_main_typed::income_data_flag_arg{}, + : base(sched, comm, in_ctx, send_buf, params, device_stream), + temp_buffer(this->template alloc_memory_wrap( + typename ring::alltoallv::tmp_recv_buf_arg<uint8_t>{}, parent_communicator, - 1, + total_recv_counts, get_ctx())), + // left_wrote_to_me_flag + income_data_flag( + this->template alloc_memory_wrap(typename ring::alltoallv::income_data_flag_arg{}, + parent_communicator, + 1, + get_ctx())), // ready_to_recv_flag_arg ready_to_recv_flag(this->template alloc_memory_wrap( - typename kernel_main_typed::ready_to_recv_flag_arg{}, - parent_communicator, - 1, - get_ctx())), - proxy_size_flag_entry(this->template alloc_memory_wrap( - typename kernel_main_typed::proxy_size_flag_arg{}, + typename ring::alltoallv::ready_to_recv_flag_arg{}, parent_communicator, 1, get_ctx())), + proxy_size_flag_entry( + this->template alloc_memory_wrap(typename ring::alltoallv::proxy_size_flag_arg{}, + parent_communicator, + 1, + get_ctx())), recv_counts_buf(parent_communicator->get_device() .template alloc_memory<recv_counts_typed_entry_type>( - 512, + total_recv_counts, sizeof(recv_counts_typed_entry_type), get_ctx())), recv_offsets_buf(parent_communicator->get_device() @@ -125,16 +120,14 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<kernel_params, get_ctx())), send_counts_buf(parent_communicator->get_device() .template alloc_memory<recv_counts_typed_entry_type>( - 512, + total_send_counts, sizeof(recv_counts_typed_entry_type), get_ctx())), send_offsets_buf(parent_communicator->get_device() .template alloc_memory<send_offsets_typed_entry_type>( comm_addr.size, sizeof(send_offsets_typed_entry_type), - get_ctx())) - - { + get_ctx())) { // copy recv_buf into recv_buf_entry recv_buf_entry = recv_buf; @@ -165,8 +158,8 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<kernel_params, int next_rank = (comm_addr.rank + 1) % comm_addr.size; kernel_router = base::template create_kernel_router_for_rank< - l0_alltoallv_typed_entry<kernel_params, gpu_comm_impl, topology>>( - *this, next_rank, available_devices); + l0_alltoallv_typed_entry<gpu_comm_impl, topology>>( + *this, next_rank, available_devices, base::get_params()); ENTRY_LOG_DEBUG("Init phase of current entry for ext_rank:", next_rank); @@ -188,18 +181,17 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<kernel_params, auto& main_entry_function = get_local_kernel(); - auto recv_buf_ptr = - reinterpret_cast<typename kernel_params::native_type*>(recv_buf_entry.get_ptr()); - // auto send_counts_ptr = reinterpret_cast<size_t*>(send_counts_entry.get_ptr()); + auto recv_buf_ptr = reinterpret_cast<void*>(recv_buf_entry.get_ptr()); + //create implementation specified primitives - main_entry_function.template set_args<typename kernel_main_typed::tmp_recv_buf_arg, - typename kernel_main_typed::income_data_flag_arg, - typename kernel_main_typed::ready_to_recv_flag_arg, - typename kernel_main_typed::recv_buf_arg, - typename kernel_main_typed::recv_elem_counts_buf_arg, - typename kernel_main_typed::recv_elem_offsets_buf_arg, - typename kernel_main_typed::proxy_size_flag_arg, - typename kernel_main_typed::send_buf_size_arg>( + main_entry_function.template set_args<typename 
ring::alltoallv::tmp_recv_buf_arg<void>, + typename ring::alltoallv::income_data_flag_arg, + typename ring::alltoallv::ready_to_recv_flag_arg, + typename ring::alltoallv::recv_buf_arg<void>, + typename ring::alltoallv::recv_elem_counts_buf_arg, + typename ring::alltoallv::recv_elem_offsets_buf_arg, + typename ring::alltoallv::proxy_size_flag_arg, + typename ring::alltoallv::send_buf_size_arg>( temp_buffer.get(), income_data_flag.get(), ready_to_recv_flag.get(), @@ -225,11 +217,14 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<kernel_params, std::vector<ccl_device::device_ipc_memory_handle> get_ipc_data() override { ccl_device& owned_device = parent_communicator->get_device(); - //TODO std::vector<ccl_device::device_ipc_memory_handle> ret; - ret.reserve(3); + ret.reserve(4); + ret.push_back(owned_device.create_ipc_memory_handle(temp_buffer.get(), get_ctx())); ret.push_back(owned_device.create_ipc_memory_handle(income_data_flag.get(), get_ctx())); ret.push_back(owned_device.create_ipc_memory_handle(ready_to_recv_flag.get(), get_ctx())); + ret.push_back( + owned_device.create_ipc_memory_handle(proxy_size_flag_entry.get(), get_ctx())); + return ret; } @@ -239,7 +234,7 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<kernel_params, } private: - ccl_device::device_memory<typename kernel_params::native_type> temp_buffer; + ccl_device::device_memory<uint8_t> temp_buffer; ccl_device::device_memory<income_data_flag_gpu_type> income_data_flag; ccl_device::device_memory<ready_to_recv_flag_gpu_type> ready_to_recv_flag; ccl_device::device_memory<proxy_size_flag_gpu_type> proxy_size_flag_entry; @@ -251,119 +246,72 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<kernel_params, std::shared_ptr<ccl_context> ctx; public: - bool execute(kernel_main_typed& main_entry_function, kernel_main_typed& right_kernel) { - //Check argument binding in kernels for next rank + template <class left_kernel_t, class right_kernel_t> + bool execute(left_kernel_t& left_kernel, right_kernel_t& right_kernel) { bool is_right_kernel_ready = - right_kernel.template test_args<typename kernel_main_typed::tmp_recv_buf_arg, - typename kernel_main_typed::income_data_flag_arg, - typename kernel_main_typed::ready_to_recv_flag_arg, - typename kernel_main_typed::proxy_size_flag_arg>(); - if (is_right_kernel_ready) { - //TODO do not get arguments sequencially - use array version instead - typename kernel_main_typed::tmp_recv_buf_arg::return_t right_tmp_recv_buf_arg = - right_kernel.template get_arg<typename kernel_main_typed::tmp_recv_buf_arg>(); - typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg = - right_kernel.template get_arg<typename kernel_main_typed::income_data_flag_arg>(); - typename kernel_main_typed::ready_to_recv_flag_arg::return_t - right_ready_to_recv_flag_arg = - right_kernel - .template get_arg<typename kernel_main_typed::ready_to_recv_flag_arg>(); - - typename kernel_main_typed::proxy_size_flag_arg::return_t right_proxy_size_flag_arg = - right_kernel.template get_arg<typename kernel_main_typed::proxy_size_flag_arg>(); - - ENTRY_LOG_DEBUG("Bind final arguments for kernel: ", kernel_main_typed::name()); - ENTRY_LOG_TRACE("Args: \n{ ", - right_tmp_recv_buf_arg.first, - ", ", - right_tmp_recv_buf_arg.second, - "}\n", - "{ ", - right_income_data_flag_arg.first, - ", ", - right_income_data_flag_arg.second, - "}\n", - "{ ", - right_ready_to_recv_flag_arg.first, - ", ", - right_ready_to_recv_flag_arg.second, - "}\n"); - - //TODO register argument for current device 
kernel: use array-version
-            main_entry_function
-                .template set_args<typename kernel_main_typed::right_tmp_recv_buf_arg,
-                                   typename kernel_main_typed::right_income_data_flag_arg,
-                                   typename kernel_main_typed::right_ready_to_recv_flag_arg,
-                                   typename kernel_main_typed::right_proxy_size_flag_arg>(
-                    right_tmp_recv_buf_arg.second,
-                    right_income_data_flag_arg.second,
-                    right_ready_to_recv_flag_arg.second,
-                    right_proxy_size_flag_arg.second);
-            ENTRY_LOG_DEBUG("Final Function: ", main_entry_function.to_string());
-        }
-        return is_right_kernel_ready;
-    }
-
-    bool execute(kernel_main_typed& main_entry_function, kernel_ipc_typed& right_kernel) {
-        bool is_right_kernel_ready = false;
-        /* TODO UNSUPPORTED
+            right_kernel.template test_args<typename ring::alltoallv::tmp_recv_buf_arg<void>,
+                                            typename ring::alltoallv::income_data_flag_arg,
+                                            typename ring::alltoallv::ready_to_recv_flag_arg,
+                                            typename ring::alltoallv::proxy_size_flag_arg>();
+
+        // Once we're sure that the parameters are ready, read them from the right kernel.
+        // Note: we not only read the parameters but also reset their 'ready' flag
+        // (since we're using a destructive-copying policy), meaning that they must be
+        // stored in order to be read again.
+        // This is a protection for the case of multiple kernel launches
+        // (i.e. the collective is run multiple times), where we might read out-of-date
+        // values from the previous run.
-        //Check argument binding in kernels for next rank
-        bool is_right_kernel_ready =
-            right_kernel.template test_args< //typename kernel_ipc_typed::right_output_buf_arg,
-                typename kernel_ipc_typed::income_data_flag_arg,
-                typename kernel_ipc_typed::ready_to_recv_flag_arg>();
         if (is_right_kernel_ready) {
-            //TODO do not get arguments sequencially - use array version instead
-            typename kernel_main_typed::right_output_buf_arg::return_t right_output_buf_arg =
-                right_kernel.template get_arg<typename kernel_ipc_typed::right_output_buf_arg>();
-            typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg =
-                right_kernel.template get_arg<typename kernel_ipc_typed::income_data_flag_arg>();
-            typename kernel_main_typed::ready_to_recv_flag_arg::return_t
-                right_ready_to_recv_flag_arg =
-                    right_kernel
-                        .template get_arg<typename kernel_ipc_typed::ready_to_recv_flag_arg>();
-
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ", bind elapsed arguments for kernel: ",
-                      kernel_main_typed::name());
-            LOG_TRACE("Args: \n{ ",
-                      right_output_buf_arg.first,
-                      ", ",
-                      right_output_buf_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_income_data_flag_arg.first,
-                      ", ",
-                      right_income_data_flag_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_ready_to_recv_flag_arg.first,
-                      ", ",
-                      right_ready_to_recv_flag_arg.second,
-                      "}\n");
-
-            //TODO register argument for current device kernel: user array version
-            main_entry_function
-                .template set_args< //typename kernel_main_typed::right_output_buf_arg,
-                    typename kernel_main_typed::right_income_data_flag_arg,
-                    typename kernel_main_typed::right_ready_to_recv_flag_arg>(
-                    //right_output_buf_arg.second,
-                    right_income_data_flag_arg.second,
-                    right_ready_to_recv_flag_arg.second);
-            LOG_TRACE("Set right_output_buf_arg",
-                      "Set right_income_data_flag_arg",
-                      "Set right_ready_to_recv_flag_arg");
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ". 
Function: ", - main_entry_function.to_string()); - }*/ + auto right_tmp_recv_buf_arg = + right_kernel.template get_arg<typename ring::alltoallv::tmp_recv_buf_arg<void>>(); + auto right_income_data_flag_arg = + right_kernel.template get_arg<typename ring::alltoallv::income_data_flag_arg>(); + auto right_ready_to_recv_flag_arg = + right_kernel.template get_arg<typename ring::alltoallv::ready_to_recv_flag_arg>(); + auto right_proxy_size_flag_arg = + right_kernel.template get_arg<typename ring::alltoallv::proxy_size_flag_arg>(); + + // ENTRY_LOG_DEBUG("Bind right arguments from ", + // right_kernel_t::name(), + // " kernel", + // " to ", + // left_kernel_t::name(), + // " kernel. " + // "Right arguments:\n{ ", + // right_tmp_recv_buf_arg.first, + // ", ", + // right_tmp_recv_buf_arg.second, + // "}\n", + // "{ ", + // right_income_data_flag_arg.first, + // ", ", + // right_income_data_flag_arg.second, + // "}\n", + // "{ ", + // right_ready_to_recv_flag_arg.first, + // ", ", + // right_ready_to_recv_flag_arg.second, + // "}\n", + // "{ ", + // right_proxy_size_flag_arg.first, + // ", ", + // right_proxy_size_flag_arg.second, + // "}\n"); + + left_kernel.template set_args<typename ring::alltoallv::right_tmp_recv_buf_arg<void>, + typename ring::alltoallv::right_income_data_flag_arg, + typename ring::alltoallv::right_ready_to_recv_flag_arg, + typename ring::alltoallv::right_proxy_size_flag_arg>( + right_tmp_recv_buf_arg.second, + right_income_data_flag_arg.second, + right_ready_to_recv_flag_arg.second, + right_proxy_size_flag_arg.second); + + ENTRY_LOG_DEBUG("Binding arguments between kernels is complete. ", + "Arguments of the left kernel after binding:\n", + left_kernel.to_string()); + } return is_right_kernel_ready; } }; diff --git a/src/sched/entry/l0/l0_bcast_typed_entry.hpp b/src/sched/entry/l0/l0_bcast_typed_entry.hpp index b40141fe1..c40b7ddd3 100644 --- a/src/sched/entry/l0/l0_bcast_typed_entry.hpp +++ b/src/sched/entry/l0/l0_bcast_typed_entry.hpp @@ -21,9 +21,8 @@ //TODO L0 Workaround namespace native { -template <class kernel_params, class gpu_comm_impl, ccl::group_split_type topology> -class l0_bcast_typed_entry : public base_gpu_entry<kernel_params, - gpu_comm_impl, +template <class gpu_comm_impl, ccl::group_split_type topology> +class l0_bcast_typed_entry : public base_gpu_entry<gpu_comm_impl, topology, ccl::device_topology_type::ring, ccl_coll_bcast> { @@ -31,11 +30,8 @@ class l0_bcast_typed_entry : public base_gpu_entry<kernel_params, friend class ccl_gpu_comm; friend class ccl_virtual_gpu_comm; - using base = base_gpu_entry<kernel_params, - gpu_comm_impl, - topology, - ccl::device_topology_type::ring, - ccl_coll_bcast>; + using base = + base_gpu_entry<gpu_comm_impl, topology, ccl::device_topology_type::ring, ccl_coll_bcast>; using base::parent_communicator; using base::comm_addr; using base::req; @@ -44,15 +40,15 @@ class l0_bcast_typed_entry : public base_gpu_entry<kernel_params, using base::kernel_router; using base::get_ctx; using base::get_local_kernel; - using kernel_main_typed = ring_bcast_kernel<kernel_params>; - using kernel_ipc_typed = ring_bcast_ipc<kernel_params>; + using kernel_main_typed = ring::bcast::main_kernel; + using processing_type = void; using income_data_flag_gpu_type = - typename std::remove_pointer<typename kernel_main_typed::income_data_flag_arg_type>::type; + typename std::remove_pointer<typename ring::bcast::income_data_flag_arg_type>::type; using ready_to_recv_flag_gpu_type = - typename std::remove_pointer<typename 
kernel_main_typed::ready_to_recv_flag_arg_type>::type; + typename std::remove_pointer<typename ring::bcast::ready_to_recv_flag_arg_type>::type; using local_barrier_flag_gpu_type = - typename std::remove_pointer<typename kernel_main_typed::local_barrier_flag_arg_type>::type; + typename std::remove_pointer<typename ring::bcast::local_barrier_flag_arg_type>::type; static constexpr const char* class_name() noexcept { return "L0_BCAST_TYPED"; @@ -70,24 +66,20 @@ class l0_bcast_typed_entry : public base_gpu_entry<kernel_params, ccl_buffer buf, size_t cnt, int root, + const coll_param_gpu& params, std::shared_ptr<ccl_stream> device_stream = std::shared_ptr<ccl_stream>()) - : base(sched, - comm, - in_ctx, - buf, - ccl::native_type_info<typename kernel_params::native_type>::dtype, - device_stream), - - income_data_flag(this->template alloc_memory_wrap( - typename kernel_main_typed::income_data_flag_arg{}, - parent_communicator, - 1, - get_ctx())), - ready_to_recv_flag(this->template alloc_memory_wrap( - typename kernel_main_typed::ready_to_recv_flag_arg{}, - parent_communicator, - 1, - get_ctx())), + : base(sched, comm, in_ctx, buf, params, device_stream), + + income_data_flag( + this->template alloc_memory_wrap(typename ring::bcast::income_data_flag_arg{}, + parent_communicator, + 1, + get_ctx())), + ready_to_recv_flag( + this->template alloc_memory_wrap(typename ring::bcast::ready_to_recv_flag_arg{}, + parent_communicator, + 1, + get_ctx())), local_barrier_flag(parent_communicator->get_device() .template alloc_memory<local_barrier_flag_gpu_type>( 1, @@ -98,8 +90,8 @@ class l0_bcast_typed_entry : public base_gpu_entry<kernel_params, int next_rank = (comm_addr.rank + 1) % comm_addr.size; kernel_router = base::template create_kernel_router_for_rank< - l0_bcast_typed_entry<kernel_params, gpu_comm_impl, topology>>( - *this, next_rank, available_devices); + l0_bcast_typed_entry<gpu_comm_impl, topology>>( + *this, next_rank, available_devices, base::get_params()); ENTRY_LOG_DEBUG("Init phase of current entry for ext_rank:", next_rank); @@ -123,10 +115,10 @@ class l0_bcast_typed_entry : public base_gpu_entry<kernel_params, //create implementation specified primitives main_entry_function - .template set_args<typename kernel_main_typed::income_data_flag_arg, - typename kernel_main_typed::ready_to_recv_flag_arg, - typename kernel_main_typed::local_barrier_flag_arg, - typename kernel_main_typed::root_arg, + .template set_args<typename ring::bcast::income_data_flag_arg, + typename ring::bcast::ready_to_recv_flag_arg, + typename ring::bcast::local_barrier_flag_arg, + typename ring::bcast::root_arg, typename kernel_main_typed::common_entry_buf_size_arg>( income_data_flag.get(), ready_to_recv_flag.get(), @@ -150,9 +142,11 @@ class l0_bcast_typed_entry : public base_gpu_entry<kernel_params, std::vector<ccl_device::device_ipc_memory_handle> get_ipc_data() override { ccl_device& owned_device = parent_communicator->get_device(); - //TODO + auto recv_buf_ptr = reinterpret_cast<processing_type*>(base::send_buf.get_ptr()); + std::vector<ccl_device::device_ipc_memory_handle> ret; - ret.reserve(2); + ret.reserve(3); + ret.push_back(owned_device.create_ipc_memory_handle(recv_buf_ptr, get_ctx())); ret.push_back(owned_device.create_ipc_memory_handle(income_data_flag.get(), get_ctx())); ret.push_back(owned_device.create_ipc_memory_handle(ready_to_recv_flag.get(), get_ctx())); return ret; @@ -172,109 +166,61 @@ class l0_bcast_typed_entry : public base_gpu_entry<kernel_params, std::shared_ptr<ccl_context> ctx; public: - bool 
execute(kernel_main_typed& main_entry_function, kernel_main_typed& right_kernel) {
-        //Check argument binding in kernels for next rank
+    template <class left_kernel_t, class right_kernel_t>
+    bool execute(left_kernel_t& left_kernel, right_kernel_t& right_kernel) {
         bool is_right_kernel_ready =
-            right_kernel.template test_args<typename kernel_main_typed::common_entry_buf_arg,
-                                            typename kernel_main_typed::income_data_flag_arg,
-                                            typename kernel_main_typed::ready_to_recv_flag_arg>();
-        if (is_right_kernel_ready) {
-            //TODO do not get arguments sequencially - use array version instead
-            typename kernel_main_typed::common_entry_buf_arg::return_t right_buf_arg =
-                right_kernel.template get_arg<typename kernel_main_typed::common_entry_buf_arg>();
-            typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg =
-                right_kernel.template get_arg<typename kernel_main_typed::income_data_flag_arg>();
-            typename kernel_main_typed::ready_to_recv_flag_arg::return_t
-                right_ready_to_recv_flag_arg =
-                    right_kernel
-                        .template get_arg<typename kernel_main_typed::ready_to_recv_flag_arg>();
+            right_kernel.template test_args<typename ring::bcast::buf_arg<processing_type>,
+                                            typename ring::bcast::income_data_flag_arg,
+                                            typename ring::bcast::ready_to_recv_flag_arg>();
+
+        // Once we're sure that the parameters are ready, read them from the right kernel.
+        // Note: we not only read the parameters but also reset their 'ready' flag
+        // (since we're using a destructive-copying policy), meaning that they must be
+        // stored in order to be read again.
+        // This is a protection for the case of multiple kernel launches
+        // (i.e. the collective is run multiple times), where we might read out-of-date
+        // values from the previous run.
-            ENTRY_LOG_DEBUG("Bind final arguments for kernel: ", kernel_main_typed::name());
-            ENTRY_LOG_TRACE("Args: \n{ ",
-                            right_buf_arg.first,
-                            ", ",
-                            right_buf_arg.second,
-                            "}\n",
-                            "{ ",
-                            right_income_data_flag_arg.first,
-                            ", ",
-                            right_income_data_flag_arg.second,
-                            "}\n",
-                            "{ ",
-                            right_ready_to_recv_flag_arg.first,
-                            ", ",
-                            right_ready_to_recv_flag_arg.second,
-                            "}\n");
-
-            //TODO register argument for current device kernel: use array-version
-            main_entry_function
-                .template set_args<typename kernel_main_typed::right_buf_arg,
-                                   typename kernel_main_typed::right_income_data_flag_arg,
-                                   typename kernel_main_typed::right_ready_to_recv_flag_arg>(
-                    right_buf_arg.second,
-                    right_income_data_flag_arg.second,
-                    right_ready_to_recv_flag_arg.second);
-            ENTRY_LOG_DEBUG("Final Function: ", main_entry_function.to_string());
-        }
-        return is_right_kernel_ready;
-    }
-
-    bool execute(kernel_main_typed& main_entry_function, kernel_ipc_typed& right_kernel) {
-        //Check argument binding in kernels for next rank
-        bool is_right_kernel_ready =
-            right_kernel.template test_args<typename kernel_ipc_typed::recv_buf_arg,
-                                            typename kernel_ipc_typed::income_data_flag_arg,
-                                            typename kernel_ipc_typed::ready_to_recv_flag_arg>();
         if (is_right_kernel_ready) {
-            //TODO do not get arguments sequencially - use array version instead
-            typename kernel_main_typed::common_entry_buf_arg::return_t right_buf_arg =
-                right_kernel.template get_arg<typename kernel_ipc_typed::recv_buf_arg>();
-            typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg =
-                right_kernel.template get_arg<typename kernel_ipc_typed::income_data_flag_arg>();
-            typename kernel_main_typed::ready_to_recv_flag_arg::return_t
-                right_ready_to_recv_flag_arg =
-                    right_kernel
-                        .template get_arg<typename 
kernel_ipc_typed::ready_to_recv_flag_arg>(); - - LOG_DEBUG("entry: ", - class_name(), - ", rank: ", - comm_addr.to_string(), - ", bind elapsed arguments for kernel: ", - kernel_main_typed::name()); - LOG_TRACE("Args: \n{ ", - right_buf_arg.first, - ", ", - right_buf_arg.second, - "}\n", - "{ ", - right_income_data_flag_arg.first, - ", ", - right_income_data_flag_arg.second, - "}\n", - "{ ", - right_ready_to_recv_flag_arg.first, - ", ", - right_ready_to_recv_flag_arg.second, - "}\n"); - - //TODO register argument for current device kernel: user array version - main_entry_function - .template set_args<typename kernel_main_typed::right_buf_arg, - typename kernel_main_typed::right_income_data_flag_arg, - typename kernel_main_typed::right_ready_to_recv_flag_arg>( - right_buf_arg.second, - right_income_data_flag_arg.second, - right_ready_to_recv_flag_arg.second); - LOG_TRACE("Set right_tmp_recv_buf_arg", - "Set right_income_data_flag_arg", - "Set right_ready_to_recv_flag_arg"); - LOG_DEBUG("entry: ", - class_name(), - ", rank: ", - comm_addr.to_string(), - ". Function: ", - main_entry_function.to_string()); + auto right_buf_arg = + right_kernel.template get_arg<typename ring::bcast::buf_arg<processing_type>>(); + auto right_income_data_flag_arg = + right_kernel.template get_arg<typename ring::bcast::income_data_flag_arg>(); + auto right_ready_to_recv_flag_arg = + right_kernel.template get_arg<typename ring::bcast::ready_to_recv_flag_arg>(); + + // ENTRY_LOG_DEBUG("Bind right arguments from ", + // right_kernel_t::name(), + // " kernel", + // " to ", + // left_kernel_t::name(), + // " kernel. " + // "Right arguments:\n{ ", + // right_buf_arg.first, + // ", ", + // right_buf_arg.second, + // "}\n", + // "{ ", + // right_income_data_flag_arg.first, + // ", ", + // right_income_data_flag_arg.second, + // "}\n", + // "{ ", + // right_ready_to_recv_flag_arg.first, + // ", ", + // right_ready_to_recv_flag_arg.second, + // "}\n"); + + left_kernel.template set_args<typename ring::bcast::right_buf_arg<processing_type>, + typename ring::bcast::right_income_data_flag_arg, + typename ring::bcast::right_ready_to_recv_flag_arg>( + right_buf_arg.second, + right_income_data_flag_arg.second, + right_ready_to_recv_flag_arg.second); + + ENTRY_LOG_DEBUG("Binding arguments between kernels is complete. 
", + "Arguments of the left kernel after binding:\n", + left_kernel.to_string()); } return is_right_kernel_ready; } diff --git a/src/sched/entry/l0/l0_entry.hpp b/src/sched/entry/l0/l0_entry.hpp index ed2598f11..7bcb1add3 100644 --- a/src/sched/entry/l0/l0_entry.hpp +++ b/src/sched/entry/l0/l0_entry.hpp @@ -30,8 +30,8 @@ #include "common/global/global.hpp" #include "common/stream/stream.hpp" -#include "common/comm/l0/context/scaling_ctx/ipc_session_key.hpp" -#include "common/comm/l0/context/scaling_ctx/observer_session_key.hpp" +#include "common/comm/l0/context/scale/ipc/ipc_session_key.hpp" +#include "common/comm/l0/context/scale/base/base_session.hpp" //TODO L0 Workaround #include <unistd.h> @@ -149,19 +149,16 @@ inline std::string to_string(gpu_entry_state state) { } namespace native { -template <class kernel_params, - class gpu_comm_impl, +template <class gpu_comm_impl, ccl::group_split_type group_id, ccl::device_topology_type class_id, ccl_coll_type type_op> class base_gpu_entry : public sched_entry { public: using gpu_comm = gpu_comm_impl; - using processing_type = typename kernel_params::native_type; - using kernel_main_typed = - typename gpu_comm::template gpu_kernel_t<type_op, group_id, class_id, kernel_params>; - using kernel_ipc_typed = typename ccl_ipc_gpu_comm:: - template gpu_kernel_t<type_op, group_id, class_id, kernel_params>; + using kernel_main_typed = typename gpu_comm::template gpu_kernel_t<type_op, group_id, class_id>; + using kernel_ipc_typed = + typename ccl_ipc_gpu_comm::template gpu_kernel_t<type_op, group_id, class_id>; template <class elem_t> using device_memory = memory<elem_t, ccl_device, ccl_context>; @@ -188,14 +185,14 @@ class base_gpu_entry : public sched_entry { std::shared_ptr<gpu_comm> comm, ccl_driver_context_ptr in_ctx, const ccl_buffer send_buf, - ccl::datatype dtype_in, + const coll_param_gpu ¶ms, std::shared_ptr<ccl_stream> &stream) : sched_entry(sched), parent_communicator(comm), comm_addr(parent_communicator ->template get_comm_data<get_topology(), get_topology_class()>()), send_buf(send_buf), - dtype(dtype_in), + params(params), device_stream(stream), ctx(in_ctx), entry_state(gpu_entry_state::initial), @@ -209,10 +206,8 @@ class base_gpu_entry : public sched_entry { } kernel_main_typed &get_local_kernel() noexcept { - return parent_communicator->template get_gpu_kernel<type(), - get_topology(), - get_topology_class(), - kernel_params>(); + return parent_communicator + ->template get_gpu_kernel<type(), get_topology(), get_topology_class()>(params); } virtual ~base_gpu_entry() {} @@ -234,11 +229,11 @@ class base_gpu_entry : public sched_entry { //set kernel args for main kernel on current device kernel_main_typed &main_entry_function = - parent_communicator->template register_entry<kernel_params, group_id, class_id>(*this); + parent_communicator->template register_entry<group_id, class_id>(*this); - auto send_buf_ptr = - reinterpret_cast<typename kernel_params::native_type *>(send_buf.get_ptr()); + auto send_buf_ptr = send_buf.get_ptr(); + //bind data main_entry_function.template set_args<typename kernel_main_typed::common_entry_buf_arg>( send_buf_ptr); @@ -254,6 +249,7 @@ class base_gpu_entry : public sched_entry { virtual void update() override { if (!ready_to_exec) { + // TODO: what if submit_for_execution() return false? 
submit_for_execution();
         }
         else {
             ENTRY_LOG_TRACE(" waiting for finished execution, queue: ", cmd_queue.get());
 
-            ze_result_t ret = get_fence_impl().query_status();
+            ze_result_t ret;
+
+            // Querying the fence doesn't sync kernel output with the host, so when we
+            // need that we use the zeCommandQueueSynchronize API instead.
+            if (ccl::global_data::env().comm_kernels_debug == 0) {
+                ret = get_fence_impl().query_status();
+            }
+            else {
+                ret = zeCommandQueueSynchronize(cmd_queue.get(), 0);
+            }
+
             ENTRY_LOG_TRACE(
                 "Fence query status: ", native::to_string(ret), ", queue: ", cmd_queue.get());
             if (ret == ZE_RESULT_SUCCESS) {
@@ -300,7 +306,13 @@ class base_gpu_entry : public sched_entry {
     //USE GPU cache binding
     virtual std::vector<ccl_device::device_ipc_memory_handle> get_ipc_data() = 0;
 
-    virtual observer::invoke_params<type(), kernel_params> get_numa_data() {
+    virtual observer::invoke_params<type()> get_numa_data() {
+        //TODO make pure-virtual
+        ENTRY_LOG_ERROR("NOT implemented for that collective type");
+        abort();
+    }
+
+    virtual observer::invoke_params<type()> get_scaleout_data() {
         //TODO make pure-virtual
         ENTRY_LOG_ERROR("NOT implemented for that collective type");
         abort();
@@ -314,11 +326,19 @@ class base_gpu_entry : public sched_entry {
         return native::observer::session_key{ this };
     }
 
+    virtual native::observer::session_key get_scaleout_session_key() const {
+        return native::observer::session_key{ this };
+    }
+
+    const coll_param_gpu &get_params() const {
+        return params;
+    }
+
 protected:
     size_t get_work_group_size(size_t buffer_size, ccl_device &device) {
         size_t group_size;
         size_t val_vector_size;
-        auto dtype = ccl::native_type_info<typename kernel_params::native_type>::dtype;
+        auto dtype = params.get_datatype();
 
         if (ccl::global_data::env().gpu_thread_count != CCL_ENV_SIZET_NOT_SPECIFIED) {
             group_size = ccl::global_data::env().gpu_thread_count;
@@ -431,9 +451,12 @@ class base_gpu_entry : public sched_entry {
         assert(this->get_state() != gpu_entry_state::wait_for_completion);
 
         if (get_topology() == ccl::group_split_type::cluster) {
-            // TODO: implement process communicator case
-            throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                                 "TODO: implement process communicator case");
+            // TODO: in case of (virtual device + IPC) we can get a data race here.
+            // How can we detect such a case?
+            // In the case when we use one GPU queue per process, everything should be ok
+            // throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
+            //                      "TODO: implement process communicator case");
+            cmd_list.close_and_execute(get_ctx(), this->get_fence());
         }
         else {
             // TODO: how to ensure that fence update is thread safe?
@@ -502,7 +525,10 @@ class base_gpu_entry : public sched_entry {
     std::shared_ptr<gpu_comm> parent_communicator;
     topology_addr<group_id, class_id> comm_addr;
     ccl_buffer send_buf;
-    ccl::datatype dtype;
+    coll_param_gpu params;
+
+    // TODO: we don't need dtype anymore?
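+    // With coll_param_gpu in place, the datatype is queried on demand instead of being
+    // stored as a member; a sketch of the intended lookup (get_datatype() and
+    // ccl::get_datatype_size() are the accessors already used elsewhere in this patch):
+    //
+    //   ccl::datatype dt = get_params().get_datatype();
+    //   size_t elem_size = ccl::get_datatype_size(dt); // bytes per element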
+ // ccl::datatype dtype; atl_req_t req{}; std::shared_ptr<ccl_stream> device_stream; // GPU @@ -535,7 +561,8 @@ class base_gpu_entry : public sched_entry { static std::unique_ptr<base_connector_interface<kernel_main_typed>> create_kernel_router_for_rank(executor &exec, int next_rank, - specific_indexed_device_storage &group_devices) { + specific_indexed_device_storage &group_devices, + const coll_param_gpu ¶ms) { std::unique_ptr<base_connector_interface<kernel_main_typed>> kernel_router; while (!kernel_router) { //Gather data from in-process GPU @@ -548,10 +575,10 @@ class base_gpu_entry : public sched_entry { std::shared_ptr<right_gpu_type> gpu = it->second; using right_kernel_main_type = typename right_gpu_type:: - template gpu_kernel_t<type(), get_topology(), get_topology_class(), kernel_params>; + template gpu_kernel_t<type(), get_topology(), get_topology_class()>; right_kernel_main_type &right_main_func = - gpu->get_gpu_kernel<type(), get_topology(), get_topology_class(), kernel_params>(); + gpu->get_gpu_kernel<type(), get_topology(), get_topology_class()>(params); //communicate with real device kernel_router.reset( @@ -570,10 +597,10 @@ class base_gpu_entry : public sched_entry { std::shared_ptr<right_gpu_type> gpu = it->second; using right_kernel_main_type = typename right_gpu_type:: - template gpu_kernel_t<type(), get_topology(), get_topology_class(), kernel_params>; + template gpu_kernel_t<type(), get_topology(), get_topology_class()>; right_kernel_main_type &right_main_func = - gpu->get_gpu_kernel<type(), get_topology(), get_topology_class(), kernel_params>(); + gpu->get_gpu_kernel<type(), get_topology(), get_topology_class()>(params); kernel_router.reset( new kernel_connector<kernel_main_typed, executor, right_kernel_main_type>( exec, right_main_func)); @@ -590,14 +617,14 @@ class base_gpu_entry : public sched_entry { } std::shared_ptr<right_gpu_type> gpu = it->second; using right_kernel_main_type = typename right_gpu_type:: - template gpu_kernel_t<type(), get_topology(), get_topology_class(), kernel_params>; + template gpu_kernel_t<type(), get_topology(), get_topology_class()>; /*std::shared_ptr<thread_group_comm_device> gpu = map_devices.find(next_rank); if(gpu == nullptr) { break; // not ready yet! }*/ right_kernel_main_type &right_main_func = - gpu->get_gpu_kernel<type(), get_topology(), get_topology_class(), kernel_params>(); + gpu->get_gpu_kernel<type(), get_topology(), get_topology_class()>(params); //communicate with real device from another thread kernel_router.reset( @@ -616,7 +643,7 @@ class base_gpu_entry : public sched_entry { } std::shared_ptr<right_gpu_type> gpu = it->second; using right_kernel_main_type = typename right_gpu_type:: - template gpu_kernel_t<type(), get_topology(), get_topology_class(), kernel_params>; + template gpu_kernel_t<type(), get_topology(), get_topology_class()>; /* std::shared_ptr<thread_group_comm_device> gpu = map_devices.find(next_rank); if(gpu == nullptr) @@ -624,7 +651,7 @@ class base_gpu_entry : public sched_entry { break; // not ready yet! 
}*/ right_kernel_main_type &right_main_func = - gpu->get_gpu_kernel<type(), get_topology(), get_topology_class(), kernel_params>(); + gpu->get_gpu_kernel<type(), get_topology(), get_topology_class()>(params); //communicate with virtual device from another thread kernel_router.reset( @@ -643,9 +670,9 @@ class base_gpu_entry : public sched_entry { } std::shared_ptr<right_gpu_type> gpu = it->second; using right_kernel_main_type = typename right_gpu_type:: - template gpu_kernel_t<type(), get_topology(), get_topology_class(), kernel_params>; + template gpu_kernel_t<type(), get_topology(), get_topology_class()>; right_kernel_main_type &right_main_func = - gpu->get_gpu_kernel<type(), get_topology(), get_topology_class(), kernel_params>(); + gpu->get_gpu_kernel<type(), get_topology(), get_topology_class()>(params); //communicate with real device from another thread kernel_router.reset( @@ -665,9 +692,9 @@ class base_gpu_entry : public sched_entry { std::shared_ptr<right_gpu_type> gpu = it->second; using right_kernel_main_type = typename right_gpu_type:: - template gpu_kernel_t<type(), get_topology(), get_topology_class(), kernel_params>; + template gpu_kernel_t<type(), get_topology(), get_topology_class()>; right_kernel_main_type &right_main_func = - gpu->get_gpu_kernel<type(), get_topology(), get_topology_class(), kernel_params>(); + gpu->get_gpu_kernel<type(), get_topology(), get_topology_class()>(params); //communicate with virtual device from another thread kernel_router.reset( @@ -686,9 +713,9 @@ class base_gpu_entry : public sched_entry { } std::shared_ptr<right_gpu_type> gpu = it->second; using right_kernel_main_type = typename right_gpu_type:: - template gpu_kernel_t<type(), get_topology(), get_topology_class(), kernel_params>; + template gpu_kernel_t<type(), get_topology(), get_topology_class()>; right_kernel_main_type &right_main_func = - gpu->get_gpu_kernel<type(), get_topology(), get_topology_class(), kernel_params>(); + gpu->get_gpu_kernel<type(), get_topology(), get_topology_class()>(params); //communicate with virtual device from another thread kernel_router.reset( diff --git a/src/sched/entry/l0/l0_reduce_scatter_typed_entry.hpp b/src/sched/entry/l0/l0_reduce_scatter_typed_entry.hpp index 85afe5dde..2e5f1b616 100644 --- a/src/sched/entry/l0/l0_reduce_scatter_typed_entry.hpp +++ b/src/sched/entry/l0/l0_reduce_scatter_typed_entry.hpp @@ -19,12 +19,13 @@ #include "sched/entry/l0/l0_entry.hpp" +#include "kernels/shared.h" + //TODO L0 Workaround namespace native { -template <class kernel_params, class gpu_comm_impl, ccl::group_split_type topology> -class l0_reduce_scatter_typed_entry : public base_gpu_entry<kernel_params, - gpu_comm_impl, +template <class gpu_comm_impl, ccl::group_split_type topology> +class l0_reduce_scatter_typed_entry : public base_gpu_entry<gpu_comm_impl, topology, ccl::device_topology_type::ring, ccl_coll_reduce_scatter> { @@ -32,8 +33,7 @@ class l0_reduce_scatter_typed_entry : public base_gpu_entry<kernel_params, friend class ccl_gpu_comm; friend class ccl_virtual_gpu_comm; - using base = base_gpu_entry<kernel_params, - gpu_comm_impl, + using base = base_gpu_entry<gpu_comm_impl, topology, ccl::device_topology_type::ring, ccl_coll_reduce_scatter>; @@ -45,16 +45,14 @@ class l0_reduce_scatter_typed_entry : public base_gpu_entry<kernel_params, using base::kernel_router; using base::get_ctx; using base::get_local_kernel; + using kernel_main_typed = ring::reduce_scatter::main_kernel; - using kernel_main_typed = ring_reduce_scatter_kernel<kernel_params>; - using 
kernel_ipc_typed = ring_reduce_scatter_ipc<kernel_params>; - - using income_data_flag_gpu_type = - typename std::remove_pointer<typename kernel_main_typed::income_data_flag_arg_type>::type; - using ready_to_recv_flag_gpu_type = - typename std::remove_pointer<typename kernel_main_typed::ready_to_recv_flag_arg_type>::type; - using local_barrier_flag_gpu_type = - typename std::remove_pointer<typename kernel_main_typed::local_barrier_flag_arg_type>::type; + using income_data_flag_gpu_type = typename std::remove_pointer< + typename ring::reduce_scatter::income_data_flag_arg_type>::type; + using ready_to_recv_flag_gpu_type = typename std::remove_pointer< + typename ring::reduce_scatter::ready_to_recv_flag_arg_type>::type; + using local_barrier_flag_gpu_type = typename std::remove_pointer< + typename ring::reduce_scatter::local_barrier_flag_arg_type>::type; static constexpr const char* class_name() noexcept { return "L0_REDUCE_SCATTER_TYPED"; @@ -73,27 +71,23 @@ class l0_reduce_scatter_typed_entry : public base_gpu_entry<kernel_params, const ccl_buffer send_buf, ccl_buffer recv_buf, size_t cnt, - ccl::reduction op, + const coll_param_gpu& params, std::shared_ptr<ccl_stream> device_stream = std::shared_ptr<ccl_stream>()) - : base(sched, - comm, - in_ctx, - send_buf, - ccl::native_type_info<typename kernel_params::native_type>::dtype, - device_stream), - - temp_buffer( - this->template alloc_memory_wrap(typename kernel_main_typed::tmp_recv_buf_arg{}, - parent_communicator, - cnt, - get_ctx())), + : base(sched, comm, in_ctx, send_buf, params, device_stream), + + temp_buffer(this->template alloc_memory_wrap( + typename ring::reduce_scatter::tmp_recv_buf_arg<uint8_t>{}, + parent_communicator, + ring_reduce_scatter_tmp_buffer_size(cnt, base::comm_addr.size) * + ccl::get_datatype_size(params.get_datatype()), + get_ctx())), income_data_flag(this->template alloc_memory_wrap( - typename kernel_main_typed::income_data_flag_arg{}, + typename ring::reduce_scatter::income_data_flag_arg{}, parent_communicator, 1, get_ctx())), ready_to_recv_flag(this->template alloc_memory_wrap( - typename kernel_main_typed::ready_to_recv_flag_arg{}, + typename ring::reduce_scatter::ready_to_recv_flag_arg{}, parent_communicator, 1, get_ctx())), @@ -103,13 +97,12 @@ class l0_reduce_scatter_typed_entry : public base_gpu_entry<kernel_params, sizeof(local_barrier_flag_gpu_type), get_ctx())) { recv_buf_typed_entry = recv_buf; - op_typed_entry = op; cnt_entry = cnt; int next_rank = (comm_addr.rank + 1) % comm_addr.size; kernel_router = base::template create_kernel_router_for_rank< - l0_reduce_scatter_typed_entry<kernel_params, gpu_comm_impl, topology>>( - *this, next_rank, available_devices); + l0_reduce_scatter_typed_entry<gpu_comm_impl, topology>>( + *this, next_rank, available_devices, base::get_params()); ENTRY_LOG_DEBUG("Init phase of current entry for ext_rank:", next_rank); @@ -131,15 +124,15 @@ class l0_reduce_scatter_typed_entry : public base_gpu_entry<kernel_params, auto& main_entry_function = get_local_kernel(); - auto recv_buf_ptr = - reinterpret_cast<typename kernel_params::native_type*>(recv_buf_typed_entry.get_ptr()); + auto recv_buf_ptr = reinterpret_cast<void*>(recv_buf_typed_entry.get_ptr()); + //create implementation specified primitives main_entry_function - .template set_args<typename kernel_main_typed::tmp_recv_buf_arg, - typename kernel_main_typed::income_data_flag_arg, - typename kernel_main_typed::ready_to_recv_flag_arg, - typename kernel_main_typed::local_barrier_flag_arg, - typename 
kernel_main_typed::recv_buf_arg,
+            .template set_args<typename ring::reduce_scatter::tmp_recv_buf_arg<void>,
+                               typename ring::reduce_scatter::income_data_flag_arg,
+                               typename ring::reduce_scatter::ready_to_recv_flag_arg,
+                               typename ring::reduce_scatter::local_barrier_flag_arg,
+                               typename ring::reduce_scatter::recv_buf_arg<void>,
                                typename kernel_main_typed::common_entry_buf_size_arg>(
                 temp_buffer.get(),
                 income_data_flag.get(),
@@ -164,9 +157,11 @@ class l0_reduce_scatter_typed_entry : public base_gpu_entry<kernel_params,
     std::vector<ccl_device::device_ipc_memory_handle> get_ipc_data() override {
         ccl_device& owned_device = parent_communicator->get_device();
 
-        //TODO
+        auto recv_buf_ptr = reinterpret_cast<void*>(recv_buf_typed_entry.get_ptr());
+
         std::vector<ccl_device::device_ipc_memory_handle> ret;
-        ret.reserve(3);
+        ret.reserve(4);
+        ret.push_back(owned_device.create_ipc_memory_handle(recv_buf_ptr, get_ctx()));
         ret.push_back(owned_device.create_ipc_memory_handle(temp_buffer.get(), get_ctx()));
         ret.push_back(owned_device.create_ipc_memory_handle(income_data_flag.get(), get_ctx()));
         ret.push_back(owned_device.create_ipc_memory_handle(ready_to_recv_flag.get(), get_ctx()));
@@ -179,137 +174,85 @@
     }
 
 private:
-    ccl_device::device_memory<typename kernel_params::native_type> temp_buffer;
+    ccl_device::device_memory<> temp_buffer;
     ccl_device::device_memory<income_data_flag_gpu_type> income_data_flag;
     ccl_device::device_memory<ready_to_recv_flag_gpu_type> ready_to_recv_flag;
     ccl_device::device_memory<local_barrier_flag_gpu_type> local_barrier_flag;
-    ccl::reduction op_typed_entry;
     ccl_buffer recv_buf_typed_entry;
     size_t cnt_entry;
     std::shared_ptr<ccl_context> ctx;
 
 public:
-    bool execute(kernel_main_typed& main_entry_function, kernel_main_typed& right_kernel) {
-        //Check argument binding in kernels for next rank
+    template <class left_kernel_t, class right_kernel_t>
+    bool execute(left_kernel_t& left_kernel, right_kernel_t& right_kernel) {
         bool is_right_kernel_ready =
-            right_kernel.template test_args<typename kernel_main_typed::recv_buf_arg,
-                                            typename kernel_main_typed::tmp_recv_buf_arg,
-                                            typename kernel_main_typed::income_data_flag_arg,
-                                            typename kernel_main_typed::ready_to_recv_flag_arg>();
+            right_kernel
+                .template test_args<typename ring::reduce_scatter::recv_buf_arg<void>,
+                                    typename ring::reduce_scatter::tmp_recv_buf_arg<void>,
+                                    typename ring::reduce_scatter::income_data_flag_arg,
+                                    typename ring::reduce_scatter::ready_to_recv_flag_arg>();
+
+        // Once we're sure that the parameters are ready, read them from the right kernel.
+        // Note: we not only read the parameters but also reset their 'ready' flag
+        // (since we're using a destructive-copying policy), meaning that they must be
+        // stored in order to be read again.
+        // This is a protection for the case of multiple kernel launches
+        // (i.e. the collective is run multiple times), where we might read out-of-date
+        // values from the previous run.
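+        // The exchange below is the same three-step handshake every ring entry in this
+        // patch uses; an illustrative sketch with abbreviated names (not part of the
+        // patch itself):
+        //
+        //   if (right.template test_args<Args...>()) {        // 1. right kernel published its args
+        //       auto arg = right.template get_arg<Arg>();     // 2. destructive read: value + 'ready' reset
+        //       left.template set_args<RightArg>(arg.second); // 3. bind into our kernel's right_* slots
+        //   }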
+ if (is_right_kernel_ready) { - //TODO do not get arguments sequencially - use array version instead - typename kernel_main_typed::recv_buf_arg::return_t right_output_buf_arg = - right_kernel.template get_arg<typename kernel_main_typed::recv_buf_arg>(); - typename kernel_main_typed::tmp_recv_buf_arg::return_t right_tmp_recv_buf_arg = - right_kernel.template get_arg<typename kernel_main_typed::tmp_recv_buf_arg>(); - typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg = - right_kernel.template get_arg<typename kernel_main_typed::income_data_flag_arg>(); - typename kernel_main_typed::ready_to_recv_flag_arg::return_t - right_ready_to_recv_flag_arg = - right_kernel - .template get_arg<typename kernel_main_typed::ready_to_recv_flag_arg>(); - - LOG_DEBUG("entry: ", - class_name(), - ", rank: ", - comm_addr.to_string(), - ", bind elapsed arguments for kernel: ", - kernel_main_typed::name()); - LOG_TRACE("Args: \n{ ", - right_tmp_recv_buf_arg.first, - ", ", - right_tmp_recv_buf_arg.second, - "}\n", - "{ ", - right_income_data_flag_arg.first, - ", ", - right_income_data_flag_arg.second, - "}\n", - "{ ", - right_ready_to_recv_flag_arg.first, - ", ", - right_ready_to_recv_flag_arg.second, - "}\n"); - - //TODO register argument for current device kernel: use array-version - main_entry_function - .template set_args<typename kernel_main_typed::right_output_buf_arg, - typename kernel_main_typed::right_tmp_recv_buf_arg, - typename kernel_main_typed::right_income_data_flag_arg, - typename kernel_main_typed::right_ready_to_recv_flag_arg>( + auto right_output_buf_arg = + right_kernel.template get_arg<typename ring::reduce_scatter::recv_buf_arg<void>>(); + auto right_tmp_recv_buf_arg = + right_kernel + .template get_arg<typename ring::reduce_scatter::tmp_recv_buf_arg<void>>(); + auto right_income_data_flag_arg = + right_kernel + .template get_arg<typename ring::reduce_scatter::income_data_flag_arg>(); + auto right_ready_to_recv_flag_arg = + right_kernel + .template get_arg<typename ring::reduce_scatter::ready_to_recv_flag_arg>(); + + // ENTRY_LOG_DEBUG("Bind right arguments from ", + // right_kernel_t::name(), + // " kernel", + // " to ", + // left_kernel_t::name(), + // " kernel. " + // "Right arguments:\n{ ", + // right_output_buf_arg.first, + // ", ", + // right_output_buf_arg.second, + // "}\n", + // "{ ", + // right_tmp_recv_buf_arg.first, + // ", ", + // right_tmp_recv_buf_arg.second, + // "}\n", + // "{ ", + // right_income_data_flag_arg.first, + // ", ", + // right_income_data_flag_arg.second, + // "}\n", + // "{ ", + // right_ready_to_recv_flag_arg.first, + // ", ", + // right_ready_to_recv_flag_arg.second, + // "}\n"); + + left_kernel + .template set_args<typename ring::reduce_scatter::right_output_buf_arg<void>, + typename ring::reduce_scatter::right_tmp_recv_buf_arg<void>, + typename ring::reduce_scatter::right_income_data_flag_arg, + typename ring::reduce_scatter::right_ready_to_recv_flag_arg>( right_output_buf_arg.second, right_tmp_recv_buf_arg.second, right_income_data_flag_arg.second, right_ready_to_recv_flag_arg.second); - LOG_TRACE("Set right_tmp_recv_buf_arg", - "Set right_income_data_flag_arg", - "Set right_ready_to_recv_flag_arg"); - LOG_DEBUG("entry: ", - class_name(), - ", rank: ", - comm_addr.to_string(), - ". 
Function: ", - main_entry_function.to_string()); - } - return is_right_kernel_ready; - } - bool execute(kernel_main_typed& main_entry_function, kernel_ipc_typed& right_kernel) { - //Check argument binding in kernels for next rank - bool is_right_kernel_ready = - right_kernel.template test_args<typename kernel_ipc_typed::tmp_recv_buf_arg, - typename kernel_ipc_typed::income_data_flag_arg, - typename kernel_ipc_typed::ready_to_recv_flag_arg>(); - if (is_right_kernel_ready) { - //TODO do not get arguments sequencially - use array version instead - typename kernel_main_typed::tmp_recv_buf_arg::return_t right_tmp_recv_buf_arg = - right_kernel.template get_arg<typename kernel_ipc_typed::tmp_recv_buf_arg>(); - typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg = - right_kernel.template get_arg<typename kernel_ipc_typed::income_data_flag_arg>(); - typename kernel_main_typed::ready_to_recv_flag_arg::return_t - right_ready_to_recv_flag_arg = - right_kernel - .template get_arg<typename kernel_ipc_typed::ready_to_recv_flag_arg>(); - - LOG_DEBUG("entry: ", - class_name(), - ", rank: ", - comm_addr.to_string(), - ", bind elapsed arguments for kernel: ", - kernel_main_typed::name()); - LOG_TRACE("Args: \n{ ", - right_tmp_recv_buf_arg.first, - ", ", - right_tmp_recv_buf_arg.second, - "}\n", - "{ ", - right_income_data_flag_arg.first, - ", ", - right_income_data_flag_arg.second, - "}\n", - "{ ", - right_ready_to_recv_flag_arg.first, - ", ", - right_ready_to_recv_flag_arg.second, - "}\n"); - - //TODO register argument for current device kernel: user array version - main_entry_function - .template set_args<typename kernel_main_typed::right_tmp_recv_buf_arg, - typename kernel_main_typed::right_income_data_flag_arg, - typename kernel_main_typed::right_ready_to_recv_flag_arg>( - right_tmp_recv_buf_arg.second, - right_income_data_flag_arg.second, - right_ready_to_recv_flag_arg.second); - LOG_TRACE("Set right_tmp_recv_buf_arg", - "Set right_income_data_flag_arg", - "Set right_ready_to_recv_flag_arg"); - LOG_DEBUG("entry: ", - class_name(), - ", rank: ", - comm_addr.to_string(), - ". Function: ", - main_entry_function.to_string()); + ENTRY_LOG_DEBUG("Binding arguments between kernels is complete. 
", + "Arguments of the left kernel after binding:\n", + left_kernel.to_string()); } return is_right_kernel_ready; } diff --git a/src/sched/entry/l0/l0_reduce_typed_entry.hpp b/src/sched/entry/l0/l0_reduce_typed_entry.hpp index a2d96df42..72ea07031 100644 --- a/src/sched/entry/l0/l0_reduce_typed_entry.hpp +++ b/src/sched/entry/l0/l0_reduce_typed_entry.hpp @@ -22,9 +22,8 @@ //TODO L0 Workaround namespace native { -template <class kernel_params, class gpu_comm_impl, ccl::group_split_type topology> -class l0_reduce_typed_entry : public base_gpu_entry<kernel_params, - gpu_comm_impl, +template <class gpu_comm_impl, ccl::group_split_type topology> +class l0_reduce_typed_entry : public base_gpu_entry<gpu_comm_impl, topology, ccl::device_topology_type::ring, ccl_coll_reduce> { @@ -32,11 +31,8 @@ class l0_reduce_typed_entry : public base_gpu_entry<kernel_params, friend class ccl_gpu_comm; friend class ccl_virtual_gpu_comm; - using base = base_gpu_entry<kernel_params, - gpu_comm_impl, - topology, - ccl::device_topology_type::ring, - ccl_coll_reduce>; + using base = + base_gpu_entry<gpu_comm_impl, topology, ccl::device_topology_type::ring, ccl_coll_reduce>; using base::parent_communicator; using base::comm_addr; using base::req; @@ -45,15 +41,16 @@ class l0_reduce_typed_entry : public base_gpu_entry<kernel_params, using base::kernel_router; using base::get_ctx; using base::get_local_kernel; - using kernel_main_typed = ring_reduce_kernel<kernel_params>; - using kernel_ipc_typed = ring_reduce_ipc<kernel_params>; + using kernel_main_typed = ring::reduce::main_kernel; + // TODO: fix type + using processing_type = uint8_t; using income_data_flag_gpu_type = - typename std::remove_pointer<typename kernel_main_typed::income_data_flag_arg_type>::type; + typename std::remove_pointer<typename ring::reduce::income_data_flag_arg_type>::type; using ready_to_recv_flag_gpu_type = - typename std::remove_pointer<typename kernel_main_typed::ready_to_recv_flag_arg_type>::type; + typename std::remove_pointer<typename ring::reduce::ready_to_recv_flag_arg_type>::type; using local_barrier_flag_gpu_type = - typename std::remove_pointer<typename kernel_main_typed::local_barrier_flag_arg_type>::type; + typename std::remove_pointer<typename ring::reduce::local_barrier_flag_arg_type>::type; static constexpr const char* class_name() noexcept { return "L0_REDUCE_TYPED"; @@ -73,43 +70,39 @@ class l0_reduce_typed_entry : public base_gpu_entry<kernel_params, size_t cnt, ccl::reduction op, int root, + const coll_param_gpu& params, std::shared_ptr<ccl_stream> device_stream = std::shared_ptr<ccl_stream>()) - : base(sched, - comm, - in_ctx, - send_buf, - ccl::native_type_info<typename kernel_params::native_type>::dtype, - device_stream), + : base(sched, comm, in_ctx, send_buf, params, device_stream), - temp_buffer( - this->template alloc_memory_wrap(typename kernel_main_typed::tmp_recv_buf_arg{}, - parent_communicator, - cnt, - get_ctx())), - income_data_flag(this->template alloc_memory_wrap( - typename kernel_main_typed::income_data_flag_arg{}, + temp_buffer(this->template alloc_memory_wrap( + typename ring::reduce::tmp_recv_buf_arg<uint8_t>{}, parent_communicator, - 1, - get_ctx())), - ready_to_recv_flag(this->template alloc_memory_wrap( - typename kernel_main_typed::ready_to_recv_flag_arg{}, - parent_communicator, - 1, + ring_reduce_tmp_buffer_size(cnt, comm_addr.size) * + ccl::get_datatype_size(params.get_datatype()), get_ctx())), + income_data_flag( + this->template alloc_memory_wrap(typename ring::reduce::income_data_flag_arg{}, + 
parent_communicator,
+                                                 1,
+                                                 get_ctx())),
+          ready_to_recv_flag(
+              this->template alloc_memory_wrap(typename ring::reduce::ready_to_recv_flag_arg{},
+                                               parent_communicator,
+                                               1,
+                                               get_ctx())),
           local_barrier_flag(parent_communicator->get_device()
                                  .template alloc_memory<local_barrier_flag_gpu_type>(
                                      1, sizeof(local_barrier_flag_gpu_type), get_ctx())) {
         recv_buf_typed_entry = recv_buf;
-        op_typed_entry = op;
         root_typed_entry = root;
         cnt_entry = cnt;

         int next_rank = (comm_addr.rank + 1) % comm_addr.size;
         kernel_router = base::template create_kernel_router_for_rank<
-            l0_reduce_typed_entry<kernel_params, gpu_comm_impl, topology>>(
-            *this, next_rank, available_devices);
+            l0_reduce_typed_entry<gpu_comm_impl, topology>>(
+            *this, next_rank, available_devices, base::get_params());

         ENTRY_LOG_DEBUG("Init phase of current entry for ext_rank:", next_rank);
@@ -131,16 +124,15 @@ class l0_reduce_typed_entry : public base_gpu_entry<kernel_params,

         auto& main_entry_function = get_local_kernel();

-        auto recv_buf_ptr =
-            reinterpret_cast<typename kernel_params::native_type*>(recv_buf_typed_entry.get_ptr());
+        auto recv_buf_ptr = reinterpret_cast<void*>(recv_buf_typed_entry.get_ptr());
         //create implementation specified primitives
         main_entry_function
-            .template set_args<typename kernel_main_typed::tmp_recv_buf_arg,
-                               typename kernel_main_typed::income_data_flag_arg,
-                               typename kernel_main_typed::ready_to_recv_flag_arg,
-                               typename kernel_main_typed::local_barrier_flag_arg,
-                               typename kernel_main_typed::recv_buf_arg,
-                               typename kernel_main_typed::root_arg,
+            .template set_args<typename ring::reduce::tmp_recv_buf_arg<void>,
+                               typename ring::reduce::income_data_flag_arg,
+                               typename ring::reduce::ready_to_recv_flag_arg,
+                               typename ring::reduce::local_barrier_flag_arg,
+                               typename ring::reduce::recv_buf_arg<void>,
+                               typename ring::reduce::root_arg,
                                typename kernel_main_typed::common_entry_buf_size_arg>(
                 temp_buffer.get(),
                 income_data_flag.get(),
@@ -155,6 +147,7 @@ class l0_reduce_typed_entry : public base_gpu_entry<kernel_params,
         this->set_state(gpu_entry_state::wait_for_entry);

         //make sure, that kernel ready for launch
+        // TODO: what if submit_for_execution() returns false?
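+        // (One possible handling, sketched only as a suggestion rather than the current
+        //  behavior: treat a false return as "kernel not ready yet" and keep the entry in
+        //  the wait_for_entry state so that the next progress iteration retries the submit,
+        //  instead of unconditionally marking the entry as started below.)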
         this->submit_for_execution();
         status = ccl_sched_entry_status_started;
     }
@@ -166,7 +159,6 @@ class l0_reduce_typed_entry : public base_gpu_entry<kernel_params,
     std::vector<ccl_device::device_ipc_memory_handle> get_ipc_data() override {
         ccl_device& owned_device = parent_communicator->get_device();
-        //TODO
         std::vector<ccl_device::device_ipc_memory_handle> ret;
         ret.reserve(3);
         ret.push_back(owned_device.create_ipc_memory_handle(temp_buffer.get(), get_ctx()));
@@ -181,133 +173,72 @@ class l0_reduce_typed_entry : public base_gpu_entry<kernel_params,
     }

 private:
-    ccl_device::device_memory<typename kernel_params::native_type> temp_buffer;
+    ccl_device::device_memory<> temp_buffer;
     ccl_device::device_memory<income_data_flag_gpu_type> income_data_flag;
     ccl_device::device_memory<ready_to_recv_flag_gpu_type> ready_to_recv_flag;
     ccl_device::device_memory<local_barrier_flag_gpu_type> local_barrier_flag;
-    ccl::reduction op_typed_entry;
     ccl_buffer recv_buf_typed_entry;
     int root_typed_entry;
     size_t cnt_entry;
     std::shared_ptr<ccl_context> ctx;

 public:
-    bool execute(kernel_main_typed& main_entry_function, kernel_main_typed& right_kernel) {
-        //Check argument binding in kernels for next rank
+    template <class left_kernel_t, class right_kernel_t>
+    bool execute(left_kernel_t& left_kernel, right_kernel_t& right_kernel) {
         bool is_right_kernel_ready =
-            right_kernel.template test_args<typename kernel_main_typed::tmp_recv_buf_arg,
-                                            typename kernel_main_typed::income_data_flag_arg,
-                                            typename kernel_main_typed::ready_to_recv_flag_arg>();
-        if (is_right_kernel_ready) {
-            //TODO do not get arguments sequencially - use array version instead
-            typename kernel_main_typed::tmp_recv_buf_arg::return_t right_tmp_recv_buf_arg =
-                right_kernel.template get_arg<typename kernel_main_typed::tmp_recv_buf_arg>();
-            typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg =
-                right_kernel.template get_arg<typename kernel_main_typed::income_data_flag_arg>();
-            typename kernel_main_typed::ready_to_recv_flag_arg::return_t
-                right_ready_to_recv_flag_arg =
-                    right_kernel
-                        .template get_arg<typename kernel_main_typed::ready_to_recv_flag_arg>();
+            right_kernel
+                .template test_args<typename ring::reduce::tmp_recv_buf_arg<processing_type>,
+                                    typename ring::reduce::income_data_flag_arg,
+                                    typename ring::reduce::ready_to_recv_flag_arg>();
+
+        // Once we're sure that the parameters are ready, read them from the right kernel.
+        // Note: we not only read the parameters but also reset their 'ready' flag
+        // (since we're using a destructive-copying policy), meaning that they must be stored
+        // in order to be read again.
+        // This is a protection against the case of multiple kernel launches
+        // (i.e. the collective is run multiple times), where we might otherwise read
+        // stale values from the previous run.
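+        // A rough sketch of the destructive-copy contract assumed above (hypothetical
+        // pseudo-code, for illustration only):
+        //
+        //   if (right_kernel.test_args<arg>()) {           // 'ready' flag observed...
+        //       auto v = right_kernel.get_arg<arg>();      // ...and consumed by this read,
+        //       left_kernel.set_args<right_arg>(v.second); // so bind/store the value now;
+        //   }                                              // re-reading needs a new set_args()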
- LOG_DEBUG("entry: ", - class_name(), - ", rank: ", - comm_addr.to_string(), - ", bind elapsed arguments for kernel: ", - kernel_main_typed::name()); - LOG_TRACE("Args: \n{ ", - right_tmp_recv_buf_arg.first, - ", ", - right_tmp_recv_buf_arg.second, - "}\n", - "{ ", - right_income_data_flag_arg.first, - ", ", - right_income_data_flag_arg.second, - "}\n", - "{ ", - right_ready_to_recv_flag_arg.first, - ", ", - right_ready_to_recv_flag_arg.second, - "}\n"); - - //TODO register argument for current device kernel: use array-version - main_entry_function - .template set_args<typename kernel_main_typed::right_tmp_recv_buf_arg, - typename kernel_main_typed::right_income_data_flag_arg, - typename kernel_main_typed::right_ready_to_recv_flag_arg>( - right_tmp_recv_buf_arg.second, - right_income_data_flag_arg.second, - right_ready_to_recv_flag_arg.second); - LOG_TRACE("Set right_tmp_recv_buf_arg", - "Set right_income_data_flag_arg", - "Set right_ready_to_recv_flag_arg"); - LOG_DEBUG("entry: ", - class_name(), - ", rank: ", - comm_addr.to_string(), - ". Function: ", - main_entry_function.to_string()); - } - return is_right_kernel_ready; - } - - bool execute(kernel_main_typed& main_entry_function, kernel_ipc_typed& right_kernel) { - //Check argument binding in kernels for next rank - bool is_right_kernel_ready = - right_kernel.template test_args<typename kernel_ipc_typed::tmp_recv_buf_arg, - typename kernel_ipc_typed::income_data_flag_arg, - typename kernel_ipc_typed::ready_to_recv_flag_arg>(); if (is_right_kernel_ready) { - //TODO do not get arguments sequencially - use array version instead - typename kernel_main_typed::tmp_recv_buf_arg::return_t right_tmp_recv_buf_arg = - right_kernel.template get_arg<typename kernel_ipc_typed::tmp_recv_buf_arg>(); - typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg = - right_kernel.template get_arg<typename kernel_ipc_typed::income_data_flag_arg>(); - typename kernel_main_typed::ready_to_recv_flag_arg::return_t - right_ready_to_recv_flag_arg = - right_kernel - .template get_arg<typename kernel_ipc_typed::ready_to_recv_flag_arg>(); - - LOG_DEBUG("entry: ", - class_name(), - ", rank: ", - comm_addr.to_string(), - ", bind elapsed arguments for kernel: ", - kernel_main_typed::name()); - LOG_TRACE("Args: \n{ ", - right_tmp_recv_buf_arg.first, - ", ", - right_tmp_recv_buf_arg.second, - "}\n", - "{ ", - right_income_data_flag_arg.first, - ", ", - right_income_data_flag_arg.second, - "}\n", - "{ ", - right_ready_to_recv_flag_arg.first, - ", ", - right_ready_to_recv_flag_arg.second, - "}\n"); - - //TODO register argument for current device kernel: user array version - main_entry_function - .template set_args<typename kernel_main_typed::right_tmp_recv_buf_arg, - typename kernel_main_typed::right_income_data_flag_arg, - typename kernel_main_typed::right_ready_to_recv_flag_arg>( - right_tmp_recv_buf_arg.second, - right_income_data_flag_arg.second, - right_ready_to_recv_flag_arg.second); - LOG_TRACE("Set right_tmp_recv_buf_arg", - "Set right_income_data_flag_arg", - "Set right_ready_to_recv_flag_arg"); - LOG_DEBUG("entry: ", - class_name(), - ", rank: ", - comm_addr.to_string(), - ". 
Function: ", - main_entry_function.to_string()); + auto right_tmp_recv_buf_arg = + right_kernel.template get_arg<typename ring::reduce::tmp_recv_buf_arg<void>>(); + auto right_income_data_flag_arg = + right_kernel.template get_arg<typename ring::reduce::income_data_flag_arg>(); + auto right_ready_to_recv_flag_arg = + right_kernel.template get_arg<typename ring::reduce::ready_to_recv_flag_arg>(); + + // ENTRY_LOG_DEBUG("Bind right arguments from ", + // right_kernel_t::name(), + // " kernel", + // " to ", + // left_kernel_t::name(), + // " kernel. " + // "Right arguments:\n{ ", + // right_tmp_recv_buf_arg.first, + // ", ", + // right_tmp_recv_buf_arg.second, + // "}\n", + // "{ ", + // right_income_data_flag_arg.first, + // ", ", + // right_income_data_flag_arg.second, + // "}\n", + // "{ ", + // right_ready_to_recv_flag_arg.first, + // ", ", + // right_ready_to_recv_flag_arg.second, + // "}\n"); + + left_kernel.template set_args<typename ring::reduce::right_tmp_recv_buf_arg<void>, + typename ring::reduce::right_income_data_flag_arg, + typename ring::reduce::right_ready_to_recv_flag_arg>( + right_tmp_recv_buf_arg.second, + right_income_data_flag_arg.second, + right_ready_to_recv_flag_arg.second); + + ENTRY_LOG_DEBUG("Binding arguments between kernels is complete. ", + "Arguments of the left kernel after binding:\n", + left_kernel.to_string()); } return is_right_kernel_ready; } diff --git a/src/sched/entry/probe_entry.hpp b/src/sched/entry/probe_entry.hpp index 43a9367e4..3cbc402a1 100644 --- a/src/sched/entry/probe_entry.hpp +++ b/src/sched/entry/probe_entry.hpp @@ -17,6 +17,7 @@ #include "sched/entry/entry.hpp" #include "sched/sched.hpp" +#include "sched/queue/queue.hpp" class probe_entry : public sched_entry { public: diff --git a/src/sched/entry/recv_reduce_entry.hpp b/src/sched/entry/recv_reduce_entry.hpp index f93f9c1ab..9c36c633b 100644 --- a/src/sched/entry/recv_reduce_entry.hpp +++ b/src/sched/entry/recv_reduce_entry.hpp @@ -122,7 +122,8 @@ class recv_reduce_entry final : public sched_entry { ccl_buffer reduce_inout_buf = (result_buf_type == ccl_recv_reduce_local_buf) ? 
inout_buf : comm_buf; - ccl::status comp_status = ccl_comp_reduce(reduce_in_buf.get_ptr(bytes), + ccl::status comp_status = ccl_comp_reduce(sched, + reduce_in_buf.get_ptr(bytes), in_cnt, reduce_inout_buf.get_ptr(bytes), out_cnt, diff --git a/src/sched/entry/reduce_local_entry.hpp b/src/sched/entry/reduce_local_entry.hpp index 0eb133e35..0a7b58a74 100644 --- a/src/sched/entry/reduce_local_entry.hpp +++ b/src/sched/entry/reduce_local_entry.hpp @@ -48,7 +48,8 @@ class reduce_local_entry : public sched_entry { size_t bytes = in_cnt * dtype.size(); size_t offset = inout_buf.get_offset(); const ccl::fn_context context = { sched->coll_attr.match_id.c_str(), offset }; - ccl::status comp_status = ccl_comp_reduce(in_buf.get_ptr(bytes), + ccl::status comp_status = ccl_comp_reduce(sched, + in_buf.get_ptr(bytes), in_cnt, inout_buf.get_ptr(bytes), out_cnt, diff --git a/src/sched/entry/subsched_entry.hpp b/src/sched/entry/subsched_entry.hpp index dca48e39b..bcfee7697 100644 --- a/src/sched/entry/subsched_entry.hpp +++ b/src/sched/entry/subsched_entry.hpp @@ -41,6 +41,7 @@ class subsched_entry : public sched_entry { subsched.reset(new ccl_extra_sched(sched->coll_param, sched->sched_id)); subsched->coll_param.ctype = ccl_coll_internal; subsched->set_op_id(this->op_id); + subsched->flow_control.set_max_credits(sched->flow_control.get_max_credits()); if (sched->coll_param.ctype == ccl_coll_allreduce || sched->coll_param.ctype == ccl_coll_reduce || diff --git a/src/sched/master_sched.cpp b/src/sched/master_sched.cpp index 7cc0dcefa..61afd234f 100644 --- a/src/sched/master_sched.cpp +++ b/src/sched/master_sched.cpp @@ -104,9 +104,43 @@ void ccl_master_sched::prepare_partial_scheds() { void ccl_master_sched::sync_partial_scheds() { CCL_THROW_IF_NOT(!partial_scheds.empty(), "no partial schedules"); - auto sync_obj = std::make_shared<sync_object>(partial_scheds.size()); + bool add_sync_entry = false; + + /* ensure all partial schedules have the same add_mode */ + ccl_sched_add_mode add_mode = partial_scheds[0]->get_add_mode(); + for (auto& sched : partial_scheds) { + CCL_THROW_IF_NOT(sched->get_add_mode() == add_mode, + "unexpected add_mode ", + sched->get_add_mode(), + ", expected ", + add_mode); + } + + /* check whether all partial schedules already have sync_entry at the tail */ for (auto& sched : partial_scheds) { - entry_factory::make_entry<sync_entry>(sched.get(), sync_obj); + if (sched->entries.empty()) { + add_sync_entry = true; + break; + } + + /* TODO: add enum field into base entry to distinguish different entry types */ + const char* tail_entry_name = (add_mode == ccl_sched_add_back) + ? 
sched->entries.back()->name() + : sched->entries.front()->name(); + + if (tail_entry_name && strcmp(tail_entry_name, "SYNC")) { + add_sync_entry = true; + break; + } + } + + /* if at least one partial schedule doesn't have sync entry + then sync all partial schedules */ + if (add_sync_entry) { + auto sync_obj = std::make_shared<sync_object>(partial_scheds.size()); + for (auto& sched : partial_scheds) { + entry_factory::make_entry<sync_entry>(sched.get(), sync_obj); + } } } @@ -119,7 +153,7 @@ void ccl_master_sched::dump(std::ostream& out) const { ccl_logger::format(out, ", req: ", static_cast<const ccl_request*>(this), - ", worker_sched count: ", + ", partial_scheds size: ", partial_scheds.size()); for (const auto& sched : partial_scheds) { @@ -209,7 +243,7 @@ ccl_master_sched::ccl_master_sched_ptr ccl_master_sched::create(const ccl_coll_p if (is_created) { sched->set_coll_attr(attr); - sched->alloc_buffers_for_sycl_copy(); + sched->alloc_buffers_for_pre_post_copy(); LOG_DEBUG("didn't find sched, create new one ", sched, ", type ", diff --git a/src/sched/queue/flow_control.cpp b/src/sched/queue/flow_control.cpp new file mode 100644 index 000000000..1c67546b6 --- /dev/null +++ b/src/sched/queue/flow_control.cpp @@ -0,0 +1,67 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#include "common/log/log.hpp" +#include "sched/queue/flow_control.hpp" + +namespace ccl { + +flow_control::flow_control() + : max_credits(CCL_MAX_FLOW_CREDITS), + min_credits(CCL_MAX_FLOW_CREDITS), + credits(CCL_MAX_FLOW_CREDITS) {} + +flow_control::~flow_control() { + LOG_DEBUG("max used credits: ", (max_credits - min_credits)); +} + +void flow_control::set_max_credits(size_t value) { + max_credits = min_credits = credits = value; +} + +size_t flow_control::get_max_credits() const { + return max_credits; +} + +size_t flow_control::get_credits() const { + return credits; +} + +bool flow_control::take_credit() { + if (credits) { + credits--; + CCL_THROW_IF_NOT( + credits >= 0, "unexpected credits ", credits, ", max_credits ", max_credits); + min_credits = std::min(min_credits, credits); + return true; + } + else { + LOG_TRACE("no available credits"); + return false; + } +} + +void flow_control::return_credit() { + credits++; + CCL_THROW_IF_NOT((credits > 0) && (credits <= max_credits) && (credits > min_credits), + "unexpected credits ", + credits, + ", max_credits ", + max_credits, + ", min_credits ", + min_credits); +} + +} // namespace ccl diff --git a/src/common/comm/l0/modules/kernel_params.hpp b/src/sched/queue/flow_control.hpp similarity index 59% rename from src/common/comm/l0/modules/kernel_params.hpp rename to src/sched/queue/flow_control.hpp index 43d850d84..2aced09c1 100644 --- a/src/common/comm/l0/modules/kernel_params.hpp +++ b/src/sched/queue/flow_control.hpp @@ -14,15 +14,26 @@ limitations under the License. 
*/ #pragma once -#include "coll/algorithms/algorithms_enum.hpp" -template <class type> -struct kernel_params_default { - using native_type = type; -}; +namespace ccl { + +#define CCL_MAX_FLOW_CREDITS 1024 + +class flow_control { +public: + flow_control(); + ~flow_control(); -template <class native_data_type, ccl_coll_reduction reduction> -struct kernel_reduction_params_traits : kernel_params_default<native_data_type> { - using typename kernel_params_default<native_data_type>::native_type; - static constexpr ccl_coll_reduction red_type = reduction; + void set_max_credits(size_t value); + size_t get_max_credits() const; + size_t get_credits() const; + bool take_credit(); + void return_credit(); + +private: + size_t max_credits; + size_t min_credits; + size_t credits; }; + +} // namespace ccl diff --git a/src/sched/queue/queue.cpp b/src/sched/queue/queue.cpp index 1a9a4839d..8654e1470 100644 --- a/src/sched/queue/queue.cpp +++ b/src/sched/queue/queue.cpp @@ -77,11 +77,14 @@ ccl_sched_queue::ccl_sched_queue(size_t idx, std::vector<size_t> atl_eps) } ccl_sched_queue::~ccl_sched_queue() { - CCL_ASSERT(bins.empty(), "unexpected bins size ", bins.size(), ", expected 0"); + if (!bins.empty()) + LOG_WARN("unexpected bins size ", bins.size(), ", expected 0"); - CCL_ASSERT(max_priority == 0, "unexpected max_priority ", max_priority, ", expected 0"); + if (max_priority != 0) + LOG_WARN("unexpected max_priority ", max_priority, ", expected 0"); - CCL_ASSERT(!cached_max_priority_bin); + if (cached_max_priority_bin) + LOG_WARN("unexpected cached_max_priority_bin"); } void ccl_sched_queue::add(ccl_sched* sched) { diff --git a/src/sched/queue/queue.hpp b/src/sched/queue/queue.hpp index 010453b48..e25ed9e71 100644 --- a/src/sched/queue/queue.hpp +++ b/src/sched/queue/queue.hpp @@ -53,7 +53,7 @@ class ccl_sched_list { ~ccl_sched_list() { if (elems.size() != 0 && !ccl::global_data::get().is_ft_enabled) { - LOG_ERROR("unexpected elem_count ", elems.size(), ", expected 0"); + LOG_WARN("unexpected elem_count ", elems.size(), ", expected 0"); } for (size_t i = 0; i < elems.size(); i++) { diff --git a/src/sched/sched.cpp b/src/sched/sched.cpp index 71a0880e2..2c6080867 100644 --- a/src/sched/sched.cpp +++ b/src/sched/sched.cpp @@ -182,6 +182,10 @@ ccl_request* ccl_sched::start_subsched(ccl_extra_sched* subsched) { return subsched->req; } +std::vector<ccl::event>& ccl_sched::get_deps() const { + return static_cast<ccl_master_sched*>(req)->coll_param.deps; +} + void ccl_sched::dump(std::ostream& out) const { if (!ccl::global_data::env().sched_dump) { return; @@ -195,6 +199,10 @@ void ccl_sched::dump(std::ostream& out) const { entries.size(), ", priority: ", get_priority(), + ", max_flow_credits: ", + flow_control.get_max_credits(), + ", flow_credits: ", + flow_control.get_credits(), "\n"); std::stringstream msg; diff --git a/src/sched/sched.hpp b/src/sched/sched.hpp index d8f700cd9..f390bc6bd 100644 --- a/src/sched/sched.hpp +++ b/src/sched/sched.hpp @@ -14,7 +14,9 @@ limitations under the License. 
*/ #pragma once + #include "sched/sched_base.hpp" +#include "sched/queue/flow_control.hpp" #include "internal_types.hpp" //todo: sequence diagram @@ -139,6 +141,8 @@ class alignas(CACHELINE_SIZE) ccl_sched : public ccl_sched_base { ccl_request* start_subsched(ccl_extra_sched* subsched); + std::vector<ccl::event>& get_deps() const; + ccl_sched_bin* bin = nullptr; /* valid only during execution */ ccl_sched_queue* queue = nullptr; /* cached pointer to queue, valid even after execution */ size_t start_idx = 0; /* index to start */ @@ -159,6 +163,12 @@ class alignas(CACHELINE_SIZE) ccl_sched : public ccl_sched_base { /* currently applicable for start phase only */ bool strict_order; + /* + limits number of active entries + mostly makes sense for ATL entries + */ + ccl::flow_control flow_control; + void set_finalize_fn(ccl_sched_finalize_fn_t fn, void* ctx) { finalize_fn = fn; finalize_fn_ctx = ctx; diff --git a/src/sched/sched_base.cpp b/src/sched/sched_base.cpp index 5ca552cf8..6803ea257 100644 --- a/src/sched/sched_base.cpp +++ b/src/sched/sched_base.cpp @@ -13,8 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "sched/sched_base.hpp" +#include <numeric> + +#include "coll/algorithms/algorithms_enum.hpp" +#include "coll/coll_param.hpp" #include "common/global/global.hpp" +#include "sched/sched_base.hpp" std::string to_string(ccl_sched_add_mode mode) { switch (mode) { @@ -32,14 +36,19 @@ void ccl_sched_base::set_coll_attr(const ccl_coll_attr& attr) { void ccl_sched_base::update_coll_param_and_attr(const ccl_coll_param& param, const ccl_coll_attr& attr) { #ifdef CCL_ENABLE_SYCL + copy_deps(param.deps, coll_param.deps); if (param.stream && param.stream->is_sycl_device_stream()) { - coll_param.sycl_buf = static_cast<ccl_sycl_buffer_t*>(param.buf); - coll_param.sycl_send_buf = static_cast<ccl_sycl_buffer_t*>((void*)param.send_buf); - coll_param.sycl_recv_buf = static_cast<ccl_sycl_buffer_t*>(param.recv_buf); + /* update device buffers only if they are already non-null + i.e. 
were set on previous call */
+        if (coll_param.device_send_buf) {
+            coll_param.device_send_buf = static_cast<ccl_sycl_buffer_t*>((void*)param.send_buf);
+        }
+        if (coll_param.device_recv_buf) {
+            coll_param.device_recv_buf = static_cast<ccl_sycl_buffer_t*>(param.recv_buf);
+        }
     }
     else {
 #endif /* CCL_ENABLE_SYCL */
-        coll_param.buf = param.buf;
         coll_param.send_buf = param.send_buf;
         coll_param.recv_buf = param.recv_buf;
 #ifdef CCL_ENABLE_SYCL
@@ -106,7 +115,7 @@ ccl_buffer ccl_sched_base::alloc_buffer(size_t bytes) {
     CCL_THROW_IF_NOT(bytes > 0, "incorrect buffer size: ", bytes);

     ccl_buffer buffer =
-        ccl_buffer(CCL_CALLOC(bytes, "sched_buffer"), bytes, 0, ccl_buffer_type::DIRECT);
+        ccl_buffer(CCL_MALLOC(bytes, "sched_buffer"), bytes, 0, ccl_buffer_type::DIRECT);
     memory.buf_list.emplace_back(buffer, bytes);
     CCL_THROW_IF_NOT(buffer.get_ptr(), "null ptr");
@@ -252,81 +261,72 @@ void ccl_sched_base::add_memory_region(atl_mr_t* mr) {
     memory.mr_list.emplace_back(mr);
 }

-void ccl_sched_base::alloc_buffers_for_sycl_copy() {
+void ccl_sched_base::alloc_buffers_for_pre_post_copy() {
 #ifdef CCL_ENABLE_SYCL
-    ccl_coll_param& param = coll_param;
+    param.device_send_buf = param.device_recv_buf = nullptr;
+
     if (!param.stream || (!param.stream->is_sycl_device_stream()))
         return;

-    LOG_DEBUG("alloc tmp buffers for D2H and H2D copies, coll_type ",
-              ccl_coll_type_to_str(param.ctype),
-              ", dtype_size ",
-              param.dtype.size(),
-              ", comm_size ",
-              param.comm->size(),
-              ", count ",
-              param.count);
+    // check both recv and send buffers; for some algorithms (e.g. alltoallv) one of them is allowed
+    // to be invalid (i.e. get_pointer_type may return usm::alloc::unknown for it) as long as the
+    // corresponding count is 0, so we won't dereference it.
+    // TODO: should we add special handling for the case when both buffers are invalid?
+    auto send_ptr_type = sycl::get_pointer_type((void*)param.send_buf,
+                                                param.stream->get_native_stream().get_context());
+    auto recv_ptr_type =
+        sycl::get_pointer_type(param.recv_buf, param.stream->get_native_stream().get_context());
+
+    // TODO: we currently don't correctly handle cases when there are 2 different types at the same time,
+    // i.e. 
device memory for send buffer and shared memory for recv buffer + bool should_alloc_buffers = true; + if ((send_ptr_type == sycl::usm::alloc::shared || recv_ptr_type == sycl::usm::alloc::shared) || + ((send_ptr_type == sycl::usm::alloc::device || recv_ptr_type == sycl::usm::alloc::device) && + atl_wrapper::attr.out.enable_device_buf)) { + should_alloc_buffers = false; + } + + if (!should_alloc_buffers) { + return; + } - size_t idx, send_count = 0, recv_count = 0; + param.device_send_buf = static_cast<ccl_sycl_buffer_t*>((void*)param.send_buf); + param.device_recv_buf = static_cast<ccl_sycl_buffer_t*>(param.recv_buf); + param.send_buf = param.recv_buf = nullptr; + + size_t send_alloc_count = 0, recv_alloc_count = 0; switch (param.ctype) { case ccl_coll_allgatherv: - param.sycl_send_buf = static_cast<ccl_sycl_buffer_t*>((void*)param.send_buf); - param.sycl_recv_buf = static_cast<ccl_sycl_buffer_t*>(param.recv_buf); - param.send_buf = alloc_staging_buffer(param.send_count * param.dtype.size()).get_ptr(); - for (idx = 0; idx < param.comm->size(); idx++) - recv_count += param.recv_counts[idx]; - param.recv_buf = alloc_staging_buffer(recv_count * param.dtype.size()).get_ptr(); + send_alloc_count = param.send_count; + recv_alloc_count = + std::accumulate(param.recv_counts, param.recv_counts + param.comm->size(), 0); break; case ccl_coll_allreduce: - param.sycl_send_buf = static_cast<ccl_sycl_buffer_t*>((void*)param.send_buf); - param.sycl_recv_buf = static_cast<ccl_sycl_buffer_t*>(param.recv_buf); - param.send_buf = alloc_staging_buffer(param.count * param.dtype.size()).get_ptr(); - param.recv_buf = alloc_staging_buffer(param.count * param.dtype.size()).get_ptr(); + /* use in-place to avoid allocation of extra staging buffer*/ + send_alloc_count = 0; + recv_alloc_count = param.count; break; case ccl_coll_alltoall: - param.sycl_send_buf = static_cast<ccl_sycl_buffer_t*>((void*)param.send_buf); - param.sycl_recv_buf = static_cast<ccl_sycl_buffer_t*>(param.recv_buf); - param.send_buf = - alloc_staging_buffer(param.count * param.dtype.size() * param.comm->size()) - .get_ptr(); - param.recv_buf = - alloc_staging_buffer(param.count * param.dtype.size() * param.comm->size()) - .get_ptr(); + send_alloc_count = recv_alloc_count = param.count * param.comm->size(); break; case ccl_coll_alltoallv: - param.sycl_send_buf = static_cast<ccl_sycl_buffer_t*>((void*)param.send_buf); - param.sycl_recv_buf = static_cast<ccl_sycl_buffer_t*>(param.recv_buf); - for (idx = 0; idx < param.comm->size(); idx++) { - send_count += param.send_counts[idx]; - recv_count += param.recv_counts[idx]; - } - param.send_buf = alloc_staging_buffer(send_count * param.dtype.size()).get_ptr(); - param.recv_buf = alloc_staging_buffer(recv_count * param.dtype.size()).get_ptr(); + send_alloc_count = + std::accumulate(param.send_counts, param.send_counts + param.comm->size(), 0); + recv_alloc_count = + std::accumulate(param.recv_counts, param.recv_counts + param.comm->size(), 0); break; case ccl_coll_bcast: - param.sycl_buf = static_cast<ccl_sycl_buffer_t*>(param.buf); - param.buf = alloc_staging_buffer(param.count * param.dtype.size()).get_ptr(); + send_alloc_count = 0; + recv_alloc_count = param.count; break; case ccl_coll_reduce: - param.sycl_send_buf = static_cast<ccl_sycl_buffer_t*>((void*)(param.send_buf)); - param.send_buf = alloc_staging_buffer(param.count * param.dtype.size()).get_ptr(); - if (param.comm->rank() == param.root) { - param.sycl_recv_buf = static_cast<ccl_sycl_buffer_t*>(param.recv_buf); - param.recv_buf = 
alloc_staging_buffer(param.count * param.dtype.size()).get_ptr(); - } - else { - param.recv_buf = nullptr; - } + send_alloc_count = param.count; + recv_alloc_count = (param.comm->rank() == param.root) ? param.count : 0; break; case ccl_coll_reduce_scatter: - param.sycl_send_buf = static_cast<ccl_sycl_buffer_t*>((void*)param.send_buf); - param.sycl_recv_buf = static_cast<ccl_sycl_buffer_t*>(param.recv_buf); - param.send_buf = - alloc_staging_buffer(param.count * param.comm->size() * param.dtype.size()) - .get_ptr(); - param.recv_buf = alloc_staging_buffer(param.count * param.dtype.size()).get_ptr(); + send_alloc_count = param.count * param.comm->size(); + recv_alloc_count = param.count; break; case ccl_coll_sparse_allreduce: CCL_FATAL("SYCL stream is not supported for sparse_allreduce yet"); @@ -334,6 +334,27 @@ void ccl_sched_base::alloc_buffers_for_sycl_copy() { break; default: break; } + + LOG_DEBUG("alloc tmp buffers for D2H and H2D copies, coll_type ", + ccl_coll_type_to_str(param.ctype), + ", dtype_size ", + param.dtype.size(), + ", comm_size ", + param.comm->size(), + ", count ", + param.count); + + if (send_alloc_count) { + param.send_buf = alloc_staging_buffer(send_alloc_count * param.dtype.size()).get_ptr(); + } + + if (recv_alloc_count) { + param.recv_buf = alloc_staging_buffer(recv_alloc_count * param.dtype.size()).get_ptr(); + + if (param.ctype == ccl_coll_allreduce || param.ctype == ccl_coll_bcast) { + param.send_buf = param.recv_buf; + } + } #endif /* CCL_ENABLE_SYCL */ } diff --git a/src/sched/sched_base.hpp b/src/sched/sched_base.hpp index 43af2ec8d..80edcb737 100644 --- a/src/sched/sched_base.hpp +++ b/src/sched/sched_base.hpp @@ -102,12 +102,16 @@ struct ccl_sched_base { void add_memory_region(atl_mr_t* mr); - void alloc_buffers_for_sycl_copy(); + void alloc_buffers_for_pre_post_copy(); void set_entry_exec_mode(ccl_sched_entry_exec_mode mode) { exec_mode = mode; } + ccl_sched_add_mode get_add_mode() { + return add_mode; + } + void set_add_mode(ccl_sched_add_mode mode) { add_mode = mode; } @@ -129,6 +133,10 @@ struct ccl_sched_base { protected: ~ccl_sched_base() = default; + ccl_sched_base() { + CCL_THROW("unsupported"); + } + ccl_sched_base(const ccl_coll_param& coll_param) : coll_param(coll_param) {} void update_id(); diff --git a/src/unordered_coll/unordered_coll.cpp b/src/unordered_coll/unordered_coll.cpp index d225f3aba..ade72119e 100644 --- a/src/unordered_coll/unordered_coll.cpp +++ b/src/unordered_coll/unordered_coll.cpp @@ -197,7 +197,7 @@ void ccl_unordered_coll_manager::start_coordination(const std::string& match_id) ccl_coll_entry_param match_id_size_param{}; match_id_size_param.ctype = ccl_coll_bcast; - match_id_size_param.buf = ccl_buffer(&ctx->match_id_size, sizeof(size_t)); + match_id_size_param.recv_buf = ccl_buffer(&ctx->match_id_size, sizeof(size_t)); match_id_size_param.count = sizeof(size_t); match_id_size_param.dtype = ccl_datatype_int8; match_id_size_param.root = CCL_UNORDERED_COLL_COORDINATOR; @@ -209,14 +209,14 @@ void ccl_unordered_coll_manager::start_coordination(const std::string& match_id) /* 2. 
broadcast match_id_value */ ccl_coll_entry_param match_id_val_param{}; match_id_val_param.ctype = ccl_coll_bcast; - match_id_val_param.buf = ccl_buffer(); + match_id_val_param.recv_buf = ccl_buffer(); match_id_val_param.count = 0; match_id_val_param.dtype = ccl_datatype_int8; match_id_val_param.root = CCL_UNORDERED_COLL_COORDINATOR; match_id_val_param.comm = coll_param.comm; auto entry = entry_factory::make_entry<coll_entry>(service_sched.get(), match_id_val_param); - entry->set_field_fn<ccl_sched_entry_field_buf>( + entry->set_field_fn<ccl_sched_entry_field_recv_buf>( [](const void* fn_ctx, void* field_ptr) { auto ctx = static_cast<ccl_unordered_coll_ctx*>(const_cast<void*>(fn_ctx)); if (ctx->service_sched->coll_param.comm->rank() != CCL_UNORDERED_COLL_COORDINATOR) { @@ -244,7 +244,7 @@ void ccl_unordered_coll_manager::start_coordination(const std::string& match_id) /* 3. broadcast reserved comm_id */ ccl_coll_entry_param reserved_comm_id_param{}; reserved_comm_id_param.ctype = ccl_coll_bcast; - reserved_comm_id_param.buf = ccl_buffer(&ctx->reserved_comm_id, sizeof(ccl_comm_id_t)); + reserved_comm_id_param.recv_buf = ccl_buffer(&ctx->reserved_comm_id, sizeof(ccl_comm_id_t)); reserved_comm_id_param.count = sizeof(ccl_comm_id_t); reserved_comm_id_param.dtype = ccl_datatype_int8; reserved_comm_id_param.root = CCL_UNORDERED_COLL_COORDINATOR; diff --git a/tests/functional/CMakeLists.txt b/tests/functional/CMakeLists.txt index e975583b8..fb6416133 100644 --- a/tests/functional/CMakeLists.txt +++ b/tests/functional/CMakeLists.txt @@ -14,23 +14,38 @@ # limitations under the License. # cmake_minimum_required (VERSION 2.8) + +if (POLICY CMP0048) + cmake_policy(SET CMP0048 OLD) +endif (POLICY CMP0048) + file(GLOB sources "*_test.c" "*_test.cpp") -if (NOT DEFINED LP_ENV_DEFINED) - include(${PROJECT_SOURCE_DIR}/../../cmake/helpers.cmake) +set(PROJECT_NAME "oneCCL functional tests") +project(${PROJECT_NAME}) + +message(STATUS "FT CMAKE_PROJECT_NAME: ${CMAKE_PROJECT_NAME}") +message(STATUS "FT PROJECT_NAME: ${PROJECT_NAME}") + +#set default build type +if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release") +endif() + +# standalone build +if (${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME}) + set(COMMON_CMAKE_DIR ${PROJECT_SOURCE_DIR}/../../cmake) + include(${COMMON_CMAKE_DIR}/helpers.cmake) set_lp_env() + if (COMPUTE_BACKEND) + set_compute_backend(${COMMON_CMAKE_DIR}) + endif() endif() set(SERVICE_SRC conf.cpp - lp.cpp) - -if (DEFINED ENV{CCL_CONFIGURATION}) - set(CCL_CONFIGURATION "$ENV{CCL_CONFIGURATION}") - if(${CCL_CONFIGURATION} STREQUAL "cpu_gpu_dpcpp") - set(COMPUTE_BACKEND "dpcpp_level_zero") - endif() -endif() + lp.cpp + transport.cpp) if (DEFINED ENV{CCL_ROOT}) set(CCL_ROOT "$ENV{CCL_ROOT}") @@ -38,30 +53,31 @@ endif() set(CCL_INSTALL_TESTS "$ENV{PWD}") enable_testing() -ADD_SUBDIRECTORY (googletest-release-1.8.1/googletest/) + +set(GTEST_DIR ${PROJECT_SOURCE_DIR}/../googletest-release-1.8.1/googletest) +add_subdirectory(${GTEST_DIR} gtest_build) +set(EXAMPLES_DIR ${PROJECT_SOURCE_DIR}/../../examples) set(INC_DIRS - ${PROJECT_SOURCE_DIR}/tests/functional/googletest-release-1.8.1/googletest/include - ${PROJECT_SOURCE_DIR}/tests/functional/googletest-release-1.8.1/googletest/src - ${PROJECT_SOURCE_DIR}/include) + ${GTEST_DIR}/include + ${GTEST_DIR}/src + ${EXAMPLES_DIR}/include) include_directories(${INC_DIRS}) -message(STATUS "CCL_ROOT: ${CCL_ROOT}") -message(STATUS "CCL_CONFIGURATION: ${CCL_CONFIGURATION}") -message(STATUS "tests/functional LIBFABRIC_LIB_DIR: ${LIBFABRIC_LIB_DIR}") 
-message(STATUS "tests/functional LIBFABRIC_INCLUDE_DIR: ${LIBFABRIC_INCLUDE_DIR}") -message(STATUS "INC_DIRS: ${INC_DIRS}") - -#include_directories(${CCL_ROOT}/include/${CCL_CONFIGURATION}) -#link_directories(${CCL_ROOT}/lib/${CCL_CONFIGURATION}) +message(STATUS "FT build type: ${CMAKE_BUILD_TYPE}") +message(STATUS "FT CCL_ROOT: ${CCL_ROOT}") +message(STATUS "FT INC_DIRS: ${INC_DIRS}") +message(STATUS "FT COMPUTE_BACKEND: ${COMPUTE_BACKEND}") if (${CMAKE_VERSION} VERSION_LESS 3.1) #cmake version below 3.1 does not support CMAKE_C[XX}_STANDARD flags #set manually +# TODO: Need to handle c++17 option for older cmake set(CXX_COMPILER_FLAGS "-std=gnu++11") set(C_COMPILER_FLAGS "-std=gnu99") endif() + #common release/debug compilation settings set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_COMPILER_FLAGS} -Wall -Werror -D_GNU_SOURCE -fvisibility=internal") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} ${C_COMPILER_FLAGS} -O0 -g -DENABLE_DEBUG") @@ -74,11 +90,10 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_COMPILER_FLAGS} -Wall -Werror -D_G set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${CXX_COMPILER_FLAGS} -O0 -g -DENABLE_DEBUG") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${CXX_COMPILER_FLAGS} -O3") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} ${CXX_COMPILER_FLAGS} -O2 -g") -set(CMAKE_CXX_STANDARD 11) +# C++ standard version is set by set_compute_backend/activate_compute_backend, so no need to set it here set(CMAKE_CXX_STANDARD_REQUIRED ON) if (COMPUTE_BACKEND) - activate_compute_backend("${CCL_ROOT}/lib;${PROJECT_SOURCE_DIR}/cmake" ${COMPUTE_BACKEND}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMPUTE_BACKEND_FLAGS}") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${COMPUTE_BACKEND_LIBRARIES}") endif() @@ -98,7 +113,11 @@ foreach(src ${sources}) target_link_libraries(${executable} PUBLIC rt) target_link_libraries(${executable} PUBLIC m) target_link_libraries(${executable} PUBLIC dl) - target_link_libraries(${executable} PRIVATE m) + # w/a for ats with 2 mpi lib, should be fixed + if (DEFINED ENV{I_MPI_ROOT}) + set(I_MPI_ROOT "$ENV{I_MPI_ROOT}") + endif() + target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release_mt/) target_link_libraries(${executable} PUBLIC mpi) install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_TESTS} OPTIONAL) add_test (NAME ${executable} CONFIGURATIONS default COMMAND mpiexec.hydra -l -n 2 -ppn 1 ${CCL_INSTALL_TESTS}/${executable} --gtest_output=xml:${CCL_INSTALL_TESTS}/${executable}_default_report.junit.xml) @@ -145,3 +164,8 @@ endforeach() foreach(algo direct; ring) add_test (NAME reduce_scatter_${algo} CONFIGURATIONS reduce_scatter_${algo} COMMAND mpiexec.hydra -l -n 2 -ppn 1 ${CCL_INSTALL_TESTS}/reduce_scatter_test --gtest_output=xml:${CCL_INSTALL_TESTS}/reduce_scatter_${algo}_report.junit.xml) endforeach() + +if (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang") + # right now all regression tests require dpcpp, might be changed in the future + add_subdirectory(regression) +endif() diff --git a/tests/functional/allgatherv_test.cpp b/tests/functional/allgatherv_test.cpp index f4fccd792..25020ef43 100644 --- a/tests/functional/allgatherv_test.cpp +++ b/tests/functional/allgatherv_test.cpp @@ -15,7 +15,7 @@ */ #define ALGO_SELECTION_ENV "CCL_ALLGATHERV" -#include "base_impl.hpp" +#include "test_impl.hpp" template <typename T> class allgatherv_test : public base_test<T> { @@ -50,15 +50,12 @@ class allgatherv_test : public base_test<T> { offsets[rank] = recv_counts[rank - 1] + offsets[rank - 1]; } 
- if (op.param.place_type == PLACE_OUT) { - size_t total_recv_count = std::accumulate(recv_counts.begin(), recv_counts.end(), 0); - for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) { - op.recv_bufs[buf_idx].resize(total_recv_count); - if (is_lp_datatype(op.param.datatype)) { - op.recv_bufs_lp[buf_idx].resize(total_recv_count); - } - } - } + // if (op.param.place_type == PLACE_OUT) { + // size_t total_recv_count = std::accumulate(recv_counts.begin(), recv_counts.end(), 0); + // for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) { + // op.recv_bufs[buf_idx].resize(total_recv_count); + // } + // } } void fill_send_buffers(test_operation<T>& op) { @@ -99,7 +96,8 @@ class allgatherv_test : public base_test<T> { recv_buf, recv_counts, op.datatype, - global_data::instance().comms[0], + transport_data::instance().get_comm(), + transport_data::instance().get_stream(), attr)); } } diff --git a/tests/functional/allreduce_test.cpp b/tests/functional/allreduce_test.cpp index 200568c5c..146a54458 100644 --- a/tests/functional/allreduce_test.cpp +++ b/tests/functional/allreduce_test.cpp @@ -15,7 +15,7 @@ */ #define ALGO_SELECTION_ENV "CCL_ALLREDUCE" -#include "base_impl.hpp" +#include "test_impl.hpp" template <typename T> class allreduce_test : public base_test<T> { @@ -49,7 +49,8 @@ class allreduce_test : public base_test<T> { op.elem_count, op.datatype, op.reduction, - global_data::instance().comms[0], + transport_data::instance().get_comm(), + transport_data::instance().get_stream(), attr)); } } diff --git a/tests/functional/alltoall_test.cpp b/tests/functional/alltoall_test.cpp index 5f9bd2cfe..ed7c860af 100644 --- a/tests/functional/alltoall_test.cpp +++ b/tests/functional/alltoall_test.cpp @@ -15,7 +15,7 @@ */ #define ALGO_SELECTION_ENV "CCL_ALLTOALL" -#include "base_impl.hpp" +#include "test_impl.hpp" template <typename T> class alltoall_test : public base_test<T> { @@ -59,7 +59,8 @@ class alltoall_test : public base_test<T> { recv_buf, op.elem_count, op.datatype, - global_data::instance().comms[0], + transport_data::instance().get_comm(), + transport_data::instance().get_stream(), attr)); } } diff --git a/tests/functional/alltoallv_test.cpp b/tests/functional/alltoallv_test.cpp index 4b4c2ca02..0d4565865 100644 --- a/tests/functional/alltoallv_test.cpp +++ b/tests/functional/alltoallv_test.cpp @@ -15,7 +15,7 @@ */ #define ALGO_SELECTION_ENV "CCL_ALLTOALLV" -#include "base_impl.hpp" +#include "test_impl.hpp" template <typename T> class alltoallv_test : public base_test<T> { @@ -94,7 +94,8 @@ class alltoallv_test : public base_test<T> { recv_buf, recv_counts, op.datatype, - global_data::instance().comms[0], + transport_data::instance().get_comm(), + transport_data::instance().get_stream(), attr)); } } diff --git a/tests/functional/bcast_test.cpp b/tests/functional/bcast_test.cpp index 0b7e3eb0b..8472d7f96 100644 --- a/tests/functional/bcast_test.cpp +++ b/tests/functional/bcast_test.cpp @@ -16,7 +16,7 @@ #define ALGO_SELECTION_ENV "CCL_BCAST" #define BCAST_VALUE_COEFF 128 -#include "base_impl.hpp" +#include "test_impl.hpp" template <typename T> class bcast_test : public base_test<T> { @@ -52,7 +52,8 @@ class bcast_test : public base_test<T> { op.elem_count, op.datatype, ROOT_RANK, - global_data::instance().comms[0], + transport_data::instance().get_comm(), + transport_data::instance().get_stream(), attr)); } } diff --git a/tests/functional/conf.cpp b/tests/functional/conf.cpp index 5199d4f31..cee7b2ae3 100644 --- a/tests/functional/conf.cpp +++ b/tests/functional/conf.cpp 
@@ -232,6 +232,12 @@ void init_test_dims() { void init_test_params() { init_test_dims(); +#ifdef CCL_ENABLE_SYCL + printf("FUNC_TESTS: CCL_ENABLE_SYCL ON\n"); +#endif + printf("FUNC_TESTS: BF16 enabled %d\n", is_bf16_enabled()); + printf("FUNC_TESTS: FP16 enabled %d\n", is_fp16_enabled()); + for (auto data_type = first_data_type; data_type < last_data_type; data_type++) { if (should_skip_datatype(data_type)) continue; @@ -303,10 +309,9 @@ std::ostream& operator<<(std::ostream& stream, const test_param& param) { } void print_err_message(char* message, std::ostream& output) { - ccl::communicator& comm = global_data::instance().comms[0]; + auto& comm = transport_data::instance().get_service_comm(); int comm_size = comm.size(); int comm_rank = comm.rank(); - size_t message_len = strlen(message); std::vector<size_t> message_lens(comm_size, 0); std::vector<size_t> recv_counts(comm_size, 1); @@ -326,11 +331,3 @@ void print_err_message(char* message, std::ostream& output) { output << messages.data(); } } - -void mpi_finalize() { - int is_finalized = 0; - MPI_Finalized(&is_finalized); - - if (!is_finalized) - MPI_Finalize(); -} diff --git a/tests/functional/conf.hpp b/tests/functional/conf.hpp index 7e04a7999..1d7435fdd 100644 --- a/tests/functional/conf.hpp +++ b/tests/functional/conf.hpp @@ -175,4 +175,3 @@ ccl::reduction get_ccl_reduction(const test_param& param); void init_test_dims(); void init_test_params(); void print_err_message(char* err_message, std::ostream& output); -void mpi_finalize(); diff --git a/tests/functional/lp.cpp b/tests/functional/lp.cpp index 1d4b482b9..8e7b28e6e 100644 --- a/tests/functional/lp.cpp +++ b/tests/functional/lp.cpp @@ -30,7 +30,6 @@ int is_fp16_enabled() { : "a"(1)); is_fp16_enabled = (reg[2] & (1 << 29)) >> 29; } - printf("FUNC_TESTS: FP16 compiler, is_fp16_enabled %d\n", is_fp16_enabled); return is_fp16_enabled; #else printf("FUNC_TESTS: no FP16 compiler\n"); @@ -50,7 +49,6 @@ int is_bf16_enabled() { is_bf16_enabled = ((reg[1] & (1 << 16)) >> 16) & ((reg[1] & (1 << 30)) >> 30) & ((reg[1] & (1 << 31)) >> 31); } - printf("FUNC_TESTS: BF16 compiler, is_bf16_enabled %d\n", is_bf16_enabled); return is_bf16_enabled; #else printf("FUNC_TESTS: no BF16 compiler\n"); @@ -97,7 +95,7 @@ void convert_fp16_to_fp32(const void* src, void* dst) { void convert_fp32_to_bf16(const void* src, void* dst) { #ifdef CCL_BF16_AVX512BF_COMPILER if (is_avx512bf_enabled()) - _mm256_storeu_si256((__m256i*)(dst), _mm512_cvtneps_pbh(_mm512_loadu_ps(src))); + _mm256_storeu_si256((__m256i*)(dst), (__m256i)_mm512_cvtneps_pbh(_mm512_loadu_ps(src))); else #endif _mm256_storeu_si256((__m256i*)(dst), diff --git a/tests/functional/lp.hpp b/tests/functional/lp.hpp index a34a6d43b..9396d704e 100644 --- a/tests/functional/lp.hpp +++ b/tests/functional/lp.hpp @@ -20,8 +20,8 @@ #endif #include <math.h> -#include "base.hpp" #include "conf.hpp" +#include "test.hpp" template <typename T> struct test_operation; @@ -40,13 +40,13 @@ int is_bf16_enabled(); int is_avx512bf_enabled(); #ifdef CCL_FP16_TARGET_ATTRIBUTES -void convert_fp32_to_fp16(const void* src, void* dst) __attribute__((target("f16c,avx512f"))); +void convert_fp32_to_fp16(const void* src, void* dst) __attribute__((target("f16c"))); #else void convert_fp32_to_fp16(const void* src, void* dst); #endif #ifdef CCL_FP16_TARGET_ATTRIBUTES -void convert_fp16_to_fp32(const void* src, void* dst) __attribute__((target("f16c,avx512f"))); +void convert_fp16_to_fp32(const void* src, void* dst) __attribute__((target("f16c"))); #else void 
convert_fp16_to_fp32(const void* src, void* dst); #endif diff --git a/tests/functional/lp_impl.hpp b/tests/functional/lp_impl.hpp index 0d4c2dd76..d7b6f5173 100644 --- a/tests/functional/lp_impl.hpp +++ b/tests/functional/lp_impl.hpp @@ -18,7 +18,7 @@ template <typename T> void convert_fp32_to_lp_arrays(T* buf, short* lp_buf, size_t count, ccl_data_type dtype) { size_t floats_in_reg = (dtype == DATATYPE_BFLOAT16) ? FLOATS_IN_M512 : FLOATS_IN_M256; - short tail[FLOATS_IN_M512] = { 0 }; + short tail[floats_in_reg]; for (size_t i = 0; i < count; i += floats_in_reg) { if (i / floats_in_reg == count / floats_in_reg) { @@ -36,7 +36,7 @@ void convert_fp32_to_lp_arrays(T* buf, short* lp_buf, size_t count, ccl_data_typ template <typename T> void convert_lp_to_fp32_arrays(short* lp_buf, T* buf, size_t count, ccl_data_type dtype) { size_t floats_in_reg = (dtype == DATATYPE_BFLOAT16) ? FLOATS_IN_M512 : FLOATS_IN_M256; - T tail[FLOATS_IN_M512] = { 0 }; + T tail[floats_in_reg]; for (size_t i = 0; i < count; i += floats_in_reg) { if (i / floats_in_reg == count / floats_in_reg) { @@ -55,16 +55,9 @@ template <typename T> void make_lp_prologue(test_operation<T>& op, size_t count) { ccl_data_type dtype = op.param.datatype; for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) { - if (op.param.place_type == PLACE_IN) { - T* recv_buf_fp32 = op.recv_bufs[buf_idx].data(); - short* recv_bufs_lp = op.recv_bufs_lp[buf_idx].data(); - convert_fp32_to_lp_arrays(recv_buf_fp32, recv_bufs_lp, count, dtype); - } - else { - T* send_buf_fp32 = op.send_bufs[buf_idx].data(); - short* send_bufs_lp = op.send_bufs_lp[buf_idx].data(); - convert_fp32_to_lp_arrays(send_buf_fp32, send_bufs_lp, count, dtype); - } + T* buf = (op.param.place_type == PLACE_IN) ? op.recv_bufs[buf_idx].data() + : op.send_bufs[buf_idx].data(); + convert_fp32_to_lp_arrays(buf, (short*)buf, count, dtype); } } @@ -72,8 +65,7 @@ template <typename T> void make_lp_epilogue(test_operation<T>& op, size_t count) { ccl_data_type dtype = op.param.datatype; for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) { - T* recv_buf_fp32 = op.recv_bufs[buf_idx].data(); - short* recv_bufs_lp = op.recv_bufs_lp[buf_idx].data(); - convert_lp_to_fp32_arrays(recv_bufs_lp, recv_buf_fp32, count, dtype); + std::vector<T> tmp(op.recv_bufs[buf_idx]); + convert_lp_to_fp32_arrays((short*)tmp.data(), op.recv_bufs[buf_idx].data(), count, dtype); } } diff --git a/tests/functional/reduce_scatter_test.cpp b/tests/functional/reduce_scatter_test.cpp index 4c0836060..14950c444 100644 --- a/tests/functional/reduce_scatter_test.cpp +++ b/tests/functional/reduce_scatter_test.cpp @@ -15,13 +15,13 @@ */ #define ALGO_SELECTION_ENV "CCL_REDUCE_SCATTER" -#include "base_impl.hpp" +#include "test_impl.hpp" template <typename T> class reduce_scatter_test : public base_test<T> { public: int check(test_operation<T>& op) { - size_t my_rank = global_data::instance().comms[0].rank(); + int my_rank = transport_data::instance().get_rank(); for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) { for (size_t elem_idx = 0; elem_idx < op.elem_count; elem_idx += op.get_check_step(elem_idx)) { @@ -52,7 +52,8 @@ class reduce_scatter_test : public base_test<T> { op.elem_count, op.datatype, op.reduction, - global_data::instance().comms[0], + transport_data::instance().get_comm(), + transport_data::instance().get_stream(), attr)); } } diff --git a/tests/functional/reduce_test.cpp b/tests/functional/reduce_test.cpp index 91d8534d7..0fdf5ea0c 100644 --- a/tests/functional/reduce_test.cpp +++ 
b/tests/functional/reduce_test.cpp
@@ -15,7 +15,7 @@
 */
 #define ALGO_SELECTION_ENV "CCL_REDUCE"

-#include "base_impl.hpp"
+#include "test_impl.hpp"

 template <typename T>
 class reduce_test : public base_test<T> {
@@ -53,7 +53,8 @@ class reduce_test : public base_test<T> {
                                          op.datatype,
                                          op.reduction,
                                          ROOT_RANK,
-                                         global_data::instance().comms[0],
+                                         transport_data::instance().get_comm(),
+                                         transport_data::instance().get_stream(),
                                          attr));
         }
     }
diff --git a/tests/functional/regression/CMakeLists.txt b/tests/functional/regression/CMakeLists.txt
new file mode 100644
index 000000000..c17eac8a3
--- /dev/null
+++ b/tests/functional/regression/CMakeLists.txt
@@ -0,0 +1,30 @@
+#
+# Copyright 2016-2020 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+set(sources alltoallv_empty_count.cpp)
+
+set(CCL_INSTALL_TESTS "${CMAKE_CURRENT_BINARY_DIR}")
+
+message(WARNING "TEST DIR: ${CCL_INSTALL_TESTS}")
+
+foreach(src ${sources})
+    get_filename_component(executable ${src} NAME_WE)
+    add_executable(${executable} ${src})
+    target_link_libraries(${executable} PRIVATE ccl gtest_main gtest mpi)
+
+    install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_TESTS} OPTIONAL)
+    add_test (NAME ${executable} CONFIGURATIONS regression COMMAND mpiexec.hydra -l -n 3 -ppn 1 ${CCL_INSTALL_TESTS}/${executable} --gtest_output=xml:${CCL_INSTALL_TESTS}/${executable}_default_report.junit.xml)
+
+endforeach(src ${sources})
diff --git a/tests/functional/regression/alltoallv_empty_count.cpp b/tests/functional/regression/alltoallv_empty_count.cpp
new file mode 100644
index 000000000..23e8409d8
--- /dev/null
+++ b/tests/functional/regression/alltoallv_empty_count.cpp
@@ -0,0 +1,163 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include <numeric>
+#include <vector>
+#include <iostream>
+
+#include "oneapi/ccl.hpp"
+#include "gtest/gtest.h"
+#include "mpi.h"
+
+class alltoallv_test : public ::testing::Test {
+protected:
+    void SetUp() override {
+        ccl::init();
+
+        MPI_Init(NULL, NULL);
+        MPI_Comm_size(MPI_COMM_WORLD, &size);
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    }
+
+    void TearDown() override {
+        // Don't finalize if the case has failed; this
+        // could lead to a deadlock due to inconsistent state.
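+        // (MPI_Finalize is effectively collective: a rank that failed mid-collective may
+        //  hold unmatched communication state, so finalizing on it could block forever
+        //  waiting for the other ranks. Returning early and leaving cleanup to the
+        //  launcher is the safer option here.)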
+        if (HasFatalFailure()) {
+            return;
+        }
+
+        int is_finalized = 0;
+        MPI_Finalized(&is_finalized);
+
+        if (!is_finalized)
+            MPI_Finalize();
+    }
+
+    int size;
+    int rank;
+};
+
+// there are 3 ranks: rank 0 can both send and receive data to/from the others (its total send and receive counts are > 0),
+// rank 1 only sends data and doesn't receive it (its recv_count == 0 for all ranks), and rank 2 only receives data and
+// doesn't send it.
+// also, rank 1 sets its recv_buf to nullptr (it's not used anyway due to the 0 recv count); the same is done on rank 2 for the send buf.
+// the test case simply runs alltoallv with these parameters and then checks that both rank 0 and rank 2 received
+// the correct data.
+// TODO: once we add more tests, move some common parts out of this test
+TEST_F(alltoallv_test, alltoallv_empty_recv_count) {
+    const size_t count = 1000;
+
+    int i = 0;
+
+    ASSERT_EQ(size, 3) << "Test expects 3 ranks";
+
+    sycl::queue q;
+    ASSERT_TRUE(q.get_device().is_gpu())
+        << "Test expects gpu device, please use SYCL_DEVICE_FILTER accordingly";
+
+    /* create communicator */
+    auto dev = ccl::create_device(q.get_device());
+    auto ctx = ccl::create_context(q.get_context());
+    auto comm = ccl::create_communicator(size, rank, dev, ctx, /*kvs*/ {});
+
+    /* create stream */
+    auto stream = ccl::create_stream(q);
+
+    // TODO: find a proper way to choose between shared and device pointers (e.g. an env variable)
+    /* create buffers */
+    auto send_buf = sycl::malloc_device<int>(count * size, q);
+    auto recv_buf = sycl::malloc_device<int>(count * size, q);
+
+    // two special ranks: rank 1 doesn't receive anything, rank 2 doesn't send anything
+    int empty_recv_rank = 1;
+    int empty_send_rank = 2;
+
+    std::vector<size_t> send_counts(size, count);
+    std::vector<size_t> recv_counts(size, count);
+
+    // update counts so the corresponding rank doesn't receive anything and the others don't send anything to it
+    send_counts[empty_recv_rank] = 0;
+    if (rank == empty_recv_rank) {
+        std::fill(recv_counts.begin(), recv_counts.end(), 0);
+    }
+
+    recv_counts[empty_send_rank] = 0;
+    if (rank == empty_send_rank) {
+        std::fill(send_counts.begin(), send_counts.end(), 0);
+    }
+    q.memset(recv_buf, 0, count * size).wait();
+
+    std::vector<sycl::event> events;
+    size_t offset = 0;
+    for (int i = 0; i < send_counts.size(); ++i) {
+        auto e = q.submit([&](auto& h) {
+            h.parallel_for(send_counts[i], [=](auto id) {
+                send_buf[id + offset] = i + 1;
+            });
+        });
+        offset += send_counts[i];
+        events.push_back(e);
+    }
+
+    // do not wait for the kernels to complete; provide their events as dependencies for the operation
+    std::vector<ccl::event> deps;
+    for (auto e : events) {
+        deps.push_back(ccl::create_event(e));
+    }
+
+    // invoke alltoallv
+    auto attr = ccl::create_operation_attr<ccl::alltoallv_attr>();
+    int* invalid_ptr = (int*)0x00ffff;
+    // pass an invalid pointer to make sure it's correctly handled and not dereferenced due to the 0 count
+    if (rank == empty_recv_rank) {
+        recv_buf = invalid_ptr;
+    }
+    else if (rank == empty_send_rank) {
+        send_buf = invalid_ptr;
+    }
+
+    ccl::alltoallv(send_buf, send_counts, recv_buf, recv_counts, comm, stream, attr, deps).wait();
+
+    // if our rank is the one that didn't receive anything, then just exit and don't do any checking
+    if (rank == empty_recv_rank)
+        return;
+
+    size_t total_recv = std::accumulate(recv_counts.begin(), recv_counts.end(), 0);
+
+    sycl::buffer<int> check_buf(count * size);
+    q.submit([&](auto& h) {
+         sycl::accessor check_buf_acc(check_buf, h, sycl::write_only);
+         h.parallel_for(total_recv,
[=, rnk = rank](auto id) { + // we expect that size - 1 chunks are properly filled with data and the last one is + // unchanged as we have one rank that doesn't send anything + if (recv_buf[id] != rnk + 1) { + check_buf_acc[id] = -1; + } + else { + check_buf_acc[id] = 0; + } + }); + }).wait_and_throw(); + + /* print out the result of the test on the host side */ + { + sycl::host_accessor check_buf_acc(check_buf, sycl::read_only); + for (i = 0; i < total_recv; i++) { + ASSERT_NE(check_buf_acc[i], -1) << "Check failed for receive buffer"; + } + } + + return; +} diff --git a/tests/functional/base.hpp b/tests/functional/test.hpp similarity index 78% rename from tests/functional/base.hpp rename to tests/functional/test.hpp index b42f61fe1..c8c9e01c1 100644 --- a/tests/functional/base.hpp +++ b/tests/functional/test.hpp @@ -23,24 +23,7 @@ #include "oneapi/ccl.hpp" #include "conf.hpp" - -class global_data { -public: - std::vector<ccl::communicator> comms; - ccl::shared_ptr_class<ccl::kvs> kvs; - - global_data(global_data& gd) = delete; - void operator=(const global_data&) = delete; - static global_data& instance() { - static global_data gd; - return gd; - } - -protected: - global_data(){}; - ~global_data(){}; -}; - +#include "transport.hpp" #include "utils.hpp" bool is_lp_datatype(ccl_data_type dtype); @@ -62,9 +45,10 @@ struct test_operation { std::vector<std::vector<T>> send_bufs; std::vector<std::vector<T>> recv_bufs; - // buffers for 16-bits low precision datatype - std::vector<std::vector<short>> send_bufs_lp; - std::vector<std::vector<short>> recv_bufs_lp; +#ifdef CCL_ENABLE_SYCL + std::vector<void*> device_send_bufs; + std::vector<void*> device_recv_bufs; +#endif /* CCL_ENABLE_SYCL */ std::vector<ccl::event> events; ccl::string_class match_id; @@ -75,8 +59,8 @@ struct test_operation { buffer_count(get_buffer_count(param)), datatype(get_ccl_datatype(param)), reduction(get_ccl_reduction(param)) { - comm_size = global_data::instance().comms[0].size(); - comm_rank = global_data::instance().comms[0].rank(); + comm_size = transport_data::instance().get_comm().size(); + comm_rank = transport_data::instance().get_comm().rank(); buf_indexes.resize(buffer_count); } @@ -84,9 +68,7 @@ struct test_operation { void prepare_attr(coll_attr_type& coll_attr, size_t idx); std::string create_match_id(size_t buf_idx); - void change_buffer_pointers(); size_t generate_priority_value(size_t buf_idx); - void define_start_order(std::default_random_engine& rand_engine); bool complete_events(); @@ -99,17 +81,19 @@ struct test_operation { void print(std::ostream& output); void* get_send_buf(size_t buf_idx) { - if (is_lp_datatype(param.datatype)) - return static_cast<void*>(send_bufs_lp[buf_idx].data()); - else - return static_cast<void*>(send_bufs[buf_idx].data()); +#ifdef CCL_ENABLE_SYCL + return device_send_bufs[buf_idx]; +#else /* CCL_ENABLE_SYCL */ + return send_bufs[buf_idx].data(); +#endif /* CCL_ENABLE_SYCL */ } void* get_recv_buf(size_t buf_idx) { - if (is_lp_datatype(param.datatype)) - return static_cast<void*>(recv_bufs_lp[buf_idx].data()); - else - return static_cast<void*>(recv_bufs[buf_idx].data()); +#ifdef CCL_ENABLE_SYCL + return device_recv_bufs[buf_idx]; +#else /* CCL_ENABLE_SYCL */ + return recv_bufs[buf_idx].data(); +#endif /* CCL_ENABLE_SYCL */ } size_t get_check_step(size_t elem_idx) { @@ -136,10 +120,7 @@ struct test_operation { template <typename T> class base_test { public: - int global_comm_rank; - int global_comm_size; char err_message[ERR_MESSAGE_MAX_LEN]{}; - std::random_device rand_device; 
std::default_random_engine rand_engine; @@ -151,12 +132,16 @@ class base_test { void alloc_buffers_base(test_operation<T>& op); virtual void alloc_buffers(test_operation<T>& op); + void free_buffers(test_operation<T>& op); - void fill_send_buffers_base(test_operation<T>& op); virtual void fill_send_buffers(test_operation<T>& op); - - void fill_recv_buffers_base(test_operation<T>& op); virtual void fill_recv_buffers(test_operation<T>& op); + void change_buffers(test_operation<T>& op); + +#ifdef CCL_ENABLE_SYCL + void copy_to_device_send_buffers(test_operation<T>& op); + void copy_from_device_recv_buffers(test_operation<T>& op); +#endif /* CCL_ENABLE_SYCL */ virtual T calculate_reduce_value(test_operation<T>& op, size_t buf_idx, size_t elem_idx); diff --git a/tests/functional/base_impl.hpp b/tests/functional/test_impl.hpp similarity index 65% rename from tests/functional/base_impl.hpp rename to tests/functional/test_impl.hpp index f8a6bb672..b37a85d0e 100644 --- a/tests/functional/base_impl.hpp +++ b/tests/functional/test_impl.hpp @@ -17,12 +17,33 @@ #include <math.h> -#include "base.hpp" #include "lp.hpp" +#include "test.hpp" +#include "transport.hpp" #define FIRST_FP_COEFF (0.1) #define SECOND_FP_COEFF (0.01) +#ifdef CCL_ENABLE_SYCL +void* alloc_buffer(size_t bytes) { + auto& allocator = transport_data::instance().get_allocator(); + return allocator.allocate(bytes, sycl::usm::alloc::device); +} + +void free_buffer(void* ptr) { + auto& allocator = transport_data::instance().get_allocator(); + allocator.deallocate(static_cast<char*>(ptr)); +} + +void copy_buffer(void* dst, void* src, size_t bytes) { + transport_data::instance().get_stream().get_native().memcpy(dst, src, bytes).wait(); +} + +void fill_buffer(void* ptr, int value, size_t bytes) { + transport_data::instance().get_stream().get_native().memset(ptr, value, bytes).wait(); +} +#endif /* CCL_ENABLE_SYCL */ + template <typename T> template <class coll_attr_type> void test_operation<T>::prepare_attr(coll_attr_type& attr, size_t idx) { @@ -118,24 +139,6 @@ bool test_operation<T>::complete_event(ccl::event& e) { } } -template <typename T> -void test_operation<T>::change_buffer_pointers() { - char* dynamic_pointer_env = getenv("CCL_TEST_DYNAMIC_POINTER"); - if (dynamic_pointer_env && atoi(dynamic_pointer_env) == 1) { - /* - create deep copy of vector with buffers and swap it with original one - as result buffers in updated vector will have original content - but in new memory locations - */ - if (comm_rank % 2) { - std::vector<std::vector<T>>(send_bufs.begin(), send_bufs.end()).swap(send_bufs); - } - else { - std::vector<std::vector<T>>(recv_bufs.begin(), recv_bufs.end()).swap(recv_bufs); - } - } -} - template <typename T> size_t test_operation<T>::generate_priority_value(size_t buf_idx) { return buf_idx++; @@ -152,8 +155,6 @@ void test_operation<T>::print(std::ostream& output) { template <typename T> base_test<T>::base_test() { - global_comm_rank = global_data::instance().comms[0].rank(); - global_comm_size = global_data::instance().comms[0].size(); memset(err_message, '\0', ERR_MESSAGE_MAX_LEN); rand_engine = std::default_random_engine{ rand_device() }; } @@ -204,39 +205,41 @@ int base_test<T>::check_error(test_operation<T>& op, T expected, size_t buf_idx, return TEST_SUCCESS; } +template <typename T> +void base_test<T>::free_buffers(test_operation<T>& op) { + op.send_bufs.clear(); + op.recv_bufs.clear(); + +#ifdef CCL_ENABLE_SYCL + for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) { + 
free_buffer(op.device_send_bufs[buf_idx]);
+        free_buffer(op.device_recv_bufs[buf_idx]);
+    }
+#endif /* CCL_ENABLE_SYCL */
+}
+
 template <typename T>
 void base_test<T>::alloc_buffers_base(test_operation<T>& op) {
     op.send_bufs.resize(op.buffer_count);
     op.recv_bufs.resize(op.buffer_count);
 
-    if (is_lp_datatype(op.param.datatype)) {
-        op.send_bufs_lp.resize(op.buffer_count);
-        op.recv_bufs_lp.resize(op.buffer_count);
-    }
-
     for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
         op.send_bufs[buf_idx].resize(op.elem_count * op.comm_size);
         op.recv_bufs[buf_idx].resize(op.elem_count * op.comm_size);
+    }
 
-        if (is_lp_datatype(op.param.datatype)) {
-            op.send_bufs_lp[buf_idx].resize(op.elem_count * op.comm_size);
-            op.recv_bufs_lp[buf_idx].resize(op.elem_count * op.comm_size);
-        }
+#ifdef CCL_ENABLE_SYCL
+    op.device_send_bufs.resize(op.buffer_count);
+    op.device_recv_bufs.resize(op.buffer_count);
+    for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
+        op.device_send_bufs[buf_idx] = alloc_buffer(op.elem_count * sizeof(T) * op.comm_size);
+        op.device_recv_bufs[buf_idx] = alloc_buffer(op.elem_count * sizeof(T) * op.comm_size);
     }
+#endif /* CCL_ENABLE_SYCL */
 }
 
 template <typename T>
 void base_test<T>::alloc_buffers(test_operation<T>& op) {}
 
-template <typename T>
-void base_test<T>::fill_send_buffers_base(test_operation<T>& op) {
-    if (!is_lp_datatype(op.param.datatype))
-        return;
-
-    for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
-        std::fill(op.send_bufs_lp[buf_idx].begin(), op.send_bufs_lp[buf_idx].end(), (T)SOME_VALUE);
-    }
-}
-
 template <typename T>
 void base_test<T>::fill_send_buffers(test_operation<T>& op) {
     for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
@@ -251,7 +254,7 @@ void base_test<T>::fill_send_buffers(test_operation<T>& op) {
 }
 
 template <typename T>
-void base_test<T>::fill_recv_buffers_base(test_operation<T>& op) {
+void base_test<T>::fill_recv_buffers(test_operation<T>& op) {
     for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
         if (op.param.place_type == PLACE_IN) {
             std::copy(op.send_bufs[buf_idx].begin(),
@@ -261,15 +264,9 @@ void base_test<T>::fill_recv_buffers_base(test_operation<T>& op) {
         else {
             std::fill(op.recv_bufs[buf_idx].begin(), op.recv_bufs[buf_idx].end(), (T)SOME_VALUE);
         }
-        if (is_lp_datatype(op.param.datatype)) {
-            std::fill(op.recv_bufs_lp[buf_idx].begin(), op.recv_bufs_lp[buf_idx].end(), SOME_VALUE);
-        }
     }
 }
 
-template <typename T>
-void base_test<T>::fill_recv_buffers(test_operation<T>& op) {}
-
 template <typename T>
 T base_test<T>::calculate_reduce_value(test_operation<T>& op, size_t buf_idx, size_t elem_idx) {
     T expected = 0;
@@ -329,50 +326,142 @@ float base_test<float>::calculate_reduce_value(test_operation<float>& op,
     return expected;
 }
 
+template <typename T>
+void base_test<T>::change_buffers(test_operation<T>& op) {
+    char* dynamic_pointer_env = getenv("CCL_TEST_DYNAMIC_POINTER");
+    if (dynamic_pointer_env && atoi(dynamic_pointer_env) == 1) {
+        void* send_buf = op.send_bufs[0].data();
+        void* recv_buf = op.recv_bufs[0].data();
+        /*
+           create a deep copy of the buffer vector and swap it with the original one;
+           as a result, the buffers in the updated vector keep their original content
+           but live at new memory locations
+        */
+        std::vector<std::vector<T>>(op.send_bufs.begin(), op.send_bufs.end()).swap(op.send_bufs);
+        std::vector<std::vector<T>>(op.recv_bufs.begin(), op.recv_bufs.end()).swap(op.recv_bufs);
+        void* new_send_buf = op.send_bufs[0].data();
+        void* new_recv_buf = op.recv_bufs[0].data();
+        ASSERT(send_buf != new_send_buf, "send buffers should differ");
+        ASSERT(recv_buf != new_recv_buf, "recv buffers should differ");
+
+#ifdef CCL_ENABLE_SYCL
+        /* do regular reallocation */
+        void* device_send_buf = op.device_send_bufs[0];
+        void* device_recv_buf = op.device_recv_bufs[0];
+        std::vector<void*> new_device_send_bufs(op.buffer_count);
+        std::vector<void*> new_device_recv_bufs(op.buffer_count);
+        for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
+            new_device_send_bufs[buf_idx] = alloc_buffer(op.elem_count * sizeof(T) * op.comm_size);
+            new_device_recv_bufs[buf_idx] = alloc_buffer(op.elem_count * sizeof(T) * op.comm_size);
+        }
+        for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
+            free_buffer(op.device_send_bufs[buf_idx]);
+            free_buffer(op.device_recv_bufs[buf_idx]);
+            op.device_send_bufs[buf_idx] = new_device_send_bufs[buf_idx];
+            op.device_recv_bufs[buf_idx] = new_device_recv_bufs[buf_idx];
+        }
+        void* new_device_send_buf = op.device_send_bufs[0];
+        void* new_device_recv_buf = op.device_recv_bufs[0];
+        ASSERT(device_send_buf != new_device_send_buf, "device send buffers should differ");
+        ASSERT(device_recv_buf != new_device_recv_buf, "device recv buffers should differ");
+#endif /* CCL_ENABLE_SYCL */
+    }
+}
+
+#ifdef CCL_ENABLE_SYCL
+
+template <typename T>
+void base_test<T>::copy_to_device_send_buffers(test_operation<T>& op) {
+    for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
+#ifdef TEST_CCL_BCAST
+        void* host_buf = op.recv_bufs[buf_idx].data();
+        void* device_buf = op.device_recv_bufs[buf_idx];
+#else /* TEST_CCL_BCAST */
+        void* host_buf = (op.param.place_type == PLACE_IN) ? op.recv_bufs[buf_idx].data()
+                                                           : op.send_bufs[buf_idx].data();
+        void* device_buf = (op.param.place_type == PLACE_IN) ? op.device_recv_bufs[buf_idx]
+                                                             : op.device_send_bufs[buf_idx];
+#endif /* TEST_CCL_BCAST */
+        size_t bytes = op.send_bufs[buf_idx].size() * sizeof(T);
+        copy_buffer(device_buf, host_buf, bytes);
+    }
+}
+
+template <typename T>
+void base_test<T>::copy_from_device_recv_buffers(test_operation<T>& op) {
+    for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
+        copy_buffer(op.recv_bufs[buf_idx].data(),
+                    op.device_recv_bufs[buf_idx],
+                    op.recv_bufs[buf_idx].size() * sizeof(T));
+    }
+}
+#endif /* CCL_ENABLE_SYCL */
+
 template <typename T>
 int base_test<T>::run(test_operation<T>& op) {
-    size_t result = 0;
+    size_t iter = 0, result = 0;
     char* algo = getenv(ALGO_SELECTION_ENV);
     if (algo)
         std::cout << ALGO_SELECTION_ENV << " = " << algo << "\n";
     std::cout << op.param << "\n";
 
-    for (size_t iter = 0; iter < ITER_COUNT; iter++) {
-        try {
-            alloc_buffers_base(op);
-            alloc_buffers(op);
-
-            fill_send_buffers_base(op);
-            fill_send_buffers(op);
-
-            fill_recv_buffers_base(op);
-            fill_recv_buffers(op);
-
+    /*
+       Buffer management logic for a single operation.
+       SYCL-specific logic is marked with (*)
+       LP-specific logic is marked with (**)
+
+       1. alloc host send and recv buffers
+       2. alloc device send and recv buffers (*)
+       3. fill host send and recv buffers
+       4. do in-place FP32->LP cast for host send buffer (**)
+       5. copy from host send buffer into device send buffer (*)
+       6. invoke comm operation on host or device (*) send and recv buffers
+       7. copy device recv buffer into host recv buffer (*)
+       8. do in-place LP->FP32 cast for host recv buffer (**)
+       9. check result correctness on host recv buffer
+       10. free host send and recv buffers
+       11. free device send and recv buffers (*)
+    */
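To make the numbered flow concrete, here is a compact, self-contained sketch of how steps 1-11 compose on the SYCL path; the collective call (step 6) and the LP casts (steps 4 and 8) are elided, and run_once_sketch with its parameters is an illustrative name rather than part of the patch:

    // One iteration of the buffer flow above, SYCL path only (sketch).
    #include <cstddef>
    #include <vector>
    #include <CL/sycl.hpp>

    template <typename T>
    void run_once_sketch(sycl::queue& q, size_t elem_count, int comm_size) {
        size_t count = elem_count * comm_size;
        std::vector<T> host_send(count, T(1)), host_recv(count, T(0)); // steps 1, 3
        T* dev_send = sycl::malloc_device<T>(count, q); // step 2 (*)
        T* dev_recv = sycl::malloc_device<T>(count, q);
        q.memcpy(dev_send, host_send.data(), count * sizeof(T)).wait(); // step 5 (*)
        // step 6: the collective would run here on dev_send / dev_recv
        q.memcpy(host_recv.data(), dev_recv, count * sizeof(T)).wait(); // step 7 (*)
        // step 9: correctness is checked against host_recv
        sycl::free(dev_send, q); // step 11 (*)
        sycl::free(dev_recv, q);
    } // host vectors release their storage on scope exit (step 10)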
+
+    try {
+        alloc_buffers_base(op);
+        alloc_buffers(op);
+        for (iter = 0; iter < ITER_COUNT; iter++) {
             if (iter > 0) {
-                op.change_buffer_pointers();
+                change_buffers(op);
             }
-            op.define_start_order(rand_engine);
+            fill_send_buffers(op);
+            fill_recv_buffers(op);
 
             if (is_lp_datatype(op.param.datatype)) {
                 make_lp_prologue(op, op.comm_size * op.elem_count);
             }
 
-            run_derived(op);
+#ifdef CCL_ENABLE_SYCL
+            copy_to_device_send_buffers(op);
+#endif /* CCL_ENABLE_SYCL */
+
+            op.define_start_order(rand_engine);
+            run_derived(op);
             op.complete_events();
 
+#ifdef CCL_ENABLE_SYCL
+            copy_from_device_recv_buffers(op);
+#endif /* CCL_ENABLE_SYCL */
+
             if (is_lp_datatype(op.param.datatype)) {
                 make_lp_epilogue(op, op.comm_size * op.elem_count);
             }
 
             result += check(op);
         }
-        catch (const std::exception& ex) {
-            result += TEST_FAILURE;
-            printf("WARNING! %s iter number: %zu", ex.what(), iter);
-        }
+        free_buffers(op);
+    }
+    catch (const std::exception& ex) {
+        result += TEST_FAILURE;
+        printf("WARNING! %s, iter number: %zu\n", ex.what(), iter);
     }
 
     return result;
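For orientation between the two file diffs: the harness above expects each collective's test to supply run_derived, launching one operation per buffer with the pointers chosen by get_send_buf/get_recv_buf and queuing the returned events for complete_events. A hypothetical sketch in the style of the per-collective tests follows; the class name is invented, the ccl::allreduce overload with explicit datatype, stream, and attr arguments is assumed from the public oneCCL API, and host-only builds would use the overload without the stream argument:

    // Illustrative only, not part of the patch.
    template <typename T>
    class example_allreduce_test : public base_test<T> {
    public:
        void run_derived(test_operation<T>& op) {
            auto attr = ccl::create_operation_attr<ccl::allreduce_attr>();
            auto& comm = transport_data::instance().get_comm();
            auto& stream = transport_data::instance().get_stream();
            for (auto buf_idx : op.buf_indexes) {
                op.prepare_attr(attr, buf_idx);
                // get_send_buf/get_recv_buf return device USM pointers under
                // CCL_ENABLE_SYCL and host vector data() otherwise
                op.events.push_back(ccl::allreduce(op.get_send_buf(buf_idx),
                                                   op.get_recv_buf(buf_idx),
                                                   op.elem_count,
                                                   op.datatype,
                                                   op.reduction,
                                                   comm,
                                                   stream,
                                                   attr));
            }
        }
    };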
diff --git a/tests/functional/transport.cpp b/tests/functional/transport.cpp
new file mode 100644
index 000000000..b6b29a9bd
--- /dev/null
+++ b/tests/functional/transport.cpp
@@ -0,0 +1,144 @@
+/*
+    Copyright 2016-2020 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+#include <mpi.h>
+
+#ifdef CCL_ENABLE_SYCL
+#include <CL/sycl.hpp>
+#endif /* CCL_ENABLE_SYCL */
+
+#include "transport.hpp"
+
+transport_data::transport_data() {
+    init_by_mpi();
+
+    service_comms.push_back(ccl::create_communicator(size, rank, kvs));
+}
+
+transport_data::~transport_data() {
+    deinit_by_mpi();
+}
+
+transport_data& transport_data::instance() {
+    static transport_data inst;
+    return inst;
+}
+
+void transport_data::init_by_mpi() {
+    ccl::init();
+
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    ccl::shared_ptr_class<ccl::kvs> kvs_candidate;
+    ccl::kvs::address_type main_addr;
+    if (rank == 0) {
+        kvs_candidate = ccl::create_main_kvs();
+        main_addr = kvs_candidate->get_address();
+        MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+    }
+    else {
+        MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+        kvs_candidate = ccl::create_kvs(main_addr);
+    }
+    kvs = kvs_candidate;
+    init_comms();
+}
+
+void transport_data::deinit_by_mpi() {
+    int is_finalized = 0;
+    MPI_Finalized(&is_finalized);
+    if (!is_finalized) {
+        MPI_Finalize();
+    }
+}
+
+void transport_data::init_comms() {
+    std::vector<int> local_ranks;
+    for (int idx = 0; idx < ranks_per_proc; idx++) {
+        local_ranks.push_back(rank * ranks_per_proc + idx);
+    }
+
+    ccl::context context = ccl::create_context();
+    std::vector<ccl::device> devices;
+    std::map<int, ccl::device> r2d_map;
+
+#ifdef CCL_ENABLE_SYCL
+    auto sycl_queues = create_sycl_queues("gpu", local_ranks);
+    ASSERT(!sycl_queues.empty(), "queues should contain at least one queue");
+    ASSERT(ranks_per_proc == sycl_queues.size(), "ranks and queues sizes should match");
+
+    auto sycl_context = sycl_queues[0].get_context();
+    context = ccl::create_context(sycl_context);
+
+    for (int idx = 0; idx < ranks_per_proc; idx++) {
+        streams.push_back(ccl::create_stream(sycl_queues[idx]));
+        devices.push_back(ccl::create_device(sycl_queues[idx].get_device()));
+        allocators.push_back(buf_allocator<char>(streams[0].get_native()));
+    }
+#else /* CCL_ENABLE_SYCL */
+    for (int idx = 0; idx < ranks_per_proc; idx++) {
+        streams.push_back(ccl::create_stream());
+        devices.push_back(ccl::create_device());
+    }
+#endif /* CCL_ENABLE_SYCL */
+
+    for (int idx = 0; idx < ranks_per_proc; idx++) {
+        r2d_map.emplace(local_ranks[idx], devices[idx]);
+    }
+
+    comms = ccl::create_communicators(size * ranks_per_proc, r2d_map, context, kvs);
+
+    ASSERT((int)comms.size() == ranks_per_proc,
+           "unexpected comms size %zu, expected %d",
+           comms.size(),
+           ranks_per_proc);
+}
+
+void transport_data::reset_comms() {
+    comms.clear();
+    service_comms.clear();
+}
+
+int transport_data::get_rank() const noexcept {
+    return rank;
+}
+
+int transport_data::get_size() const noexcept {
+    return size;
+}
+
+ccl::shared_ptr_class<ccl::kvs> transport_data::get_kvs() {
+    return kvs;
+}
+
+ccl::communicator& transport_data::get_comm() {
+    return comms[0];
+}
+
+ccl::communicator& transport_data::get_service_comm() {
+    return service_comms[0];
+}
+
+ccl::stream& transport_data::get_stream() {
+    return streams[0];
+}
+
+#ifdef CCL_ENABLE_SYCL
+buf_allocator<char>& transport_data::get_allocator() {
+    return allocators[0];
+}
+#endif /* CCL_ENABLE_SYCL */
diff --git a/tests/functional/transport.hpp b/tests/functional/transport.hpp
new file mode 100644
index 000000000..f7287064e
--- /dev/null
+++ b/tests/functional/transport.hpp
@@ -0,0 +1,66 @@
+/*
+    Copyright 2016-2020 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+#pragma once
+
+#include <map>
+#include <vector>
+
+#include "base.hpp"
+#include "oneapi/ccl.hpp"
+#ifdef CCL_ENABLE_SYCL
+#include "sycl_base.hpp"
+#endif /* CCL_ENABLE_SYCL */
+
+class transport_data {
+public:
+    static transport_data& instance();
+
+    void init_comms();
+    void reset_comms();
+
+    int get_rank() const noexcept;
+    int get_size() const noexcept;
+
+    ccl::shared_ptr_class<ccl::kvs> get_kvs();
+    ccl::communicator& get_comm();
+    ccl::communicator& get_service_comm();
+    ccl::stream& get_stream();
+
+#ifdef CCL_ENABLE_SYCL
+    buf_allocator<char>& get_allocator();
+#endif /* CCL_ENABLE_SYCL */
+
+private:
+    transport_data();
+    ~transport_data();
+
+    void init_by_mpi();
+    void deinit_by_mpi();
+
+    int rank;
+    int size;
+
+    ccl::shared_ptr_class<ccl::kvs> kvs;
+    std::vector<ccl::communicator> comms;
+    std::vector<ccl::communicator> service_comms;
+    std::vector<ccl::stream> streams;
+
+#ifdef CCL_ENABLE_SYCL
+    std::vector<buf_allocator<char>> allocators;
+#endif /* CCL_ENABLE_SYCL */
+
+    const int ranks_per_proc = 1;
+};
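With the singleton above, call sites no longer thread communicators and KVS handles through by hand; everything hangs off the process-wide transport_data. A minimal usage sketch relying only on the accessors declared in this header (the buffer size and the collective call are placeholders, and the allocate/deallocate calls mirror the alloc_buffer/free_buffer helpers from test_impl.hpp):

    // Illustrative only, not part of the patch.
    #include "transport.hpp"

    void example_usage() {
        auto& transport = transport_data::instance();
        ccl::communicator& comm = transport.get_comm();
        (void)comm; // a test would pass this to each collective call

    #ifdef CCL_ENABLE_SYCL
        // Device path: the stream wraps a SYCL queue, the allocator hands out USM.
        ccl::stream& stream = transport.get_stream();
        auto& allocator = transport.get_allocator();
        void* device_buf = allocator.allocate(1024, sycl::usm::alloc::device);
        // ... enqueue collectives on device_buf using comm and stream ...
        (void)stream;
        allocator.deallocate(static_cast<char*>(device_buf));
    #endif /* CCL_ENABLE_SYCL */
    }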
diff --git a/tests/functional/utils.hpp b/tests/functional/utils.hpp
index 0298123ca..494381865 100644
--- a/tests/functional/utils.hpp
+++ b/tests/functional/utils.hpp
@@ -22,7 +22,6 @@
 #include <sstream>
 #include <stdlib.h>
 #include <string>
-#include <sys/syscall.h>
 
 #include "gtest/gtest.h"
 
@@ -38,48 +37,24 @@
 #define ITER_COUNT 2
 #define ERR_MESSAGE_MAX_LEN 180
-#define TIMEOUT 30
-
-#define GETTID() syscall(SYS_gettid)
+#define TIMEOUT 30
 
 #define UNUSED_ATTR __attribute__((unused))
 
 #define TEST_SUCCESS 0
 #define TEST_FAILURE 1
 
-#if 0
-
+#ifndef PRINT
 #define PRINT(fmt, ...) \
     do { \
         fflush(stdout); \
         printf("\n(%ld): %s: " fmt "\n", GETTID(), __FUNCTION__, ##__VA_ARGS__); \
         fflush(stdout); \
     } while (0)
-
-#define PRINT_BUFFER(buf, bufSize, prefix) \
-    do { \
-        std::string strToPrint; \
-        for (size_t idx = 0; idx < bufSize; idx++) { \
-            strToPrint += std::to_string(buf[idx]); \
-            if (idx != bufSize - 1) \
-                strToPrint += ", "; \
-        } \
-        strToPrint = std::string(prefix) + strToPrint; \
-        PRINT("%s", strToPrint.c_str()); \
-    } while (0)
-
-#else /* ENABLE_DEBUG */
-
-#define PRINT(fmt, ...) 
\ - {} -#define PRINT_BUFFER(buf, bufSize, prefix) \ - {} - -#endif /* ENABLE_DEBUG */ +#endif /* PRINT */ #define OUTPUT_NAME_ARG "--gtest_output=" -#define PATCH_OUTPUT_NAME_ARG(argc, argv) \ +#define PATCH_OUTPUT_NAME_ARG(argc, argv, comm) \ do { \ - auto& comm = gd.comms[0]; \ if (comm.size() > 1) { \ for (int idx = 1; idx < argc; idx++) { \ if (strstr(argv[idx], OUTPUT_NAME_ARG)) { \ @@ -115,8 +90,8 @@ int result = className.run(op); \ int result_final = 0; \ static int glob_idx = 0; \ - auto& comm = global_data::instance().comms[0]; \ - ccl::allreduce(&result, &result_final, 1, ccl::reduction::sum, comm).wait(); \ + auto& service_comm = transport_data::instance().get_service_comm(); \ + ccl::allreduce(&result, &result_final, 1, ccl::reduction::sum, service_comm).wait(); \ if (result_final > 0) { \ print_err_message(className.get_err_message(), output); \ if (op.comm_rank == 0) { \ @@ -139,50 +114,15 @@ return TEST_SUCCESS; \ } -#define ASSERT(cond, fmt, ...) \ - do { \ - if (!(cond)) { \ - fprintf(stderr, \ - "(%ld): %s:%s:%d: ASSERT '%s' FAILED: " fmt "\n", \ - GETTID(), \ - __FILE__, \ - __FUNCTION__, \ - __LINE__, \ - #cond, \ - ##__VA_ARGS__); \ - fflush(stderr); \ - exit(0); \ - } \ - } while (0) - #define MAIN_FUNCTION() \ int main(int argc, char** argv, char* envs[]) { \ init_test_params(); \ - ccl::init(); \ - int mpi_inited = 0; \ - MPI_Initialized(&mpi_inited); \ - if (!mpi_inited) { \ - MPI_Init(NULL, NULL); \ - } \ - atexit(mpi_finalize); \ - int size, rank; \ - MPI_Comm_size(MPI_COMM_WORLD, &size); \ - MPI_Comm_rank(MPI_COMM_WORLD, &rank); \ - ccl::kvs::address_type main_addr; \ - auto& gd = global_data::instance(); \ - if (rank == 0) { \ - gd.kvs = ccl::create_main_kvs(); \ - main_addr = gd.kvs->get_address(); \ - MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD); \ - } \ - else { \ - MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD); \ - gd.kvs = ccl::create_kvs(main_addr); \ - } \ - gd.comms.emplace_back(ccl::create_communicator(size, rank, gd.kvs)); \ - PATCH_OUTPUT_NAME_ARG(argc, argv); \ + auto& transport = transport_data::instance(); \ + auto& service_comm = transport.get_service_comm(); \ + PATCH_OUTPUT_NAME_ARG(argc, argv, service_comm); \ testing::InitGoogleTest(&argc, argv); \ int res = RUN_ALL_TESTS(); \ + transport.reset_comms(); \ return res; \ } diff --git a/tests/functional/googletest-release-1.8.1/CMakeLists.txt b/tests/googletest-release-1.8.1/CMakeLists.txt similarity index 100% rename from tests/functional/googletest-release-1.8.1/CMakeLists.txt rename to tests/googletest-release-1.8.1/CMakeLists.txt diff --git a/tests/functional/googletest-release-1.8.1/CONTRIBUTING.md b/tests/googletest-release-1.8.1/CONTRIBUTING.md similarity index 100% rename from tests/functional/googletest-release-1.8.1/CONTRIBUTING.md rename to tests/googletest-release-1.8.1/CONTRIBUTING.md diff --git a/tests/functional/googletest-release-1.8.1/LICENSE b/tests/googletest-release-1.8.1/LICENSE similarity index 100% rename from tests/functional/googletest-release-1.8.1/LICENSE rename to tests/googletest-release-1.8.1/LICENSE diff --git a/tests/functional/googletest-release-1.8.1/Makefile.am b/tests/googletest-release-1.8.1/Makefile.am similarity index 100% rename from tests/functional/googletest-release-1.8.1/Makefile.am rename to tests/googletest-release-1.8.1/Makefile.am diff --git a/tests/functional/googletest-release-1.8.1/README.md b/tests/googletest-release-1.8.1/README.md similarity index 100% rename from 
tests/functional/googletest-release-1.8.1/README.md rename to tests/googletest-release-1.8.1/README.md diff --git a/tests/functional/googletest-release-1.8.1/googletest/CMakeLists.txt b/tests/googletest-release-1.8.1/googletest/CMakeLists.txt similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/CMakeLists.txt rename to tests/googletest-release-1.8.1/googletest/CMakeLists.txt diff --git a/tests/functional/googletest-release-1.8.1/googletest/Makefile.am b/tests/googletest-release-1.8.1/googletest/Makefile.am similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/Makefile.am rename to tests/googletest-release-1.8.1/googletest/Makefile.am diff --git a/tests/functional/googletest-release-1.8.1/googletest/README.md b/tests/googletest-release-1.8.1/googletest/README.md similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/README.md rename to tests/googletest-release-1.8.1/googletest/README.md diff --git a/tests/functional/googletest-release-1.8.1/googletest/cmake/Config.cmake.in b/tests/googletest-release-1.8.1/googletest/cmake/Config.cmake.in similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/cmake/Config.cmake.in rename to tests/googletest-release-1.8.1/googletest/cmake/Config.cmake.in diff --git a/tests/functional/googletest-release-1.8.1/googletest/cmake/gtest.pc.in b/tests/googletest-release-1.8.1/googletest/cmake/gtest.pc.in similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/cmake/gtest.pc.in rename to tests/googletest-release-1.8.1/googletest/cmake/gtest.pc.in diff --git a/tests/functional/googletest-release-1.8.1/googletest/cmake/gtest_main.pc.in b/tests/googletest-release-1.8.1/googletest/cmake/gtest_main.pc.in similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/cmake/gtest_main.pc.in rename to tests/googletest-release-1.8.1/googletest/cmake/gtest_main.pc.in diff --git a/tests/functional/googletest-release-1.8.1/googletest/cmake/internal_utils.cmake b/tests/googletest-release-1.8.1/googletest/cmake/internal_utils.cmake similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/cmake/internal_utils.cmake rename to tests/googletest-release-1.8.1/googletest/cmake/internal_utils.cmake diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-death-test.h b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest-death-test.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-death-test.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest-death-test.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-message.h b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest-message.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-message.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest-message.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-param-test.h b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest-param-test.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-param-test.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest-param-test.h diff --git 
a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-param-test.h.pump b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest-param-test.h.pump similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-param-test.h.pump rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest-param-test.h.pump diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-printers.h b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest-printers.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-printers.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest-printers.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-spi.h b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest-spi.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-spi.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest-spi.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-test-part.h b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest-test-part.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-test-part.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest-test-part.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-typed-test.h b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest-typed-test.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-typed-test.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest-typed-test.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest.h b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest_pred_impl.h b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest_pred_impl.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest_pred_impl.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest_pred_impl.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest_prod.h b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest_prod.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest_prod.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest_prod.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/custom/README.md b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/custom/README.md similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/custom/README.md rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/custom/README.md diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest-port.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest-port.h 
similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest-port.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest-port.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest-printers.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest-printers.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest-printers.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest-printers.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-death-test-internal.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-death-test-internal.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-death-test-internal.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-death-test-internal.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-filepath.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-filepath.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-filepath.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-filepath.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-internal.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-internal.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-internal.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-internal.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-linked_ptr.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-linked_ptr.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-linked_ptr.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-linked_ptr.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util-generated.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util-generated.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util-generated.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util-generated.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util-generated.h.pump b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util-generated.h.pump similarity index 100% rename from 
tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util-generated.h.pump rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util-generated.h.pump diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-port-arch.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-port-arch.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-port-arch.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-port-arch.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-port.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-port.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-port.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-port.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-string.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-string.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-string.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-string.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-tuple.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-tuple.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-tuple.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-tuple.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-tuple.h.pump b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-tuple.h.pump similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-tuple.h.pump rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-tuple.h.pump diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-type-util.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-type-util.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-type-util.h rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-type-util.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-type-util.h.pump b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-type-util.h.pump similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-type-util.h.pump rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-type-util.h.pump diff --git 
a/tests/functional/googletest-release-1.8.1/googletest/src/gtest-all.cc b/tests/googletest-release-1.8.1/googletest/src/gtest-all.cc similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/src/gtest-all.cc rename to tests/googletest-release-1.8.1/googletest/src/gtest-all.cc diff --git a/tests/functional/googletest-release-1.8.1/googletest/src/gtest-death-test.cc b/tests/googletest-release-1.8.1/googletest/src/gtest-death-test.cc similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/src/gtest-death-test.cc rename to tests/googletest-release-1.8.1/googletest/src/gtest-death-test.cc diff --git a/tests/functional/googletest-release-1.8.1/googletest/src/gtest-filepath.cc b/tests/googletest-release-1.8.1/googletest/src/gtest-filepath.cc similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/src/gtest-filepath.cc rename to tests/googletest-release-1.8.1/googletest/src/gtest-filepath.cc diff --git a/tests/functional/googletest-release-1.8.1/googletest/src/gtest-internal-inl.h b/tests/googletest-release-1.8.1/googletest/src/gtest-internal-inl.h similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/src/gtest-internal-inl.h rename to tests/googletest-release-1.8.1/googletest/src/gtest-internal-inl.h diff --git a/tests/functional/googletest-release-1.8.1/googletest/src/gtest-port.cc b/tests/googletest-release-1.8.1/googletest/src/gtest-port.cc similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/src/gtest-port.cc rename to tests/googletest-release-1.8.1/googletest/src/gtest-port.cc diff --git a/tests/functional/googletest-release-1.8.1/googletest/src/gtest-printers.cc b/tests/googletest-release-1.8.1/googletest/src/gtest-printers.cc similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/src/gtest-printers.cc rename to tests/googletest-release-1.8.1/googletest/src/gtest-printers.cc diff --git a/tests/functional/googletest-release-1.8.1/googletest/src/gtest-test-part.cc b/tests/googletest-release-1.8.1/googletest/src/gtest-test-part.cc similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/src/gtest-test-part.cc rename to tests/googletest-release-1.8.1/googletest/src/gtest-test-part.cc diff --git a/tests/functional/googletest-release-1.8.1/googletest/src/gtest-typed-test.cc b/tests/googletest-release-1.8.1/googletest/src/gtest-typed-test.cc similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/src/gtest-typed-test.cc rename to tests/googletest-release-1.8.1/googletest/src/gtest-typed-test.cc diff --git a/tests/functional/googletest-release-1.8.1/googletest/src/gtest.cc b/tests/googletest-release-1.8.1/googletest/src/gtest.cc similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/src/gtest.cc rename to tests/googletest-release-1.8.1/googletest/src/gtest.cc diff --git a/tests/functional/googletest-release-1.8.1/googletest/src/gtest_main.cc b/tests/googletest-release-1.8.1/googletest/src/gtest_main.cc similarity index 100% rename from tests/functional/googletest-release-1.8.1/googletest/src/gtest_main.cc rename to tests/googletest-release-1.8.1/googletest/src/gtest_main.cc diff --git a/third-party-programs.txt b/third-party-programs.txt index 606400830..274a06f11 100644 --- a/third-party-programs.txt +++ b/third-party-programs.txt @@ -1,5 +1,5 @@ Intel(R) oneAPI Collective Communications Library (oneCCL) -2021.2.0 Third 
Party Programs File +2021.3.0 Third Party Programs File This file is the "third-party-programs.txt" file specified in the associated Intel end user license agreement for the Intel software you are licensing. @@ -209,7 +209,56 @@ Software. ------------------------------------------------------------------------------- -5. Googletest +5. The Portable Hardware Locality (hwloc) + + Copyright © 2004-2006 The Trustees of Indiana University and Indiana University Research and Technology Corporation. All rights reserved. + Copyright © 2004-2005 The University of Tennessee and The University of Tennessee Research Foundation. All rights reserved. + Copyright © 2004-2005 High Performance Computing Center Stuttgart, University of Stuttgart. All rights reserved. + Copyright © 2004-2005 The Regents of the University of California. All rights reserved. + Copyright © 2009 CNRS + Copyright © 2009-2016 Inria. All rights reserved. + Copyright © 2009-2015 Université Bordeaux + Copyright © 2009-2015 Cisco Systems, Inc. All rights reserved. + Copyright © 2009-2012 Oracle and/or its affiliates. All rights reserved. + Copyright © 2010 IBM + Copyright © 2010 Jirka Hladky + Copyright © 2012 Aleksej Saushev, The NetBSD Foundation + Copyright © 2012 Blue Brain Project, EPFL. All rights reserved. + Copyright © 2013-2014 University of Wisconsin-La Crosse. All rights reserved. + Copyright © 2015 Research Organization for Information Science and Technology (RIST). All rights reserved. + Copyright © 2015-2016 Intel, Inc. All rights reserved. + See COPYING in top-level directory. + + The 3-Clause BSD License + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + - The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------------------------------------------------------------------- + +6. Googletest Copyright 2008, Google Inc. All rights reserved.