diff --git a/CMakeLists.txt b/CMakeLists.txt
index ac4bde264..a93e6463e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,8 +29,8 @@ include(${PROJECT_SOURCE_DIR}/cmake/helpers.cmake)
 
 check_compiler_version()
 
-#set default build types.
-#Available build types are: Debug, Release, RelWithDebInfo and MinSizeRel
+#set default build type
+#available build types are: Debug, Release, RelWithDebInfo and MinSizeRel
 if (NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE "Release")
 endif()
@@ -46,7 +46,8 @@ endif()
 
 option(BUILD_EXAMPLES "Build examples" TRUE)
 option(BUILD_FT "Build functional tests" TRUE)
-option(BUILD_UT "Build unit tests" TRUE)
+option(BUILD_UT "Build unit tests" FALSE)
+option(BUILD_CONFIG "Build cmake configs" TRUE)
 
 option(USE_CODECOV_FLAGS "Calculate code coverage" FALSE)
 option(WITH_ASAN "Use address sanitizer, can only be used in Debug build" FALSE)
@@ -66,6 +67,7 @@ message(STATUS "CXX compiler : ${CMAKE_CXX_COMPILER}")
 message(STATUS "Build examples: ${BUILD_EXAMPLES}")
 message(STATUS "Build functional tests: ${BUILD_FT}")
 message(STATUS "Build unit tests: ${BUILD_UT}")
+message(STATUS "Build cmake configs: ${BUILD_CONFIG}")
 
 add_definitions(-DCCL_C_COMPILER="${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
 add_definitions(-DCCL_CXX_COMPILER="${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
@@ -89,17 +91,33 @@ set(CCL_INSTALL_KERNELS "${CMAKE_INSTALL_PREFIX}/lib/kernels")
 
 set(CCL_UNIT_TESTS_BUILD "${CMAKE_BINARY_DIR}/tests/unit")
 
-set(MPI_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/mpi/include/")
-set(MPI_LIB_DIR "${PROJECT_SOURCE_DIR}/mpi/lib/")
+
+# setup dependency directories
+
+set(DEPS_DIR "${PROJECT_SOURCE_DIR}/deps")
+
+set(MPI_INCLUDE_DIR "${DEPS_DIR}/mpi/include/")
+set(MPI_LIB_DIR "${DEPS_DIR}/mpi/lib/")
 if ( "${LIBFABRIC_DIR}" STREQUAL "")
-    set(LIBFABRIC_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/ofi/include")
-    set(LIBFABRIC_LIB_DIR "${PROJECT_SOURCE_DIR}/ofi/lib/")
+    set(LIBFABRIC_INCLUDE_DIR "${DEPS_DIR}/ofi/include")
+    set(LIBFABRIC_LIB_DIR "${DEPS_DIR}/ofi/lib/")
 else()
     set(LIBFABRIC_INCLUDE_DIR "${LIBFABRIC_DIR}/include/")
     set(LIBFABRIC_LIB_DIR "${LIBFABRIC_DIR}/lib")
 endif()
+set(HWLOC_INCLUDE_DIR "${DEPS_DIR}/hwloc/include/")
+set(HWLOC_LIB_DIR "${DEPS_DIR}/hwloc/lib/")
+
+message(STATUS "MPI_INCLUDE_DIR: ${MPI_INCLUDE_DIR}")
+message(STATUS "MPI_LIB_DIR: ${MPI_LIB_DIR}")
+message(STATUS "LIBFABRIC_LIB_DIR: ${LIBFABRIC_LIB_DIR}")
+message(STATUS "LIBFABRIC_INCLUDE_DIR: ${LIBFABRIC_INCLUDE_DIR}")
+message(STATUS "HWLOC_INCLUDE_DIR: ${HWLOC_INCLUDE_DIR}")
+message(STATUS "HWLOC_LIB_DIR: ${HWLOC_LIB_DIR}")
+
 include_directories(${MPI_INCLUDE_DIR})
 include_directories(${LIBFABRIC_INCLUDE_DIR})
+
 link_directories(${MPI_LIB_DIR})
 link_directories(${LIBFABRIC_LIB_DIR})
 
@@ -114,26 +132,31 @@ if (${CMAKE_VERSION} VERSION_LESS 3.1)
     set(C_COMPILER_FLAGS "-std=gnu99")
 endif()
 
+# special flags for CCL library only
+set(SRC_C_FLAGS "")
+set(SRC_CXX_FLAGS "")
+set(SRC_SHARED_LINKER_FLAGS "")
+
 #common settings of security options
 if(USE_SECURITY_FLAGS)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wformat -Wformat-security -D_FORTIFY_SOURCE=2 -fstack-protector")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wformat -Wformat-security -D_FORTIFY_SOURCE=2 -fstack-protector")
-    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fPIE -fPIC -z noexecstack -z relro -z now")
+    set(SRC_C_FLAGS "${SRC_C_FLAGS} -Wformat -Wformat-security -D_FORTIFY_SOURCE=2 -fstack-protector")
+    set(SRC_CXX_FLAGS "${SRC_CXX_FLAGS} -Wformat -Wformat-security -D_FORTIFY_SOURCE=2 -fstack-protector")
+    set(SRC_SHARED_LINKER_FLAGS "${SRC_SHARED_LINKER_FLAGS} -fPIE -fPIC -z noexecstack -z relro -z now")
     if(${CMAKE_C_COMPILER_ID} STREQUAL "GNU" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
         if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fstack-protector-strong")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector-strong")
+            set(SRC_C_FLAGS "${SRC_C_FLAGS} -fstack-protector-strong")
+            set(SRC_CXX_FLAGS "${SRC_CXX_FLAGS} -fstack-protector-strong")
         endif()
     endif()
 endif()
 
-set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--version-script=${PROJECT_SOURCE_DIR}/ccl.map")
+set(SRC_SHARED_LINKER_FLAGS "${SRC_SHARED_LINKER_FLAGS} -Wl,--version-script=${PROJECT_SOURCE_DIR}/ccl.map")
 
 if(${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR ${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel")
-	if (USE_CODECOV_FLAGS)
-		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -prof-gen=srcpos -prof-src-root-cwd")
-		set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -prof-gen=srcpos -prof-src-root-cwd")
-	endif()
+    if (USE_CODECOV_FLAGS)
+        set(SRC_C_FLAGS "${SRC_C_FLAGS} -prof-gen=srcpos -prof-src-root-cwd")
+        set(SRC_CXX_FLAGS "${SRC_CXX_FLAGS} -prof-gen=srcpos -prof-src-root-cwd")
+    endif()
 endif()
 
 #TODO: add -Wextra to c/cxx flags
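
The SRC_C_FLAGS/SRC_CXX_FLAGS/SRC_SHARED_LINKER_FLAGS variables above deliberately replace the global CMAKE_* flags so the security and coverage options stop leaking into tests and examples. A minimal sketch of how they could be consumed, assuming the library target in src/ is named ccl (the actual consumption site is outside this hunk):

# split the space-separated flag strings into proper CMake lists
separate_arguments(src_c_opts UNIX_COMMAND "${SRC_C_FLAGS}")
separate_arguments(src_cxx_opts UNIX_COMMAND "${SRC_CXX_FLAGS}")
# apply them to the CCL library only, per language
target_compile_options(ccl PRIVATE
    $<$<COMPILE_LANGUAGE:C>:${src_c_opts}>
    $<$<COMPILE_LANGUAGE:CXX>:${src_cxx_opts}>)
set_target_properties(ccl PROPERTIES LINK_FLAGS "${SRC_SHARED_LINKER_FLAGS}")
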
@@ -155,31 +178,10 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
 set(TRY_ENABLE_SYCL_L0 ON)
 
+set(COMMON_CMAKE_DIR ${PROJECT_SOURCE_DIR}/cmake)
 if (COMPUTE_BACKEND)
-    activate_compute_backend("${CMAKE_CURRENT_LIST_DIR}/cmake" ${COMPUTE_BACKEND})
-    if (NOT COMPUTE_BACKEND_TARGET_NAME)
-        message(FATAL_ERROR "Failed to find requested compute runtime: ${COMPUTE_BACKEND}")
-    endif()
-    message(STATUS "COMPUTE_BACKEND_TARGET_NAME: ${COMPUTE_BACKEND_TARGET_NAME}")
-
-	if (${COMPUTE_BACKEND_TARGET_NAME} STREQUAL "Intel::SYCL" OR ${COMPUTE_BACKEND_TARGET_NAME} STREQUAL "Intel::SYCL_level_zero")
-		option (CCL_ENABLE_SYCL "Enable CCL SYCL runtime" ON)
-		message(STATUS "Enable CCL SYCL runtime")
-		execute_process(COMMAND dpcpp -v
-			OUTPUT_VARIABLE DPCPP_VERSION
-			ERROR_VARIABLE DPCPP_VERSION
-			OUTPUT_STRIP_TRAILING_WHITESPACE
-			ERROR_STRIP_TRAILING_WHITESPACE
-		)
-		message(STATUS "DPC++ compiler version:\n" "${DPCPP_VERSION}")
-	endif()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMPUTE_BACKEND_FLAGS}")
-    if (${COMPUTE_BACKEND_TARGET_NAME} STREQUAL "Intel::SYCL_level_zero" OR ${COMPUTE_BACKEND_TARGET_NAME} STREQUAL "ze_loader")
-        set(MULTI_GPU_SUPPORT ON)
-    endif()
-    if (MULTI_GPU_SUPPORT)
-        message(STATUS "Enable multi GPU support using L0")
-    endif()
+    message(STATUS "COMPUTE_BACKEND: ${COMPUTE_BACKEND}")
+    set_compute_backend(${COMMON_CMAKE_DIR})
 endif()
 
 if(${CMAKE_C_COMPILER_ID} STREQUAL "GNU" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
@@ -189,6 +191,15 @@ if(${CMAKE_C_COMPILER_ID} STREQUAL "GNU" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "
     endif()
 endif()
 
+# Clang doesn't automatically detect Ninja processes as supporting colored output
+# due to the way they are spawned. To work around the issue we pass the option
+# that forces colored output
+if(${CMAKE_GENERATOR} STREQUAL "Ninja")
+    if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
+        add_compile_options(-fcolor-diagnostics)
+    endif()
+endif()
+
 if(WITH_ASAN AND ${CMAKE_BUILD_TYPE_CASE_INSENSITIVE} STREQUAL "debug")
     message(STATUS "Compiling with address sanitizer")
     set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address -fno-omit-frame-pointer")
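
On CMake 3.15+ the Clang/Ninja workaround above could be collapsed into a single generator expression with no generator or compiler checks; a sketch (the explicit form is kept since this project still supports older CMake):

add_compile_options(
    $<$<COMPILE_LANG_AND_ID:C,Clang>:-fcolor-diagnostics>
    $<$<COMPILE_LANG_AND_ID:CXX,Clang>:-fcolor-diagnostics>)
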
@@ -203,7 +214,7 @@ set(CCL_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/src)
 enable_testing()
 
 set(EXTERNAL_LIBS "")
-set(EXAMPLES_INC_DIRS ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/examples/include ${PROJECT_SOURCE_DIR}/mpi/include)
+set(EXAMPLES_INC_DIRS ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/examples/include ${MPI_INCLUDE_DIR})
 
 # allow `deprecated`
 set(CMAKE_CLANG_FLAGS "${CMAKE_CLANG_FLAGS}")
@@ -223,12 +234,14 @@ install(PROGRAMS ${PROJECT_SOURCE_DIR}/LICENSE DESTINATION ${CCL_INSTALL_LICENSE
 
 # copy kernels
 if(COMPUTE_BACKEND AND EXISTS "${PROJECT_SOURCE_DIR}/src/kernels")
-    file(GLOB spv_kernels "${PROJECT_SOURCE_DIR}/src/kernels/ring_*.spv")
-    install(PROGRAMS ${spv_kernels} DESTINATION ${CCL_INSTALL_KERNELS})
+    file(GLOB spv_kernels "${PROJECT_SOURCE_DIR}/src/kernels/ring_*.spv")
+    install(PROGRAMS ${spv_kernels}
+            DESTINATION ${CCL_INSTALL_KERNELS}
+            PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ)
 endif()
 
 set(CCL_MAJOR_VERSION     "2021")
-set(CCL_MINOR_VERSION     "2")
+set(CCL_MINOR_VERSION     "3")
 set(CCL_UPDATE_VERSION    "0")
 set(CCL_PRODUCT_STATUS    "Gold")
 string(TIMESTAMP CCL_PRODUCT_BUILD_DATE "%Y-%m-%dT %H:%M:%SZ")
@@ -238,16 +251,16 @@ configure_file(${PROJECT_SOURCE_DIR}/include/oneapi/ccl/config.h.in "${CMAKE_CUR
 file(COPY "${CMAKE_CURRENT_BINARY_DIR}/include/oneapi/ccl/config.h" DESTINATION ${PROJECT_SOURCE_DIR}/include/oneapi/ccl)
 include_directories(${CMAKE_CURRENT_BINARY_DIR}/include)
 
-#generate oneCCLConfig*.cmake
-configure_file("cmake/templates/oneCCLConfig.cmake.in"
-               "${CCL_INSTALL_LIB}/cmake/oneCCL/oneCCLConfig.cmake"
-               COPYONLY)
-
 set(PROJECT_VERSION "${CCL_MAJOR_VERSION}.${CCL_MINOR_VERSION}.${CCL_UPDATE_VERSION}")
 
-configure_file("cmake/templates/oneCCLConfigVersion.cmake.in"
-               "${CCL_INSTALL_LIB}/cmake/oneCCL/oneCCLConfigVersion.cmake"
-               @ONLY)
+if (BUILD_CONFIG)
+    configure_file("cmake/templates/oneCCLConfig.cmake.in"
+                   "${CCL_INSTALL_LIB}/cmake/oneCCL/oneCCLConfig.cmake"
+                   COPYONLY)
+    configure_file("cmake/templates/oneCCLConfigVersion.cmake.in"
+                   "${CCL_INSTALL_LIB}/cmake/oneCCL/oneCCLConfigVersion.cmake"
+                   @ONLY)
+endif()
 
 #include other CMakeLists
 
@@ -267,6 +280,6 @@ if (BUILD_FT)
     add_subdirectory(tests/functional)
 endif()
 
-if (BUILD_UT)
-	#add_subdirectory(tests/unit)
+if (BUILD_UT AND EXISTS "${PROJECT_SOURCE_DIR}/tests/unit")
+    add_subdirectory(tests/unit)
 endif()
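
With BUILD_CONFIG enabled, the generated oneCCLConfig*.cmake files let downstream projects resolve the library through find_package(). A hypothetical consumer, assuming CMAKE_PREFIX_PATH (or oneCCL_DIR) points at the install tree:

cmake_minimum_required(VERSION 3.1)
project(ccl_consumer CXX)

# resolved via <install>/lib/cmake/oneCCL/oneCCLConfig.cmake
find_package(oneCCL REQUIRED)

add_executable(app main.cpp)
target_link_libraries(app oneCCL)
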
diff --git a/cmake/FindNUMA.cmake b/cmake/FindNUMA.cmake
new file mode 100644
index 000000000..9c860354e
--- /dev/null
+++ b/cmake/FindNUMA.cmake
@@ -0,0 +1,22 @@
+# Find the NUMA library and includes
+#
+# NUMA_INCLUDE_DIR - where to find numa.h
+# NUMA_LIBRARIES - list of libraries when using NUMA
+# NUMA_FOUND - true if NUMA found
+
+find_path(NUMA_INCLUDE_DIR
+  NAMES numa.h numaif.h
+  HINTS ${NUMA_ROOT_DIR}/include)
+
+find_library(NUMA_LIBRARIES
+  NAMES numa
+  HINTS ${NUMA_ROOT_DIR}/lib)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(NUMA DEFAULT_MSG NUMA_LIBRARIES NUMA_INCLUDE_DIR)
+
+if (NUMA_FOUND)
+    message(STATUS "NUMA was found, include_dir: ${NUMA_INCLUDE_DIR}, libraries: ${NUMA_LIBRARIES}")
+else()
+    message(STATUS "NUMA was not found")
+endif()
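
A sketch of how this find module would be consumed, assuming the caller appends the project's cmake/ directory to CMAKE_MODULE_PATH (nothing in this patch wires that up yet):

list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
find_package(NUMA)
if (NUMA_FOUND)
    include_directories(${NUMA_INCLUDE_DIR})
    # EXTERNAL_LIBS is the aggregation list already used by the top-level CMakeLists
    list(APPEND EXTERNAL_LIBS ${NUMA_LIBRARIES})
endif()
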
diff --git a/cmake/ccl b/cmake/ccl
index c96106735..474562428 100644
--- a/cmake/ccl
+++ b/cmake/ccl
@@ -34,14 +34,28 @@ set componentname "[file tail "$componentroot"]"
 # get oneAPI top-level root folder
 set oneapiroot "[file dirname "$componentroot"]"
 
+# disallow loading multiple versions of this modulefile
+# and disallow loading multiple architectures of this modulefile;
+# the check also works if only the 64-bit architecture exists
+set mname32 $modulefilename
+set mname64 [string trimright $mname32 "32"]
+if { [string equal "$mname32" "$mname64"] } {
+    append mname32 "32"
+}
+conflict $mname32
+conflict $mname64
+
+
 # On load print component name and version being loaded
 if { [ module-info mode load ] } {
     puts stderr "Loading $modulefilename"
 }
 
-# On remove print component name and version being removed
+# On `module unload` print component module name and version being removed
+# Include `module list` message only if this modulefile loads dependent modules
 if { [ module-info mode ] == "unload" || [ module-info mode ] == "remove" } {
     puts stderr "Removing $modulefilename"
+    puts stderr "Use `module list` to view any remaining dependent modules."
 }
 
 
diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake
index aa85955e5..b24acc18b 100644
--- a/cmake/helpers.cmake
+++ b/cmake/helpers.cmake
@@ -6,7 +6,7 @@ function(set_lp_env)
     set(GCC_BF16_AVX512BF_MIN_SUPPORTED "10.0.0")
     set(ICC_BF16_AVX512BF_MIN_SUPPORTED "19.1.0")
     set(CLANG_BF16_MIN_SUPPORTED "9.0.0")
-    set(CLANG_BF16_AVX512BF_MIN_SUPPORTED "10.0.0")
+    set(CLANG_BF16_AVX512BF_MIN_SUPPORTED "9.3.0")
 
     if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel"
         OR (${CMAKE_C_COMPILER_ID} STREQUAL "Clang"
@@ -16,11 +16,10 @@ function(set_lp_env)
         )
         add_definitions(-DCCL_BF16_COMPILER)
         set(CCL_BF16_COMPILER ON)
-        message(STATUS "BF16 compiler: yes")
     else()
         set(CCL_BF16_COMPILER OFF)
-        message(STATUS "BF16 compiler: no")
     endif()
+    message(STATUS "BF16 compiler: ${CCL_BF16_COMPILER}")
 
     if ((${CMAKE_C_COMPILER_ID} STREQUAL "Intel"
             AND NOT ${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${ICC_BF16_AVX512BF_MIN_SUPPORTED})
@@ -30,21 +29,27 @@ function(set_lp_env)
             AND NOT ${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${GCC_BF16_AVX512BF_MIN_SUPPORTED})
         )
         add_definitions(-DCCL_BF16_AVX512BF_COMPILER)
-        message(STATUS "BF16 AVX512BF compiler: yes")
+        set(CCL_BF16_AVX512BF_COMPILER ON)
     else()
-        message(STATUS "BF16 AVX512BF compiler: no")
+        set(CCL_BF16_AVX512BF_COMPILER OFF)
     endif()
+    message(STATUS "BF16 AVX512BF compiler: ${CCL_BF16_AVX512BF_COMPILER}")
 
     if (CCL_BF16_COMPILER)
         if ((${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "GNU"))
             add_definitions(-DCCL_BF16_TARGET_ATTRIBUTES)
-            message(STATUS "BF16 target attributes: yes")
+            set(CCL_BF16_TARGET_ATTRIBUTES ON)
         else()
-            message(STATUS "BF16 target attributes: no")
+            set(CCL_BF16_TARGET_ATTRIBUTES OFF)
         endif()
+        message(STATUS "BF16 target attributes: ${CCL_BF16_TARGET_ATTRIBUTES}")
     endif()
 
-    set(CCL_GPU_BF16_TRUNCATE ON PARENT_SCOPE)
+    option(CCL_BF16_GPU_TRUNCATE "Truncate BF16 in GPU operations" ON)
+    if (CCL_BF16_GPU_TRUNCATE)
+        add_definitions(-DCCL_BF16_GPU_TRUNCATE)
+    endif()
+    message(STATUS "BF16 GPU truncate: ${CCL_BF16_GPU_TRUNCATE}")
 
 
     set(GCC_FP16_MIN_SUPPORTED "4.9.0")
@@ -58,30 +63,36 @@ function(set_lp_env)
         )
         add_definitions(-DCCL_FP16_COMPILER)
         set(CCL_FP16_COMPILER ON)
-        message(STATUS "FP16 compiler: yes")
     else()
         set(CCL_FP16_COMPILER OFF)
-        message(STATUS "FP16 compiler: no")
     endif()
+    message(STATUS "FP16 compiler: ${CCL_FP16_COMPILER}")
 
     if (CCL_FP16_COMPILER)
         if ((${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "GNU"))
             add_definitions(-DCCL_FP16_TARGET_ATTRIBUTES)
-            message(STATUS "FP16 target attributes: yes")
+            set(CCL_FP16_TARGET_ATTRIBUTES ON)
         else()
-            message(STATUS "FP16 target attributes: no")
+            set(CCL_FP16_TARGET_ATTRIBUTES OFF)
         endif()
+        message(STATUS "FP16 target attributes: ${CCL_FP16_TARGET_ATTRIBUTES}")
+    endif()
+
+    option(CCL_FP16_GPU_TRUNCATE "Truncate FP16 in GPU operations" OFF)
+    if (CCL_FP16_GPU_TRUNCATE)
+        add_definitions(-DCCL_FP16_GPU_TRUNCATE)
     endif()
+    message(STATUS "FP16 GPU truncate: ${CCL_FP16_GPU_TRUNCATE}")
 
     set(LP_ENV_DEFINED 1 PARENT_SCOPE)
 
 endfunction(set_lp_env)
 
-
 function(check_compiler_version)
 
     set(GCC_MIN_SUPPORTED "4.8")
     set(ICC_MIN_SUPPORTED "15.0")
+    set(CLANG_MIN_SUPPORTED "9.0")
 
     if(${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
         if(${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${GCC_MIN_SUPPORTED})
@@ -91,6 +102,10 @@ function(check_compiler_version)
         if(${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${ICC_MIN_SUPPORTED})
             message(FATAL_ERROR "icc min supported version is ${ICC_MIN_SUPPORTED}, current version ${CMAKE_C_COMPILER_VERSION}")
         endif()
+    elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
+        if(${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${CLANG_MIN_SUPPORTED})
+            message(FATAL_ERROR "clang min supported version is ${CLANG_MIN_SUPPORTED}, current version ${CMAKE_C_COMPILER_VERSION}")
+        endif()
     else()
         message(WARNING "Compilation with ${CMAKE_C_COMPILER_ID} was not tested, no warranty")
     endif()
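
The three compiler branches repeat a single version-gate pattern; a data-driven equivalent (a sketch only, not part of the patch) would keep the minimums in one place:

set(MIN_VERSION_GNU   "4.8")
set(MIN_VERSION_Intel "15.0")
set(MIN_VERSION_Clang "9.0")
if (DEFINED MIN_VERSION_${CMAKE_C_COMPILER_ID})
    if (${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${MIN_VERSION_${CMAKE_C_COMPILER_ID}})
        message(FATAL_ERROR "${CMAKE_C_COMPILER_ID} min supported version is "
                "${MIN_VERSION_${CMAKE_C_COMPILER_ID}}, current version ${CMAKE_C_COMPILER_VERSION}")
    endif()
else()
    message(WARNING "Compilation with ${CMAKE_C_COMPILER_ID} was not tested, no warranty")
endif()
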
@@ -121,7 +136,6 @@ endfunction(get_vcs_properties)
 
 
 function(activate_compute_backend MODULES_PATH COMPUTE_BACKEND)
-
     string( TOLOWER "${COMPUTE_BACKEND}" COMPUTE_BACKEND)
 
     set(CCL_ENABLE_SYCL_V 0 PARENT_SCOPE)
@@ -176,7 +190,7 @@ function(activate_compute_backend MODULES_PATH COMPUTE_BACKEND)
         # remember current target for `target_link_libraries` in ccl
         set (COMPUTE_BACKEND_TARGET_NAME Intel::SYCL_level_zero)
         set (COMPUTE_BACKEND_TARGET_NAME Intel::SYCL_level_zero PARENT_SCOPE)
-        message ("___COMPUTE_BACKEND_TARGET_NAME=${COMPUTE_BACKEND_TARGET_NAME} requested. Using DPC++ provider")
+        message ("COMPUTE_BACKEND_TARGET_NAME=${COMPUTE_BACKEND_TARGET_NAME} requested. Using DPC++ provider")
 
     elseif(COMPUTE_BACKEND STREQUAL "level_zero")
         SET (COMPUTE_BACKEND_LOAD_MODULE "level_zero"
@@ -219,7 +233,7 @@ function(activate_compute_backend MODULES_PATH COMPUTE_BACKEND)
         set (COMPUTE_BACKEND_TARGET_NAME Intel::SYCL)
         set (COMPUTE_BACKEND_TARGET_NAME Intel::SYCL PARENT_SCOPE)
     # elseif(COMPUTE_BACKEND STREQUAL "host")
-        # message ("COMPUTE_BACKEND=${COMPUTE_BACKEND} requested.")	
+        # message ("COMPUTE_BACKEND=${COMPUTE_BACKEND} requested.")
     # else()
          # message(FATAL_ERROR "Please provide one of the following compute runtime: dpcpp, level_zero, dpcpp_level_zero, host")
     endif()
@@ -230,6 +244,14 @@ function(activate_compute_backend MODULES_PATH COMPUTE_BACKEND)
     get_target_property(COMPUTE_BACKEND_LIBRARIES_LOCAL
                         ${COMPUTE_BACKEND_TARGET_NAME} INTERFACE_LINK_LIBRARIES)
 
+    # When using the dpcpp compiler (dpcpp/dpcpp_level_zero backends), use C++17 to be aligned with the compiler
+    if (${COMPUTE_BACKEND_TARGET_NAME} MATCHES "^Intel::SYCL.*")
+        set(CMAKE_CXX_STANDARD 17 PARENT_SCOPE)
+    # and use C++11 in all other cases
+    else()
+        set(CMAKE_CXX_STANDARD 11 PARENT_SCOPE)
+    endif()
+
     # set output variables in the parent scope:
     # Only `COMPUTE_BACKEND_FLAGS` is actually required, because  the other flags are derived from
     # 'target_link_libraries'.
@@ -239,3 +261,47 @@
     set(COMPUTE_BACKEND_INCLUDE_DIRS ${COMPUTE_BACKEND_INCLUDE_DIRS_LOCAL}  PARENT_SCOPE)
 
 endfunction(activate_compute_backend)
+
+function(set_compute_backend COMMON_CMAKE_DIR)
+    activate_compute_backend("${COMMON_CMAKE_DIR}" ${COMPUTE_BACKEND})
+
+    # When using the dpcpp compiler (dpcpp/dpcpp_level_zero backends), use C++17 to be aligned with the compiler.
+    # Although the same thing is done in activate_compute_backend, we need to set the variable here as
+    # well because both set_compute_backend and activate_compute_backend can be called directly
+    if (${COMPUTE_BACKEND_TARGET_NAME} MATCHES "^Intel::SYCL.*")
+        set(CMAKE_CXX_STANDARD 17 PARENT_SCOPE)
+    # and use C++11 in all other cases
+    else()
+        set(CMAKE_CXX_STANDARD 11 PARENT_SCOPE)
+    endif()
+
+    if (NOT COMPUTE_BACKEND_TARGET_NAME)
+        message(FATAL_ERROR "Failed to find requested compute runtime: ${COMPUTE_BACKEND} in ${COMMON_CMAKE_DIR}")
+    endif()
+    message(STATUS "COMPUTE_BACKEND_TARGET_NAME: ${COMPUTE_BACKEND_TARGET_NAME}")
+
+    if (${COMPUTE_BACKEND_TARGET_NAME} STREQUAL "Intel::SYCL" OR ${COMPUTE_BACKEND_TARGET_NAME} STREQUAL "Intel::SYCL_level_zero")
+        option (CCL_ENABLE_SYCL "Enable CCL SYCL runtime" ON)
+        message(STATUS "Enable CCL SYCL runtime")
+        execute_process(COMMAND dpcpp -v
+            OUTPUT_VARIABLE DPCPP_VERSION
+            ERROR_VARIABLE DPCPP_VERSION
+            OUTPUT_STRIP_TRAILING_WHITESPACE
+            ERROR_STRIP_TRAILING_WHITESPACE
+        )
+        message(STATUS "DPC++ compiler version:\n" "${DPCPP_VERSION}")
+    endif()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMPUTE_BACKEND_FLAGS}")
+    if (${COMPUTE_BACKEND_TARGET_NAME} STREQUAL "Intel::SYCL_level_zero" OR ${COMPUTE_BACKEND_TARGET_NAME} STREQUAL "ze_loader")
+        set(MULTI_GPU_SUPPORT ON PARENT_SCOPE)
+        set(MULTI_GPU_SUPPORT ON)
+    endif()
+    if (MULTI_GPU_SUPPORT)
+        message(STATUS "Enable multi GPU support using L0")
+    endif()
+
+    # propagate these variables to the calling scope
+    set (COMPUTE_BACKEND_TARGET_NAME ${COMPUTE_BACKEND_TARGET_NAME} PARENT_SCOPE)
+    set (COMPUTE_BACKEND_FLAGS ${COMPUTE_BACKEND_FLAGS} PARENT_SCOPE)
+    set (COMPUTE_BACKEND_LIBRARIES ${COMPUTE_BACKEND_LIBRARIES} PARENT_SCOPE)
+endfunction(set_compute_backend)
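
set_compute_backend() sets MULTI_GPU_SUPPORT twice on purpose: set(... PARENT_SCOPE) changes the variable in the caller's scope only, so without the plain set() the if (MULTI_GPU_SUPPORT) check right below would see an empty value. A minimal repro of that CMake scoping rule:

function(demo)
    set(FLAG ON PARENT_SCOPE)           # visible to the caller only
    message(STATUS "inside: '${FLAG}'") # prints '' -- FLAG is still unset here
    set(FLAG ON)                        # the second set makes it usable locally
    message(STATUS "inside: '${FLAG}'") # prints 'ON'
endfunction()
demo()
message(STATUS "outside: '${FLAG}'")    # prints 'ON'
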
diff --git a/cmake/setvars.sh.in b/cmake/setvars.sh.in
index 7a610a2f6..f061e4f9d 100644
--- a/cmake/setvars.sh.in
+++ b/cmake/setvars.sh.in
@@ -16,11 +16,8 @@
 #
 
 WORK_DIR="$(cd "$( dirname "${BASH_SOURCE[0]}" )" > /dev/null && pwd)"
-export CCL_ROOT="$(cd ${WORK_DIR}/../; pwd -P)"
-if [ -z "${I_MPI_ROOT}" ]
-then
-    export I_MPI_ROOT="${CCL_ROOT}"
-fi
+CCL_ROOT="$(cd ${WORK_DIR}/../; pwd -P)"
+export I_MPI_ROOT="${CCL_ROOT}"
 
 source ${CCL_ROOT}/env/vars.sh $1
 
@@ -31,4 +28,4 @@ else
     PATH="${CCL_ROOT}/bin:${PATH}"; export PATH
 fi
 
-FI_PROVIDER_PATH="${CCL_ROOT}/@CMAKE_INSTALL_LIBDIR@/prov"; export FI_PROVIDER_PATH
+FI_PROVIDER_PATH="${CCL_ROOT}/@CMAKE_INSTALL_LIBDIR@/prov:/usr/lib64/libfabric"; export FI_PROVIDER_PATH
diff --git a/cmake/templates/oneCCLConfig.cmake.in b/cmake/templates/oneCCLConfig.cmake.in
index dd2b988ee..86b7de9f8 100644
--- a/cmake/templates/oneCCLConfig.cmake.in
+++ b/cmake/templates/oneCCLConfig.cmake.in
@@ -1,4 +1,3 @@
-# Default installation path: <oneccl_root>/lib/cmake/oneCCL/
 #
 # Copyright 2016-2020 Intel Corporation
 # 
@@ -14,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
+# Default installation path: <oneccl_root>/lib/cmake/oneCCL/
 get_filename_component(_oneccl_root "${CMAKE_CURRENT_LIST_DIR}" REALPATH)
 get_filename_component(_oneccl_root "${_oneccl_root}/../../../" ABSOLUTE)
 
@@ -33,7 +32,7 @@ if (_oneccl_subdir EQUAL "cpu_icc")
 endif()
 
 get_filename_component(_oneccl_headers "${_oneccl_root}/include/${_oneccl_subdir}" ABSOLUTE)
-get_filename_component(_oneccl_lib "${_oneccl_root}/lib/${_oneccl_subdir}" ABSOLUTE)
+get_filename_component(_oneccl_lib "${_oneccl_root}/lib/${_oneccl_subdir}/libccl.so" ABSOLUTE)
 
 if (EXISTS "${_oneccl_headers}" AND EXISTS "${_oneccl_lib}")
     if (NOT TARGET oneCCL)
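
_oneccl_lib now resolves to the shared library file itself rather than its directory, which is what an IMPORTED target location needs. A sketch of how the remainder of the template presumably uses these paths (that part of the file is outside this hunk):

if (NOT TARGET oneCCL)
    add_library(oneCCL SHARED IMPORTED)
    set_target_properties(oneCCL PROPERTIES
        IMPORTED_LOCATION "${_oneccl_lib}"            # full path to libccl.so
        INTERFACE_INCLUDE_DIRECTORIES "${_oneccl_headers}")
endif()
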
diff --git a/cmake/templates/oneCCLConfigVersion.cmake.in b/cmake/templates/oneCCLConfigVersion.cmake.in
index 89d23cefd..d571a725c 100644
--- a/cmake/templates/oneCCLConfigVersion.cmake.in
+++ b/cmake/templates/oneCCLConfigVersion.cmake.in
@@ -1,4 +1,3 @@
-set(PACKAGE_VERSION @PROJECT_VERSION@)
 #
 # Copyright 2016-2020 Intel Corporation
 # 
@@ -14,13 +13,14 @@ set(PACKAGE_VERSION @PROJECT_VERSION@)
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
-if ("${PACKAGE_VERSION}" VERSION_LESS "${PACKAGE_FIND_VERSION}")
-    set(PACKAGE_VERSION_COMPATIBLE FALSE)
-else()
-    set(PACKAGE_VERSION_COMPATIBLE TRUE)
-    if ("${PACKAGE_VERSION}" VERSION_EQUAL "${PACKAGE_FIND_VERSION}")
-        set(PACKAGE_VERSION_EXACT TRUE)
-    endif()
-endif()
-
+set(PACKAGE_VERSION @PROJECT_VERSION@)
+
+if ("${PACKAGE_VERSION}" VERSION_LESS "${PACKAGE_FIND_VERSION}")
+    set(PACKAGE_VERSION_COMPATIBLE FALSE)
+else()
+    set(PACKAGE_VERSION_COMPATIBLE TRUE)
+    if ("${PACKAGE_VERSION}" VERSION_EQUAL "${PACKAGE_FIND_VERSION}")
+        set(PACKAGE_VERSION_EXACT TRUE)
+    endif()
+endif()
+
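
This is the standard version script consulted by find_package() whenever a caller requests a version; hypothetical lookups against the 2021.3.0 release built above:

find_package(oneCCL 2021.3)          # accepted: PACKAGE_VERSION_COMPATIBLE is TRUE
find_package(oneCCL 2021.4)          # rejected: requested version is newer than 2021.3.0
find_package(oneCCL 2021.3.0 EXACT)  # accepted: PACKAGE_VERSION_EXACT is TRUE
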
diff --git a/cmake/vars.sh.in b/cmake/vars.sh.in
index e48ec98ec..5e47752ed 100644
--- a/cmake/vars.sh.in
+++ b/cmake/vars.sh.in
@@ -52,6 +52,17 @@ if [ "$_vars_this_script_name" = "$(_vars_get_proc_name "$0")" ] ; then
     return 255 2>/dev/null || exit 255
 fi
 
+prepend_path() (
+  path_to_add="$1"
+  path_is_now="$2"
+
+  if [ "" = "${path_is_now}" ] ; then   # avoid dangling ":"
+    printf "%s" "${path_to_add}"
+  else
+    printf "%s" "${path_to_add}:${path_is_now}"
+  fi
+)
+
 vars_script_name=""
 vars_script_shell="$(ps -p "$$" -o comm=)"
 
@@ -87,6 +98,7 @@ if [ "" = "$vars_script_name" ] ; then
     >&2 echo "   ERROR: Unable to proceed: possible causes listed below."
     >&2 echo "   This script must be sourced. Did you execute or source this script?" ;
     >&2 echo "   Unrecognized/unsupported shell (supported: bash, zsh, ksh, m/lksh, dash)." ;
+    >&2 echo "   May fail in dash if you rename this script (assumes \"vars.sh\")." ;
     >&2 echo "   Can be caused by sourcing from ZSH version 4.x or older." ;
     return 255 2>/dev/null || exit 255
 fi
@@ -95,6 +107,6 @@ WORK_DIR=$(get_script_path "${vars_script_name:-}")
 
 CCL_ROOT="$(cd "${WORK_DIR}"/../; pwd -P)"; export CCL_ROOT
 
-CPATH="${CCL_ROOT}/include/${CPATH+:${CPATH}}"; export CPATH
-LIBRARY_PATH="${CCL_ROOT}/lib/${LIBRARY_PATH+:${LIBRARY_PATH}}"; export LIBRARY_PATH
-LD_LIBRARY_PATH="${CCL_ROOT}/lib/${LD_LIBRARY_PATH+:${LD_LIBRARY_PATH}}"; export LD_LIBRARY_PATH
+CPATH=$(prepend_path "${CCL_ROOT}/include" "${CPATH:-}"); export CPATH
+LIBRARY_PATH=$(prepend_path "${CCL_ROOT}/lib" "${LIBRARY_PATH:-}"); export LIBRARY_PATH
+LD_LIBRARY_PATH=$(prepend_path "${CCL_ROOT}/lib" "${LD_LIBRARY_PATH:-}"); export LD_LIBRARY_PATH
diff --git a/deps/hwloc/include/hwloc.h b/deps/hwloc/include/hwloc.h
new file mode 100644
index 000000000..6dd1159f7
--- /dev/null
+++ b/deps/hwloc/include/hwloc.h
@@ -0,0 +1,2479 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2021 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2020 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/*=====================================================================
+ *                 PLEASE GO READ THE DOCUMENTATION!
+ *         ------------------------------------------------
+ *               $tarball_directory/doc/doxygen-doc/
+ *                                or
+ *           https://www.open-mpi.org/projects/hwloc/doc/
+ *=====================================================================
+ *
+ * FAIR WARNING: Do NOT expect to be able to figure out all the
+ * subtleties of hwloc by simply reading function prototypes and
+ * constant descriptions here in this file.
+ *
+ * Hwloc has wonderful documentation in both PDF and HTML formats for
+ * your reading pleasure.  The formal documentation explains a LOT of
+ * hwloc-specific concepts, provides definitions, and discusses the
+ * "big picture" for many of the things that you'll find here in this
+ * header file.
+ *
+ * The PDF/HTML documentation was generated via Doxygen; much of what
+ * you'll see in there is also here in this file.  BUT THERE IS A LOT
+ * THAT IS IN THE PDF/HTML THAT IS ***NOT*** IN hwloc.h!
+ *
+ * There are entire paragraph-length descriptions, discussions, and
+ * pretty pictures to explain subtle corner cases, provide concrete
+ * examples, etc.
+ *
+ * Please, go read the documentation.  :-)
+ *
+ * Moreover there are several examples of hwloc use under doc/examples
+ * in the source tree.
+ *
+ *=====================================================================*/
+
+/** \file
+ * \brief The hwloc API.
+ *
+ * See hwloc/bitmap.h for bitmap specific macros.
+ * See hwloc/helper.h for high-level topology traversal helpers.
+ * See hwloc/inlines.h for the actual inline code of some functions below.
+ * See hwloc/export.h for exporting topologies to XML or to synthetic descriptions.
+ * See hwloc/distances.h for querying and modifying distances between objects.
+ * See hwloc/diff.h for manipulating differences between similar topologies.
+ */
+
+#ifndef HWLOC_H
+#define HWLOC_H
+
+#include "hwloc/autogen/config.h"
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+
+/*
+ * Symbol transforms
+ */
+#include "hwloc/rename.h"
+
+/*
+ * Bitmap definitions
+ */
+
+#include "hwloc/bitmap.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_api_version API version
+ * @{
+ */
+
+/** \brief Indicate at build time which hwloc API version is being used.
+ *
+ * This number is updated to (X<<16)+(Y<<8)+Z when a new release X.Y.Z
+ * actually modifies the API.
+ *
+ * Users may check for available features at build time using this number
+ * (see \ref faq_version_api).
+ *
+ * \note This should not be confused with HWLOC_VERSION, the library version.
+ * Two stable releases of the same series usually have the same ::HWLOC_API_VERSION
+ * even if their HWLOC_VERSION are different.
+ */
+#define HWLOC_API_VERSION 0x00020500
+
+/** \brief Indicate at runtime which hwloc API version was used at build time.
+ *
+ * Should be ::HWLOC_API_VERSION if running on the same version.
+ */
+HWLOC_DECLSPEC unsigned hwloc_get_api_version(void);
+
+/** \brief Current component and plugin ABI version (see hwloc/plugins.h) */
+#define HWLOC_COMPONENT_ABI 7
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_object_sets Object Sets (hwloc_cpuset_t and hwloc_nodeset_t)
+ *
+ * Hwloc uses bitmaps to represent two distinct kinds of object sets:
+ * CPU sets (::hwloc_cpuset_t) and NUMA node sets (::hwloc_nodeset_t).
+ * These types are both typedefs to a common back end type
+ * (::hwloc_bitmap_t), and therefore all the hwloc bitmap functions
+ * are applicable to both ::hwloc_cpuset_t and ::hwloc_nodeset_t (see
+ * \ref hwlocality_bitmap).
+ *
+ * The rationale for having two different types is that even though
+ * the actions one wants to perform on these types are the same (e.g.,
+ * enable and disable individual items in the set/mask), they're used
+ * in very different contexts: one for specifying which processors to
+ * use and one for specifying which NUMA nodes to use.  Hence, the
+ * name difference is really just to reflect the intent of where the
+ * type is used.
+ *
+ * @{
+ */
+
+/** \brief A CPU set is a bitmap whose bits are set according to CPU
+ * physical OS indexes.
+ *
+ * It may be consulted and modified with the bitmap API as any
+ * ::hwloc_bitmap_t (see hwloc/bitmap.h).
+ *
+ * Each bit may be converted into a PU object using
+ * hwloc_get_pu_obj_by_os_index().
+ */
+typedef hwloc_bitmap_t hwloc_cpuset_t;
+/** \brief A non-modifiable ::hwloc_cpuset_t. */
+typedef hwloc_const_bitmap_t hwloc_const_cpuset_t;
+
+/** \brief A node set is a bitmap whose bits are set according to NUMA
+ * memory node physical OS indexes.
+ *
+ * It may be consulted and modified with the bitmap API as any
+ * ::hwloc_bitmap_t (see hwloc/bitmap.h).
+ * Each bit may be converted into a NUMA node object using
+ * hwloc_get_numanode_obj_by_os_index().
+ *
+ * When binding memory on a system without any NUMA node,
+ * the single main memory bank is considered as NUMA node #0.
+ *
+ * See also \ref hwlocality_helper_nodeset_convert.
+ */
+typedef hwloc_bitmap_t hwloc_nodeset_t;
+/** \brief A non-modifiable ::hwloc_nodeset_t.
+ */
+typedef hwloc_const_bitmap_t hwloc_const_nodeset_t;
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_object_types Object Types
+ * @{
+ */
+
+/** \brief Type of topology object.
+ *
+ * \note Do not rely on the ordering or completeness of the values as new ones
+ * may be defined in the future!  If you need to compare types, use
+ * hwloc_compare_types() instead.
+ */
+typedef enum {
+
+/** \cond */
+#define HWLOC_OBJ_TYPE_MIN HWLOC_OBJ_MACHINE /* Sentinel value */
+/** \endcond */
+
+  HWLOC_OBJ_MACHINE,	/**< \brief Machine.
+			  * A set of processors and memory with cache
+			  * coherency.
+			  *
+			  * This type is always used for the root object of a topology,
+			  * and never used anywhere else.
+			  * Hence its parent is always \c NULL.
+			  */
+
+  HWLOC_OBJ_PACKAGE,	/**< \brief Physical package.
+			  * The physical package that usually gets inserted
+			  * into a socket on the motherboard.
+			  * A processor package usually contains multiple cores,
+			  * and possibly some dies.
+			  */
+  HWLOC_OBJ_CORE,	/**< \brief Core.
+			  * A computation unit (may be shared by several
+			  * PUs, aka logical processors).
+			  */
+  HWLOC_OBJ_PU,		/**< \brief Processing Unit, or (Logical) Processor.
+			  * An execution unit (may share a core with some
+			  * other logical processors, e.g. in the case of
+			  * an SMT core).
+			  *
+			  * This is the smallest object representing CPU resources,
+			  * it cannot have any child except Misc objects.
+			  *
+			  * Objects of this kind are always reported and can
+			  * thus be used as fallback when others are not.
+			  */
+
+  HWLOC_OBJ_L1CACHE,	/**< \brief Level 1 Data (or Unified) Cache. */
+  HWLOC_OBJ_L2CACHE,	/**< \brief Level 2 Data (or Unified) Cache. */
+  HWLOC_OBJ_L3CACHE,	/**< \brief Level 3 Data (or Unified) Cache. */
+  HWLOC_OBJ_L4CACHE,	/**< \brief Level 4 Data (or Unified) Cache. */
+  HWLOC_OBJ_L5CACHE,	/**< \brief Level 5 Data (or Unified) Cache. */
+
+  HWLOC_OBJ_L1ICACHE,	/**< \brief Level 1 instruction Cache (filtered out by default). */
+  HWLOC_OBJ_L2ICACHE,	/**< \brief Level 2 instruction Cache (filtered out by default). */
+  HWLOC_OBJ_L3ICACHE,	/**< \brief Level 3 instruction Cache (filtered out by default). */
+
+  HWLOC_OBJ_GROUP,	/**< \brief Group objects.
+			  * Objects which do not fit in the above but are
+			  * detected by hwloc and are useful to take into
+			  * account for affinity. For instance, some operating systems
+			  * expose their arbitrary processors aggregation this
+			  * way.  And hwloc may insert such objects to group
+			  * NUMA nodes according to their distances.
+			  * See also \ref faq_groups.
+			  *
+			  * These objects are removed when they do not bring
+			  * any structure (see ::HWLOC_TYPE_FILTER_KEEP_STRUCTURE).
+			  */
+
+  HWLOC_OBJ_NUMANODE,	/**< \brief NUMA node.
+			  * An object that contains memory that is directly
+			  * and byte-accessible to the host processors.
+			  * It is usually close to some cores (the corresponding objects
+			  * are descendants of the NUMA node object in the hwloc tree).
+			  *
+			  * This is the smallest object representing Memory resources,
+			  * it cannot have any child except Misc objects.
+			  * However it may have Memory-side cache parents.
+			  *
+			  * There is always at least one such object in the topology
+			  * even if the machine is not NUMA.
+			  *
+			  * Memory objects are not listed in the main children list,
+			  * but rather in the dedicated Memory children list.
+			  *
+			  * NUMA nodes have a special depth ::HWLOC_TYPE_DEPTH_NUMANODE
+			  * instead of a normal depth just like other objects in the
+			  * main tree.
+			  */
+
+  HWLOC_OBJ_BRIDGE,	/**< \brief Bridge (filtered out by default).
+			  * Any bridge (or PCI switch) that connects the host or an I/O bus,
+			  * to another I/O bus.
+			  *
+			  * Bridges are not added to the topology unless their
+			  * filtering is changed (see hwloc_topology_set_type_filter()
+			  * and hwloc_topology_set_io_types_filter()).
+			  *
+			  * I/O objects are not listed in the main children list,
+			  * but rather in the dedicated io children list.
+			  * I/O objects have NULL CPU and node sets.
+			  */
+  HWLOC_OBJ_PCI_DEVICE,	/**< \brief PCI device (filtered out by default).
+			  *
+			  * PCI devices are not added to the topology unless their
+			  * filtering is changed (see hwloc_topology_set_type_filter()
+			  * and hwloc_topology_set_io_types_filter()).
+			  *
+			  * I/O objects are not listed in the main children list,
+			  * but rather in the dedicated io children list.
+			  * I/O objects have NULL CPU and node sets.
+			  */
+  HWLOC_OBJ_OS_DEVICE,	/**< \brief Operating system device (filtered out by default).
+			  *
+			  * OS devices are not added to the topology unless their
+			  * filtering is changed (see hwloc_topology_set_type_filter()
+			  * and hwloc_topology_set_io_types_filter()).
+			  *
+			  * I/O objects are not listed in the main children list,
+			  * but rather in the dedicated io children list.
+			  * I/O objects have NULL CPU and node sets.
+			  */
+
+  HWLOC_OBJ_MISC,	/**< \brief Miscellaneous objects (filtered out by default).
+			  * Objects without particular meaning, that can e.g. be
+			  * added by the application for its own use, or by hwloc
+			  * for miscellaneous objects such as MemoryModule (DIMMs).
+			  *
+			  * They are not added to the topology unless their filtering
+			  * is changed (see hwloc_topology_set_type_filter()).
+			  *
+			  * These objects are not listed in the main children list,
+			  * but rather in the dedicated misc children list.
+			  * Misc objects may only have Misc objects as children,
+			  * and those are in the dedicated misc children list as well.
+			  * Misc objects have NULL CPU and node sets.
+			  */
+
+  HWLOC_OBJ_MEMCACHE,	/**< \brief Memory-side cache (filtered out by default).
+			  * A cache in front of a specific NUMA node.
+			  *
+			  * This object always has at least one NUMA node as a memory child.
+			  *
+			  * Memory objects are not listed in the main children list,
+			  * but rather in the dedicated Memory children list.
+			  *
+			  * Memory-side cache have a special depth ::HWLOC_TYPE_DEPTH_MEMCACHE
+			  * instead of a normal depth just like other objects in the
+			  * main tree.
+			  */
+
+  HWLOC_OBJ_DIE,	/**< \brief Die within a physical package.
+			 * A subpart of the physical package, that contains multiple cores.
+			 */
+
+  HWLOC_OBJ_TYPE_MAX    /**< \private Sentinel value */
+} hwloc_obj_type_t;
+
+/** \brief Cache type. */
+typedef enum hwloc_obj_cache_type_e {
+  HWLOC_OBJ_CACHE_UNIFIED,      /**< \brief Unified cache. */
+  HWLOC_OBJ_CACHE_DATA,         /**< \brief Data cache. */
+  HWLOC_OBJ_CACHE_INSTRUCTION   /**< \brief Instruction cache (filtered out by default). */
+} hwloc_obj_cache_type_t;
+
+/** \brief Type of one side (upstream or downstream) of an I/O bridge. */
+typedef enum hwloc_obj_bridge_type_e {
+  HWLOC_OBJ_BRIDGE_HOST,	/**< \brief Host-side of a bridge, only possible upstream. */
+  HWLOC_OBJ_BRIDGE_PCI		/**< \brief PCI-side of a bridge. */
+} hwloc_obj_bridge_type_t;
+
+/** \brief Type of an OS device. */
+typedef enum hwloc_obj_osdev_type_e {
+  HWLOC_OBJ_OSDEV_BLOCK,	/**< \brief Operating system block device, or non-volatile memory device.
+				  * For instance "sda" or "dax2.0" on Linux. */
+  HWLOC_OBJ_OSDEV_GPU,		/**< \brief Operating system GPU device.
+				  * For instance ":0.0" for a GL display,
+				  * "card0" for a Linux DRM device. */
+  HWLOC_OBJ_OSDEV_NETWORK,	/**< \brief Operating system network device.
+				  * For instance the "eth0" interface on Linux. */
+  HWLOC_OBJ_OSDEV_OPENFABRICS,	/**< \brief Operating system openfabrics device.
+				  * For instance the "mlx4_0" InfiniBand HCA,
+				  * or "hfi1_0" Omni-Path interface on Linux. */
+  HWLOC_OBJ_OSDEV_DMA,		/**< \brief Operating system dma engine device.
+				  * For instance the "dma0chan0" DMA channel on Linux. */
+  HWLOC_OBJ_OSDEV_COPROC	/**< \brief Operating system co-processor device.
+				  * For instance "opencl0d0" for an OpenCL device,
+				  * "cuda0" for a CUDA device. */
+} hwloc_obj_osdev_type_t;
+
+/** \brief Compare the depth of two object types
+ *
+ * Types shouldn't be compared as they are, since newer ones may be added in
+ * the future.  This function returns less than, equal to, or greater than zero
+ * respectively if \p type1 objects usually include \p type2 objects, are the
+ * same as \p type2 objects, or are included in \p type2 objects. If the types
+ * can not be compared (because neither is usually contained in the other),
+ * ::HWLOC_TYPE_UNORDERED is returned.  Object types containing CPUs can always
+ * be compared (usually, a system contains machines which contain nodes which
+ * contain packages which contain caches, which contain cores, which contain
+ * processors).
+ *
+ * \note ::HWLOC_OBJ_PU will always be the deepest,
+ * while ::HWLOC_OBJ_MACHINE is always the highest.
+ *
+ * \note This does not mean that the actual topology will respect that order:
+ * e.g. as of today cores may also contain caches, and packages may also contain
+ * nodes. This is thus just to be seen as a fallback comparison method.
+ */
+HWLOC_DECLSPEC int hwloc_compare_types (hwloc_obj_type_t type1, hwloc_obj_type_t type2) __hwloc_attribute_const;
+
+/** \brief Value returned by hwloc_compare_types() when types can not be compared. \hideinitializer */
+#define HWLOC_TYPE_UNORDERED INT_MAX
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_objects Object Structure and Attributes
+ * @{
+ */
+
+union hwloc_obj_attr_u;
+
+/** \brief Structure of a topology object
+ *
+ * Applications must not modify any field except \p hwloc_obj.userdata.
+ */
+struct hwloc_obj {
+  /* physical information */
+  hwloc_obj_type_t type;		/**< \brief Type of object */
+  char *subtype;			/**< \brief Subtype string to better describe the type field. */
+
+  unsigned os_index;			/**< \brief OS-provided physical index number.
+					 * It is not guaranteed unique across the entire machine,
+					 * except for PUs and NUMA nodes.
+					 * Set to HWLOC_UNKNOWN_INDEX if unknown or irrelevant for this object.
+					 */
+#define HWLOC_UNKNOWN_INDEX (unsigned)-1
+
+  char *name;				/**< \brief Object-specific name if any.
+					 * Mostly used for identifying OS devices and Misc objects where
+					 * a name string is more useful than numerical indexes.
+					 */
+
+  hwloc_uint64_t total_memory; /**< \brief Total memory (in bytes) in NUMA nodes below this object. */
+
+  union hwloc_obj_attr_u *attr;		/**< \brief Object type-specific Attributes,
+					 * may be \c NULL if no attribute value was found */
+
+  /* global position */
+  int depth;				/**< \brief Vertical index in the hierarchy.
+					 *
+					 * For normal objects, this is the depth of the horizontal level
+					 * that contains this object and its cousins of the same type.
+					 * If the topology is symmetric, this is equal to the parent depth
+					 * plus one, and also equal to the number of parent/child links
+					 * from the root object to here.
+					 *
+					 * For special objects (NUMA nodes, I/O and Misc) that are not
+					 * in the main tree, this is a special negative value that
+					 * corresponds to their dedicated level,
+					 * see hwloc_get_type_depth() and ::hwloc_get_type_depth_e.
+					 * Those special values can be passed to hwloc functions such as
+					 * hwloc_get_nbobjs_by_depth() as usual.
+					 */
+  unsigned logical_index;		/**< \brief Horizontal index in the whole list of similar objects,
+					 * hence guaranteed unique across the entire machine.
+					 * Could be a "cousin_rank" since it's the rank within the "cousin" list below
+					 * Note that this index may change when restricting the topology
+					 * or when inserting a group.
+					 */
+
+  /* cousins are all objects of the same type (and depth) across the entire topology */
+  struct hwloc_obj *next_cousin;	/**< \brief Next object of same type and depth */
+  struct hwloc_obj *prev_cousin;	/**< \brief Previous object of same type and depth */
+
+  /* children of the same parent are siblings, even if they may have different type and depth */
+  struct hwloc_obj *parent;		/**< \brief Parent, \c NULL if root (Machine object) */
+  unsigned sibling_rank;		/**< \brief Index in parent's \c children[] array. Or the index in parent's Memory, I/O or Misc children list. */
+  struct hwloc_obj *next_sibling;	/**< \brief Next object below the same parent (inside the same list of children). */
+  struct hwloc_obj *prev_sibling;	/**< \brief Previous object below the same parent (inside the same list of children). */
+  /** @name List and array of normal children below this object (except Memory, I/O and Misc children). */
+  /**@{*/
+  unsigned arity;			/**< \brief Number of normal children.
+					 * Memory, Misc and I/O children are not listed here
+					 * but rather in their dedicated children list.
+					 */
+  struct hwloc_obj **children;		/**< \brief Normal children, \c children[0 .. arity -1] */
+  struct hwloc_obj *first_child;	/**< \brief First normal child */
+  struct hwloc_obj *last_child;		/**< \brief Last normal child */
+  /**@}*/
+
+  int symmetric_subtree;		/**< \brief Set if the subtree of normal objects below this object is symmetric,
+					  * which means all normal children and their children have identical subtrees.
+					  *
+					  * Memory, I/O and Misc children are ignored.
+					  *
+					  * If set in the topology root object, lstopo may export the topology
+					  * as a synthetic string.
+					  */
+
+  /** @name List of Memory children below this object. */
+  /**@{*/
+  unsigned memory_arity;		/**< \brief Number of Memory children.
+					 * These children are listed in \p memory_first_child.
+					 */
+  struct hwloc_obj *memory_first_child;	/**< \brief First Memory child.
+					 * NUMA nodes and Memory-side caches are listed here
+					 * (\p memory_arity and \p memory_first_child)
+					 * instead of in the normal children list.
+					 * See also hwloc_obj_type_is_memory().
+					 *
+					 * A memory hierarchy starts from a normal CPU-side object
+					 * (e.g. Package) and ends with NUMA nodes as leaves.
+					 * There might exist some memory-side caches between them
+					 * in the middle of the memory subtree.
+					 */
+  /**@}*/
+
+  /** @name List of I/O children below this object. */
+  /**@{*/
+  unsigned io_arity;			/**< \brief Number of I/O children.
+					 * These children are listed in \p io_first_child.
+					 */
+  struct hwloc_obj *io_first_child;	/**< \brief First I/O child.
+					 * Bridges, PCI and OS devices are listed here (\p io_arity and \p io_first_child)
+					 * instead of in the normal children list.
+					 * See also hwloc_obj_type_is_io().
+					 */
+  /**@}*/
+
+  /** @name List of Misc children below this object. */
+  /**@{*/
+  unsigned misc_arity;			/**< \brief Number of Misc children.
+					 * These children are listed in \p misc_first_child.
+					 */
+  struct hwloc_obj *misc_first_child;	/**< \brief First Misc child.
+					 * Misc objects are listed here (\p misc_arity and \p misc_first_child)
+					 * instead of in the normal children list.
+					 */
+  /**@}*/
+
+  /* cpusets and nodesets */
+  hwloc_cpuset_t cpuset;		/**< \brief CPUs covered by this object
+                                          *
+                                          * This is the set of CPUs for which there are PU objects in the topology
+                                          * under this object, i.e. which are known to be physically contained in this
+                                          * object and known how (the children path between this object and the PU
+                                          * objects).
+                                          *
+                                          * If the ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED configuration flag is set,
+                                          * some of these CPUs may not be allowed for binding,
+                                          * see hwloc_topology_get_allowed_cpuset().
+                                          *
+					  * \note All objects have non-NULL CPU and node sets except Misc and I/O objects.
+					  *
+                                          * \note Its value must not be changed, hwloc_bitmap_dup() must be used instead.
+                                          */
+  hwloc_cpuset_t complete_cpuset;       /**< \brief The complete CPU set of processors of this object,
+                                          *
+                                          * This may include not only the same as the cpuset field, but also some CPUs for
+                                          * which topology information is unknown or incomplete, some offline CPUs, and
+                                          * the CPUs that are ignored when the ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED flag
+                                          * is not set.
+                                          * Thus no corresponding PU object may be found in the topology, because the
+                                          * precise position is undefined. It is however known that it would be somewhere
+                                          * under this object.
+                                          *
+                                          * \note Its value must not be changed, hwloc_bitmap_dup() must be used instead.
+                                          */
+
+  hwloc_nodeset_t nodeset;              /**< \brief NUMA nodes covered by this object or containing this object
+                                          *
+                                          * This is the set of NUMA nodes for which there are NUMA node objects in the
+                                          * topology under or above this object, i.e. which are known to be physically
+                                          * contained in this object or containing it and known how (the children path
+                                          * between this object and the NUMA node objects).
+                                          *
+                                          * In the end, these nodes are those that are close to the current object.
+                                          * Function hwloc_get_local_numanode_objs() may be used to list those NUMA
+                                          * nodes more precisely.
+                                          *
+                                          * If the ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED configuration flag is set,
+                                          * some of these nodes may not be allowed for allocation,
+                                          * see hwloc_topology_get_allowed_nodeset().
+                                          *
+                                          * If there are no NUMA nodes in the machine, all the memory is close to this
+                                          * object, so only the first bit may be set in \p nodeset.
+                                          *
+					  * \note All objects have non-NULL CPU and node sets except Misc and I/O objects.
+					  *
+                                          * \note Its value must not be changed, hwloc_bitmap_dup() must be used instead.
+                                          */
+  hwloc_nodeset_t complete_nodeset;     /**< \brief The complete NUMA node set of this object,
+                                          *
+                                          * This may include not only the same as the nodeset field, but also some NUMA
+                                          * nodes for which topology information is unknown or incomplete, some offline
+                                          * nodes, and the nodes that are ignored when the ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED
+                                          * flag is not set.
+                                          * Thus no corresponding NUMA node object may be found in the topology, because the
+                                          * precise position is undefined. It is however known that it would be
+                                          * somewhere under this object.
+                                          *
+                                          * If there are no NUMA nodes in the machine, all the memory is close to this
+                                          * object, so only the first bit is set in \p complete_nodeset.
+                                          *
+                                          * \note Its value must not be changed, hwloc_bitmap_dup() must be used instead.
+                                          */
+
+  struct hwloc_info_s *infos;		/**< \brief Array of stringified info type=name. */
+  unsigned infos_count;			/**< \brief Size of infos array. */
+
+  /* misc */
+  void *userdata;			/**< \brief Application-given private data pointer,
+					 * initialized to \c NULL, use it as you wish.
+					 * See hwloc_topology_set_userdata_export_callback() in hwloc/export.h
+					 * if you wish to export this field to XML. */
+
+  hwloc_uint64_t gp_index;			/**< \brief Global persistent index.
+					 * Generated by hwloc, unique across the topology (contrary to os_index)
+					 * and persistent across topology changes (contrary to logical_index).
+					 * Mostly used internally, but could also be used by application to identify objects.
+					 */
+};
+/**
+ * \brief Convenience typedef; a pointer to a struct hwloc_obj.
+ */
+typedef struct hwloc_obj * hwloc_obj_t;
+
+/** \brief Object type-specific Attributes */
+union hwloc_obj_attr_u {
+  /** \brief NUMA node-specific Object Attributes */
+  struct hwloc_numanode_attr_s {
+    hwloc_uint64_t local_memory; /**< \brief Local memory (in bytes) */
+    unsigned page_types_len; /**< \brief Size of array \p page_types */
+    /** \brief Array of local memory page types, \c NULL if no local memory and \p page_types is 0.
+     *
+     * The array is sorted by increasing \p size fields.
+     * It contains \p page_types_len slots.
+     */
+    struct hwloc_memory_page_type_s {
+      hwloc_uint64_t size;	/**< \brief Size of pages */
+      hwloc_uint64_t count;	/**< \brief Number of pages of this size */
+    } * page_types;
+  } numanode;
+
+  /** \brief Cache-specific Object Attributes */
+  struct hwloc_cache_attr_s {
+    hwloc_uint64_t size;		  /**< \brief Size of cache in bytes */
+    unsigned depth;			  /**< \brief Depth of cache (e.g., L1, L2, ...etc.) */
+    unsigned linesize;			  /**< \brief Cache-line size in bytes. 0 if unknown */
+    int associativity;			  /**< \brief Ways of associativity,
+    					    *  -1 if fully associative, 0 if unknown */
+    hwloc_obj_cache_type_t type;          /**< \brief Cache type */
+  } cache;
+  /** \brief Group-specific Object Attributes */
+  struct hwloc_group_attr_s {
+    unsigned depth;			  /**< \brief Depth of group object.
+					   *   It may change if intermediate Group objects are added. */
+    unsigned kind;			  /**< \brief Internally-used kind of group. */
+    unsigned subkind;			  /**< \brief Internally-used subkind to distinguish different levels of groups with same kind */
+    unsigned char dont_merge;		  /**< \brief Flag preventing groups from being automatically merged with identical parent or children. */
+  } group;
+  /** \brief PCI Device specific Object Attributes */
+  struct hwloc_pcidev_attr_s {
+#ifndef HWLOC_HAVE_32BITS_PCI_DOMAIN
+    unsigned short domain; /* Only 16bits PCI domains are supported by default */
+#else
+    unsigned int domain; /* 32bits PCI domain support breaks the library ABI, hence it's disabled by default */
+#endif
+    unsigned char bus, dev, func;
+    unsigned short class_id;
+    unsigned short vendor_id, device_id, subvendor_id, subdevice_id;
+    unsigned char revision;
+    float linkspeed; /* in GB/s */
+  } pcidev;
+  /** \brief Bridge specific Object Attributes */
+  struct hwloc_bridge_attr_s {
+    union {
+      struct hwloc_pcidev_attr_s pci;
+    } upstream;
+    hwloc_obj_bridge_type_t upstream_type;
+    union {
+      struct {
+#ifndef HWLOC_HAVE_32BITS_PCI_DOMAIN
+	unsigned short domain; /* Only 16bits PCI domains are supported by default */
+#else
+	unsigned int domain; /* 32bits PCI domain support breaks the library ABI, hence it's disabled by default */
+#endif
+	unsigned char secondary_bus, subordinate_bus;
+      } pci;
+    } downstream;
+    hwloc_obj_bridge_type_t downstream_type;
+    unsigned depth;
+  } bridge;
+  /** \brief OS Device specific Object Attributes */
+  struct hwloc_osdev_attr_s {
+    hwloc_obj_osdev_type_t type;
+  } osdev;
+};
+
+/** \brief Object info
+ *
+ * \sa hwlocality_info_attr
+ */
+struct hwloc_info_s {
+  char *name;	/**< \brief Info name */
+  char *value;	/**< \brief Info value */
+};
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_creation Topology Creation and Destruction
+ * @{
+ */
+
+struct hwloc_topology;
+/** \brief Topology context
+ *
+ * To be initialized with hwloc_topology_init() and built with hwloc_topology_load().
+ */
+typedef struct hwloc_topology * hwloc_topology_t;
+
+/** \brief Allocate a topology context.
+ *
+ * \param[out] topologyp is assigned a pointer to the new allocated context.
+ *
+ * \return 0 on success, -1 on error.
+ */
+HWLOC_DECLSPEC int hwloc_topology_init (hwloc_topology_t *topologyp);
+
+/** \brief Build the actual topology
+ *
+ * Build the actual topology once initialized with hwloc_topology_init() and
+ * tuned with \ref hwlocality_configuration and \ref hwlocality_setsource routines.
+ * No other routine may be called earlier using this topology context.
+ *
+ * \param topology is the topology to be loaded with objects.
+ *
+ * \return 0 on success, -1 on error.
+ *
+ * \note On failure, the topology is reinitialized. It should be either
+ * destroyed with hwloc_topology_destroy() or configured and loaded again.
+ *
+ * \note This function may be called only once per topology.
+ *
+ * \note The binding of the current thread or process may temporarily change
+ * during this call but it will be restored before it returns.
+ *
+ * \sa hwlocality_configuration and hwlocality_setsource
+ */
+HWLOC_DECLSPEC int hwloc_topology_load(hwloc_topology_t topology);
+
+/** \brief Terminate and free a topology context
+ *
+ * \param topology is the topology to be freed
+ */
+HWLOC_DECLSPEC void hwloc_topology_destroy (hwloc_topology_t topology);
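+
+/* Editor's note: a minimal lifecycle sketch (illustration only, not part of
+ * the hwloc API), showing the three functions above together. Error handling
+ * is abbreviated.
+ *
+ *   hwloc_topology_t topo;
+ *   if (hwloc_topology_init(&topo) < 0)
+ *       return -1;
+ *   if (hwloc_topology_load(topo) < 0) {
+ *       hwloc_topology_destroy(topo);
+ *       return -1;
+ *   }
+ *   // ... query the topology ...
+ *   hwloc_topology_destroy(topo);
+ */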
+
+/** \brief Duplicate a topology.
+ *
+ * The entire topology structure as well as its objects
+ * are duplicated into a new one.
+ *
+ * This is useful for keeping a backup while modifying a topology.
+ *
+ * \note Object userdata is not duplicated since hwloc does not know what it points to.
+ * The objects of both old and new topologies will point to the same userdata.
+ */
+HWLOC_DECLSPEC int hwloc_topology_dup(hwloc_topology_t *newtopology, hwloc_topology_t oldtopology);
+
+/** \brief Verify that the topology is compatible with the current hwloc library.
+ *
+ * This is useful when using the same topology structure (in memory)
+ * in different libraries that may use different hwloc installations
+ * (for instance if one library embeds a specific version of hwloc,
+ * while another library uses a default system-wide hwloc installation).
+ *
+ * If all libraries/programs use the same hwloc installation, this function
+ * always returns success.
+ *
+ * \return \c 0 on success.
+ *
+ * \return \c -1 with \p errno set to \c EINVAL if incompatible.
+ *
+ * \note If sharing between processes with hwloc_shmem_topology_write(),
+ * the relevant check is already performed inside hwloc_shmem_topology_adopt().
+ */
+HWLOC_DECLSPEC int hwloc_topology_abi_check(hwloc_topology_t topology);
+
+/** \brief Run internal checks on a topology structure
+ *
+ * The program aborts if an inconsistency is detected in the given topology.
+ *
+ * \param topology is the topology to be checked
+ *
+ * \note This routine is only useful to developers.
+ *
+ * \note The input topology should have been previously loaded with
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC void hwloc_topology_check(hwloc_topology_t topology);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_levels Object levels, depths and types
+ * @{
+ *
+ * Be sure to see the figure in \ref termsanddefs that shows a
+ * complete topology tree, including depths, child/sibling/cousin
+ * relationships, and an example of an asymmetric topology where one
+ * package has fewer caches than its peers.
+ */
+
+/** \brief Get the depth of the hierarchical tree of objects.
+ *
+ * This is the depth of ::HWLOC_OBJ_PU objects plus one.
+ *
+ * \note NUMA nodes, I/O and Misc objects are ignored when computing
+ * the depth of the tree (they are placed on special levels).
+ */
+HWLOC_DECLSPEC int hwloc_topology_get_depth(hwloc_topology_t __hwloc_restrict topology) __hwloc_attribute_pure;
+
+/** \brief Returns the depth of objects of type \p type.
+ *
+ * If no object of this type is present on the underlying architecture, or if
+ * the OS doesn't provide this kind of information, the function returns
+ * ::HWLOC_TYPE_DEPTH_UNKNOWN.
+ *
+ * If type is absent but a similar type is acceptable, see also
+ * hwloc_get_type_or_below_depth() and hwloc_get_type_or_above_depth().
+ *
+ * If ::HWLOC_OBJ_GROUP is given, the function may return ::HWLOC_TYPE_DEPTH_MULTIPLE
+ * if multiple levels of Groups exist.
+ *
+ * If a NUMA node, I/O or Misc object type is given, the function returns a virtual
+ * value because these objects are stored in special levels that are not CPU-related.
+ * This virtual depth may be passed to other hwloc functions such as
+ * hwloc_get_obj_by_depth() but it should not be considered as an actual
+ * depth by the application. In particular, it should not be compared with
+ * any other object depth or with the entire topology depth.
+ * \sa hwloc_get_memory_parents_depth().
+ *
+ * \sa hwloc_type_sscanf_as_depth() for returning the depth of objects
+ * whose type is given as a string.
+ */
+HWLOC_DECLSPEC int hwloc_get_type_depth (hwloc_topology_t topology, hwloc_obj_type_t type);
+
+enum hwloc_get_type_depth_e {
+    HWLOC_TYPE_DEPTH_UNKNOWN = -1,    /**< \brief No object of given type exists in the topology. \hideinitializer */
+    HWLOC_TYPE_DEPTH_MULTIPLE = -2,   /**< \brief Objects of given type exist at different depths in the topology (only for Groups). \hideinitializer */
+    HWLOC_TYPE_DEPTH_NUMANODE = -3,   /**< \brief Virtual depth for NUMA nodes. \hideinitializer */
+    HWLOC_TYPE_DEPTH_BRIDGE = -4,     /**< \brief Virtual depth for bridge object level. \hideinitializer */
+    HWLOC_TYPE_DEPTH_PCI_DEVICE = -5, /**< \brief Virtual depth for PCI device object level. \hideinitializer */
+    HWLOC_TYPE_DEPTH_OS_DEVICE = -6,  /**< \brief Virtual depth for software device object level. \hideinitializer */
+    HWLOC_TYPE_DEPTH_MISC = -7,       /**< \brief Virtual depth for Misc object. \hideinitializer */
+    HWLOC_TYPE_DEPTH_MEMCACHE = -8    /**< \brief Virtual depth for MemCache object. \hideinitializer */
+};
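+
+/* Editor's note: a sketch (illustration only) of querying a type depth and
+ * handling the special values above, assuming a loaded topology \c topo.
+ *
+ *   int depth = hwloc_get_type_depth(topo, HWLOC_OBJ_CORE);
+ *   if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+ *       printf("no Core objects in this topology\n");
+ *   else
+ *       printf("%u Core objects\n", hwloc_get_nbobjs_by_depth(topo, depth));
+ */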
+
+/** \brief Return the depth of parents where memory objects are attached.
+ *
+ * Memory objects have virtual negative depths because they are not part of
+ * the main CPU-side hierarchy of objects. This depth should not be compared
+ * with other level depths.
+ *
+ * If all Memory objects are attached to Normal parents at the same depth,
+ * this parent depth may be compared to others as usual, for instance
+ * to know whether NUMA nodes are attached above or below Packages.
+ *
+ * \return The depth of Normal parents of all memory children
+ * if all these parents have the same depth. For instance the depth of
+ * the Package level if all NUMA nodes are attached to Package objects.
+ *
+ * \return ::HWLOC_TYPE_DEPTH_MULTIPLE if Normal parents of all
+ * memory children do not have the same depth. For instance if some
+ * NUMA nodes are attached to Packages while others are attached to
+ * Groups.
+ */
+HWLOC_DECLSPEC int hwloc_get_memory_parents_depth (hwloc_topology_t topology);
+
+/** \brief Returns the depth of objects of type \p type or below
+ *
+ * If no object of this type is present on the underlying architecture, the
+ * function returns the depth of the first "present" object typically found
+ * inside \p type.
+ *
+ * This function is only meaningful for normal object types.
+ * If a memory, I/O or Misc object type is given, the corresponding virtual
+ * depth is always returned (see hwloc_get_type_depth()).
+ *
+ * May return ::HWLOC_TYPE_DEPTH_MULTIPLE for ::HWLOC_OBJ_GROUP just like
+ * hwloc_get_type_depth().
+ */
+static __hwloc_inline int
+hwloc_get_type_or_below_depth (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
+
+/** \brief Returns the depth of objects of type \p type or above
+ *
+ * If no object of this type is present on the underlying architecture, the
+ * function returns the depth of the first "present" object typically
+ * containing \p type.
+ *
+ * This function is only meaningful for normal object types.
+ * If a memory, I/O or Misc object type is given, the corresponding virtual
+ * depth is always returned (see hwloc_get_type_depth()).
+ *
+ * May return ::HWLOC_TYPE_DEPTH_MULTIPLE for ::HWLOC_OBJ_GROUP just like
+ * hwloc_get_type_depth().
+ */
+static __hwloc_inline int
+hwloc_get_type_or_above_depth (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
+
+/** \brief Returns the type of objects at depth \p depth.
+ *
+ * \p depth should be between 0 and hwloc_topology_get_depth()-1,
+ * or a virtual depth such as ::HWLOC_TYPE_DEPTH_NUMANODE.
+ *
+ * \return (hwloc_obj_type_t)-1 if depth \p depth does not exist.
+ */
+HWLOC_DECLSPEC hwloc_obj_type_t hwloc_get_depth_type (hwloc_topology_t topology, int depth) __hwloc_attribute_pure;
+
+/** \brief Returns the width of level at depth \p depth.
+ */
+HWLOC_DECLSPEC unsigned hwloc_get_nbobjs_by_depth (hwloc_topology_t topology, int depth) __hwloc_attribute_pure;
+
+/** \brief Returns the width of level type \p type
+ *
+ * If no object for that type exists, 0 is returned.
+ * If there are several levels with objects of that type, -1 is returned.
+ */
+static __hwloc_inline int
+hwloc_get_nbobjs_by_type (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
+
+/** \brief Returns the top object of the topology tree.
+ *
+ * Its type is ::HWLOC_OBJ_MACHINE.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_root_obj (hwloc_topology_t topology) __hwloc_attribute_pure;
+
+/** \brief Returns the topology object at logical index \p idx from depth \p depth */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_get_obj_by_depth (hwloc_topology_t topology, int depth, unsigned idx) __hwloc_attribute_pure;
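+
+/* Editor's note: a sketch (illustration only) walking every level of a loaded
+ * topology \c topo and printing its width and type with the functions above.
+ *
+ *   int depth, topodepth = hwloc_topology_get_depth(topo);
+ *   for (depth = 0; depth < topodepth; depth++)
+ *       printf("level %d: %u x %s\n", depth,
+ *              hwloc_get_nbobjs_by_depth(topo, depth),
+ *              hwloc_obj_type_string(hwloc_get_depth_type(topo, depth)));
+ */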
+
+/** \brief Returns the topology object at logical index \p idx with type \p type
+ *
+ * If no object for that type exists, \c NULL is returned.
+ * If there are several levels with objects of that type (::HWLOC_OBJ_GROUP),
+ * \c NULL is returned and the caller may fall back to hwloc_get_obj_by_depth().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type, unsigned idx) __hwloc_attribute_pure;
+
+/** \brief Returns the next object at depth \p depth.
+ *
+ * If \p prev is \c NULL, return the first object at depth \p depth.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_depth (hwloc_topology_t topology, int depth, hwloc_obj_t prev);
+
+/** \brief Returns the next object of type \p type.
+ *
+ * If \p prev is \c NULL, return the first object of type \p type.  If
+ * there is no depth or multiple depths for the given type, return \c NULL and
+ * let the caller fall back to hwloc_get_next_obj_by_depth().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type,
+			    hwloc_obj_t prev);
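+
+/* Editor's note: the usual iteration idiom (illustration only): pass \c NULL
+ * first, then the previously returned object, until \c NULL comes back.
+ *
+ *   hwloc_obj_t pu = NULL;
+ *   while ((pu = hwloc_get_next_obj_by_type(topo, HWLOC_OBJ_PU, pu)) != NULL)
+ *       printf("PU L#%u (OS index %u)\n", pu->logical_index, pu->os_index);
+ */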
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_object_strings Converting between Object Types and Attributes, and Strings
+ * @{
+ */
+
+/** \brief Return a constant stringified object type.
+ *
+ * This function is the basic way to convert a generic type into a string.
+ * The output string may be parsed back by hwloc_type_sscanf().
+ *
+ * hwloc_obj_type_snprintf() may return a more precise output for a specific
+ * object, but it requires the caller to provide the output buffer.
+ */
+HWLOC_DECLSPEC const char * hwloc_obj_type_string (hwloc_obj_type_t type) __hwloc_attribute_const;
+
+/** \brief Stringify the type of a given topology object into a human-readable form.
+ *
+ * Contrary to hwloc_obj_type_string(), this function includes object-specific
+ * attributes (such as the Group depth, the Bridge type, or OS device type)
+ * in the output, and it requires the caller to provide the output buffer.
+ *
+ * The output is guaranteed to be the same for all objects of the same topology level.
+ *
+ * If \p verbose is 1, longer type names are used, e.g. L1Cache instead of L1.
+ *
+ * The output string may be parsed back by hwloc_type_sscanf().
+ *
+ * If \p size is 0, \p string may safely be \c NULL.
+ *
+ * \return the number of characters actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_obj_type_snprintf(char * __hwloc_restrict string, size_t size,
+					   hwloc_obj_t obj,
+					   int verbose);
+
+/** \brief Stringify the attributes of a given topology object into a human-readable form.
+ *
+ * Attribute values are separated by \p separator.
+ *
+ * Only the major attributes are printed in non-verbose mode.
+ *
+ * If \p size is 0, \p string may safely be \c NULL.
+ *
+ * \return the number of characters actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_obj_attr_snprintf(char * __hwloc_restrict string, size_t size,
+					   hwloc_obj_t obj, const char * __hwloc_restrict separator,
+					   int verbose);
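+
+/* Editor's note: a sketch (illustration only) combining the two functions
+ * above to print an object together with its attributes.
+ *
+ *   char type[32], attr[64];
+ *   hwloc_obj_type_snprintf(type, sizeof(type), obj, 0);
+ *   hwloc_obj_attr_snprintf(attr, sizeof(attr), obj, ", ", 0);
+ *   printf("%s (%s)\n", type, attr);
+ */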
+
+/** \brief Return an object type and attributes from a type string.
+ *
+ * Convert strings such as "Package" or "L1iCache" into the corresponding types.
+ * Matching is case-insensitive, and only the first letters are actually
+ * required to match.
+ *
+ * The matched object type is set in \p typep (which cannot be \c NULL).
+ *
+ * Type-specific attributes, for instance Cache type, Cache depth, Group depth,
+ * Bridge type or OS Device type may be returned in \p attrp.
+ * Attributes that are not specified in the string (for instance "Group"
+ * without a depth, or "L2Cache" without a cache type) are set to -1.
+ *
+ * \p attrp is only filled if not \c NULL and if its size specified in \p attrsize
+ * is large enough. It should be at least as large as union hwloc_obj_attr_u.
+ *
+ * \return 0 if a type was correctly identified, otherwise -1.
+ *
+ * \note This function is guaranteed to match any string returned by
+ * hwloc_obj_type_string() or hwloc_obj_type_snprintf().
+ *
+ * \note This is an extended version of the now deprecated hwloc_obj_type_sscanf().
+ */
+HWLOC_DECLSPEC int hwloc_type_sscanf(const char *string,
+				     hwloc_obj_type_t *typep,
+				     union hwloc_obj_attr_u *attrp, size_t attrsize);
+
+/** \brief Return an object type and its level depth from a type string.
+ *
+ * Convert strings such as "Package" or "L1iCache" into the corresponding types
+ * and return in \p depthp the depth of the corresponding level in the
+ * topology \p topology.
+ *
+ * If no object of this type is present on the underlying architecture,
+ * ::HWLOC_TYPE_DEPTH_UNKNOWN is returned.
+ *
+ * If multiple such levels exist (for instance if giving Group without any depth),
+ * the function may return ::HWLOC_TYPE_DEPTH_MULTIPLE instead.
+ *
+ * The matched object type is set in \p typep if \p typep is non \c NULL.
+ *
+ * \note This function is similar to hwloc_type_sscanf() followed
+ * by hwloc_get_type_depth() but it also automatically disambiguates
+ * multiple group levels etc.
+ *
+ * \note This function is guaranteed to match any string returned by
+ * hwloc_obj_type_string() or hwloc_obj_type_snprintf().
+ */
+HWLOC_DECLSPEC int hwloc_type_sscanf_as_depth(const char *string,
+					      hwloc_obj_type_t *typep,
+					      hwloc_topology_t topology, int *depthp);
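+
+/* Editor's note: a sketch (illustration only) resolving a user-supplied type
+ * name such as "Core" directly to a level depth of topology \c topo.
+ *
+ *   hwloc_obj_type_t type;
+ *   int depth;
+ *   if (hwloc_type_sscanf_as_depth("Core", &type, topo, &depth) == 0
+ *       && depth >= 0)
+ *       printf("Core level is at depth %d\n", depth);
+ */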
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_info_attr Consulting and Adding Key-Value Info Attributes
+ *
+ * @{
+ */
+
+/** \brief Search the given key name in object infos and return the corresponding value.
+ *
+ * If multiple keys match the given name, only the first one is returned.
+ *
+ * \return \c NULL if no such key exists.
+ */
+static __hwloc_inline const char *
+hwloc_obj_get_info_by_name(hwloc_obj_t obj, const char *name) __hwloc_attribute_pure;
+
+/** \brief Add the given info name and value pair to the given object.
+ *
+ * The info is appended to the existing info array even if another key
+ * with the same name already exists.
+ *
+ * The input strings are copied before being added to the object infos.
+ *
+ * \return \c 0 on success, \c -1 on error.
+ *
+ * \note This function may be used to enforce object colors in the lstopo
+ * graphical output by using "lstopoStyle" as a name and "Background=#rrggbb"
+ * as a value. See CUSTOM COLORS in the lstopo(1) manpage for details.
+ *
+ * \note If \p value contains some non-printable characters, they will
+ * be dropped when exporting to XML, see hwloc_topology_export_xml() in hwloc/export.h.
+ */
+HWLOC_DECLSPEC int hwloc_obj_add_info(hwloc_obj_t obj, const char *name, const char *value);
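+
+/* Editor's note: a sketch (illustration only) reading and adding info pairs;
+ * "MyKey"/"MyValue" are arbitrary example strings.
+ *
+ *   const char *model = hwloc_obj_get_info_by_name(obj, "CPUModel");
+ *   if (model)
+ *       printf("CPU model: %s\n", model);
+ *   hwloc_obj_add_info(obj, "MyKey", "MyValue");
+ */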
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_cpubinding CPU binding
+ *
+ * Some operating systems only support binding threads or processes to a single PU.
+ * Others allow binding to larger sets such as entire Cores or Packages or
+ * even random sets of individual PUs. On such operating systems, the scheduler
+ * is free to run the task on one of these PUs, then migrate it to another PU, etc.
+ * It is often useful to call hwloc_bitmap_singlify() on the target CPU set before
+ * passing it to the binding function to avoid these expensive migrations.
+ * See the documentation of hwloc_bitmap_singlify() for details.
+ *
+ * Some operating systems do not provide all hwloc-supported
+ * mechanisms to bind processes, threads, etc.
+ * hwloc_topology_get_support() may be used to query about the actual CPU
+ * binding support in the currently used operating system.
+ *
+ * When the requested binding operation is not available and the
+ * ::HWLOC_CPUBIND_STRICT flag was passed, the function returns -1.
+ * \p errno is set to \c ENOSYS when it is not possible to bind the requested kind
+ * of object (process/thread). \p errno is set to \c EXDEV when the requested cpuset
+ * cannot be enforced (e.g. some systems only allow binding to one CPU, and some
+ * other systems only allow one NUMA node).
+ *
+ * If ::HWLOC_CPUBIND_STRICT was not passed, the function may fail as well,
+ * or the operating system may use a slightly different operation
+ * (with side-effects, smaller binding set, etc.)
+ * when the requested operation is not exactly supported.
+ *
+ * The most portable version that should be preferred over the others,
+ * whenever possible, is the following one which just binds the current program,
+ * assuming it is single-threaded:
+ *
+ * \code
+ * hwloc_set_cpubind(topology, set, 0);
+ * \endcode
+ *
+ * If the program may be multithreaded, the following one should be preferred
+ * to only bind the current thread:
+ *
+ * \code
+ * hwloc_set_cpubind(topology, set, HWLOC_CPUBIND_THREAD);
+ * \endcode
+ *
+ * \sa Some example codes are available under doc/examples/ in the source tree.
+ *
+ * \note To unbind, just call the binding function with either a full cpuset or
+ * a cpuset equal to the system cpuset.
+ *
+ * \note On some operating systems, CPU binding may have effects on memory binding, see
+ * ::HWLOC_CPUBIND_NOMEMBIND
+ *
+ * \note Running lstopo \--top or hwloc-ps is a very convenient way to check
+ * how binding actually happened.
+ * @{
+ */
+
+/** \brief Process/Thread binding flags.
+ *
+ * These bit flags can be used to refine the binding policy.
+ *
+ * The default (0) is to bind the current process, assumed to be
+ * single-threaded, in a non-strict way.  This is the most portable
+ * way to bind as all operating systems usually provide it.
+ *
+ * \note Not all systems support all kinds of binding.  See the
+ * "Detailed Description" section of \ref hwlocality_cpubinding for a
+ * description of errors that can occur.
+ */
+typedef enum {
+  /** \brief Bind all threads of the current (possibly) multithreaded process.
+   * \hideinitializer */
+  HWLOC_CPUBIND_PROCESS = (1<<0),
+
+  /** \brief Bind current thread of current process.
+   * \hideinitializer */
+  HWLOC_CPUBIND_THREAD = (1<<1),
+
+  /** \brief Request for strict binding from the OS.
+   *
+   * By default, when the designated CPUs are all busy while other
+   * CPUs are idle, operating systems may execute the thread/process
+   * on those other CPUs instead of the designated CPUs, to let them
+   * progress anyway.  Strict binding means that the thread/process
+   * will _never_ execute on CPUs other than the designated CPUs, even
+   * when those are busy with other tasks and other CPUs are idle.
+   *
+   * \note Depending on the operating system, strict binding may not
+   * be possible (e.g., the OS does not implement it) or not allowed
+   * (e.g., for administrative reasons), and the function will fail
+   * in that case.
+   *
+   * When retrieving the binding of a process, this flag checks
+   * whether all its threads actually have the same binding. If the
+   * flag is not given, the binding of each thread will be
+   * accumulated.
+   *
+   * \note This flag is meaningless when retrieving the binding of a
+   * thread.
+   * \hideinitializer
+   */
+  HWLOC_CPUBIND_STRICT = (1<<2),
+
+  /** \brief Avoid any effect on memory binding
+   *
+   * On some operating systems, some CPU binding functions would also
+   * bind the memory on the corresponding NUMA node.  It is often not
+   * a problem for the application, but if it is, setting this flag
+   * will make hwloc avoid using OS functions that would also bind
+   * memory.  This will however reduce the support of CPU bindings,
+   * i.e. potentially return -1 with errno set to ENOSYS in some
+   * cases.
+   *
+   * This flag is only meaningful when used with functions that set
+   * the CPU binding.  It is ignored when used with functions that get
+   * CPU binding information.
+   * \hideinitializer
+   */
+  HWLOC_CPUBIND_NOMEMBIND = (1<<3)
+} hwloc_cpubind_flags_t;
+
+/** \brief Bind current process or thread on cpus given in physical bitmap \p set.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_cpubind(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags);
+
+/** \brief Get current process or thread binding.
+ *
+ * Writes into \p set the physical cpuset which the process or thread (according to \e
+ * flags) was last bound to.
+ */
+HWLOC_DECLSPEC int hwloc_get_cpubind(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
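+
+/* Editor's note: a sketch (illustration only) binding the current thread to
+ * the first core of topology \c topo, singlifying the set first as
+ * recommended above to avoid migrations.
+ *
+ *   hwloc_obj_t core = hwloc_get_obj_by_type(topo, HWLOC_OBJ_CORE, 0);
+ *   if (core) {
+ *       hwloc_cpuset_t set = hwloc_bitmap_dup(core->cpuset);
+ *       hwloc_bitmap_singlify(set);
+ *       if (hwloc_set_cpubind(topo, set, HWLOC_CPUBIND_THREAD) < 0)
+ *           perror("hwloc_set_cpubind");
+ *       hwloc_bitmap_free(set);
+ *   }
+ */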
+
+/** \brief Bind a process \p pid on cpus given in physical bitmap \p set.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note As a special case on Linux, if a tid (thread ID) is supplied
+ * instead of a pid (process ID) and ::HWLOC_CPUBIND_THREAD is passed in flags,
+ * the binding is applied to that specific thread.
+ *
+ * \note On non-Linux systems, ::HWLOC_CPUBIND_THREAD can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t set, int flags);
+
+/** \brief Get the current physical binding of process \p pid.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note As a special case on Linux, if a tid (thread ID) is supplied
+ * instead of a pid (process ID) and HWLOC_CPUBIND_THREAD is passed in flags,
+ * the binding for that specific thread is returned.
+ *
+ * \note On non-Linux systems, HWLOC_CPUBIND_THREAD can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
+
+#ifdef hwloc_thread_t
+/** \brief Bind a thread \p thread on cpus given in physical bitmap \p set.
+ *
+ * \note \p hwloc_thread_t is \p pthread_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note ::HWLOC_CPUBIND_PROCESS can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_set_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t thread, hwloc_const_cpuset_t set, int flags);
+#endif
+
+#ifdef hwloc_thread_t
+/** \brief Get the current physical binding of thread \p tid.
+ *
+ * \note \p hwloc_thread_t is \p pthread_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note ::HWLOC_CPUBIND_PROCESS can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t thread, hwloc_cpuset_t set, int flags);
+#endif
+
+/** \brief Get the last physical CPU where the current process or thread ran.
+ *
+ * The operating system may move some tasks from one processor
+ * to another at any time according to their binding,
+ * so this function may return something that is already
+ * outdated.
+ *
+ * \p flags can include either ::HWLOC_CPUBIND_PROCESS or ::HWLOC_CPUBIND_THREAD to
+ * specify whether the query should be for the whole process (union of all CPUs
+ * on which all threads are running), or only the current thread. If the
+ * process is single-threaded, flags can be set to zero to let hwloc use
+ * whichever method is available on the underlying OS.
+ */
+HWLOC_DECLSPEC int hwloc_get_last_cpu_location(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+
+/** \brief Get the last physical CPU where a process ran.
+ *
+ * The operating system may move some tasks from one processor
+ * to another at any time according to their binding,
+ * so this function may return something that is already
+ * outdated.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note As a special case on Linux, if a tid (thread ID) is supplied
+ * instead of a pid (process ID) and ::HWLOC_CPUBIND_THREAD is passed in flags,
+ * the last CPU location of that specific thread is returned.
+ *
+ * \note On non-Linux systems, ::HWLOC_CPUBIND_THREAD can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_last_cpu_location(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_membinding Memory binding
+ *
+ * Memory binding can be done three ways:
+ *
+ * - explicit memory allocation thanks to hwloc_alloc_membind() and friends:
+ *   the binding will have effect on the memory allocated by these functions.
+ * - implicit memory binding through binding policy: hwloc_set_membind() and
+ *   friends only define the current policy of the process, which will be
+ *   applied to the subsequent calls to malloc() and friends.
+ * - migration of existing memory ranges, thanks to hwloc_set_area_membind()
+ *   and friends, which move already-allocated data.
+ *
+ * Not all operating systems support all three ways.
+ * hwloc_topology_get_support() may be used to query about the actual memory
+ * binding support in the currently used operating system.
+ *
+ * When the requested binding operation is not available and the
+ * ::HWLOC_MEMBIND_STRICT flag was passed, the function returns -1.
+ * \p errno will be set to \c ENOSYS when the system does not support
+ * the specified action or policy
+ * (e.g., some systems only allow binding memory on a per-thread
+ * basis, whereas other systems only allow binding memory for all
+ * threads in a process).
+ * \p errno will be set to \c EXDEV when the requested set cannot be enforced
+ * (e.g., some systems only allow binding memory to a single NUMA node).
+ *
+ * If ::HWLOC_MEMBIND_STRICT was not passed, the function may fail as well,
+ * or the operating system may use a slightly different operation
+ * (with side-effects, smaller binding set, etc.)
+ * when the requested operation is not exactly supported.
+ *
+ * The most portable form that should be preferred over the others
+ * whenever possible is as follows.
+ * It allocates some memory hopefully bound to the specified set.
+ * To do so, hwloc will possibly have to change the current memory
+ * binding policy in order to actually get the memory bound, if the OS
+ * does not provide any other way to simply allocate bound memory
+ * without changing the policy for all allocations. That is the
+ * difference with hwloc_alloc_membind(), which will never change the
+ * current memory binding policy.
+ *
+ * \code
+ * hwloc_alloc_membind_policy(topology, size, set,
+ *                            HWLOC_MEMBIND_BIND, 0);
+ * \endcode
+ *
+ * Each hwloc memory binding function takes a bitmap argument that
+ * is a CPU set by default, or a NUMA memory node set if the flag
+ * ::HWLOC_MEMBIND_BYNODESET is specified.
+ * See \ref hwlocality_object_sets and \ref hwlocality_bitmap for a
+ * discussion of CPU sets and NUMA memory node sets.
+ * It is also possible to convert between CPU set and node set using
+ * hwloc_cpuset_to_nodeset() or hwloc_cpuset_from_nodeset().
+ *
+ * Memory binding by CPU set cannot work for CPU-less NUMA memory nodes.
+ * Binding by nodeset should therefore be preferred whenever possible.
+ *
+ * \sa Some example codes are available under doc/examples/ in the source tree.
+ *
+ * \note On some operating systems, memory binding affects the CPU
+ * binding; see ::HWLOC_MEMBIND_NOCPUBIND
+ * @{
+ */
+
+/** \brief Memory binding policy.
+ *
+ * These constants can be used to choose the binding policy.  Only one policy can
+ * be used at a time (i.e., the values cannot be OR'ed together).
+ *
+ * Not all systems support all kinds of binding.
+ * hwloc_topology_get_support() may be used to query about the actual memory
+ * binding policy support in the currently used operating system.
+ * See the "Detailed Description" section of \ref hwlocality_membinding
+ * for a description of errors that can occur.
+ */
+typedef enum {
+  /** \brief Reset the memory allocation policy to the system default.
+   * Depending on the operating system, this may correspond to
+   * ::HWLOC_MEMBIND_FIRSTTOUCH (Linux, FreeBSD),
+   * or ::HWLOC_MEMBIND_BIND (AIX, HP-UX, Solaris, Windows).
+   * This policy is never returned by get membind functions.
+   * The nodeset argument is ignored.
+   * \hideinitializer */
+  HWLOC_MEMBIND_DEFAULT =	0,
+
+  /** \brief Allocate each memory page individually on the local NUMA
+   * node of the thread that touches it.
+   *
+   * The given nodeset should usually be hwloc_topology_get_topology_nodeset()
+   * so that the touching thread may run and allocate on any node in the system.
+   *
+   * On AIX, if the nodeset is smaller, pages are allocated locally (if the local
+   * node is in the nodeset) or from a random non-local node (otherwise).
+   * \hideinitializer */
+  HWLOC_MEMBIND_FIRSTTOUCH =	1,
+
+  /** \brief Allocate memory on the specified nodes.
+   * \hideinitializer */
+  HWLOC_MEMBIND_BIND =		2,
+
+  /** \brief Allocate memory on the given nodes in an interleaved
+   * / round-robin manner.  The precise layout of the memory across
+   * multiple NUMA nodes is OS/system specific. Interleaving can be
+   * useful when threads distributed across the specified NUMA nodes
+   * will all be accessing the whole memory range concurrently, since
+   * the interleave will then balance the memory references.
+   * \hideinitializer */
+  HWLOC_MEMBIND_INTERLEAVE =	3,
+
+  /** \brief For each page bound with this policy, the next time
+   * it is touched (and next time only), it is moved from its current
+   * location to the local NUMA node of the thread where the memory
+   * reference occurred (if it needs to be moved at all).
+   * \hideinitializer */
+  HWLOC_MEMBIND_NEXTTOUCH =	4,
+
+  /** \brief Returned by get_membind() functions when multiple
+   * threads or parts of a memory area have differing memory binding
+   * policies.
+   * Also returned when binding is unknown because binding hooks are empty
+   * when the topology is loaded from XML without HWLOC_THISSYSTEM=1, etc.
+   * \hideinitializer */
+  HWLOC_MEMBIND_MIXED = -1
+} hwloc_membind_policy_t;
+
+/** \brief Memory binding flags.
+ *
+ * These flags can be used to refine the binding policy.
+ * All flags can be logically OR'ed together with the exception of
+ * ::HWLOC_MEMBIND_PROCESS and ::HWLOC_MEMBIND_THREAD;
+ * these two flags are mutually exclusive.
+ *
+ * Not all systems support all kinds of binding.
+ * hwloc_topology_get_support() may be used to query about the actual memory
+ * binding support in the currently used operating system.
+ * See the "Detailed Description" section of \ref hwlocality_membinding
+ * for a description of errors that can occur.
+ */
+typedef enum {
+  /** \brief Set policy for all threads of the specified (possibly
+   * multithreaded) process.  This flag is mutually exclusive with
+   * ::HWLOC_MEMBIND_THREAD.
+   * \hideinitializer */
+  HWLOC_MEMBIND_PROCESS =       (1<<0),
+
+ /** \brief Set policy for a specific thread of the current process.
+  * This flag is mutually exclusive with ::HWLOC_MEMBIND_PROCESS.
+  * \hideinitializer */
+  HWLOC_MEMBIND_THREAD =        (1<<1),
+
+ /** Request strict binding from the OS.  The function will fail if
+  * the binding cannot be guaranteed / completely enforced.
+  *
+  * This flag has slightly different meanings depending on which
+  * function it is used with.
+  * \hideinitializer  */
+  HWLOC_MEMBIND_STRICT =        (1<<2),
+
+ /** \brief Migrate existing allocated memory.  If the memory cannot
+  * be migrated and the ::HWLOC_MEMBIND_STRICT flag is passed, an error
+  * will be returned.
+  * \hideinitializer  */
+  HWLOC_MEMBIND_MIGRATE =       (1<<3),
+
+  /** \brief Avoid any effect on CPU binding.
+   *
+   * On some operating systems, some underlying memory binding
+   * functions also bind the application to the corresponding CPU(s).
+   * Using this flag will cause hwloc to avoid using OS functions that
+   * could potentially affect CPU bindings.  Note, however, that using
+   * NOCPUBIND may reduce hwloc's overall memory binding
+   * support. Specifically: some of hwloc's memory binding functions
+   * may fail with errno set to ENOSYS when used with NOCPUBIND.
+   * \hideinitializer
+   */
+  HWLOC_MEMBIND_NOCPUBIND =     (1<<4),
+
+  /** \brief Consider the bitmap argument as a nodeset.
+   *
+   * The bitmap argument is considered a nodeset if this flag is given,
+   * or a cpuset otherwise by default.
+   *
+   * Memory binding by CPU set cannot work for CPU-less NUMA memory nodes.
+   * Binding by nodeset should therefore be preferred whenever possible.
+   * \hideinitializer
+   */
+  HWLOC_MEMBIND_BYNODESET =     (1<<5)
+} hwloc_membind_flags_t;
+
+/** \brief Set the default memory binding policy of the current
+ * process or thread to prefer the NUMA node(s) specified by \p set
+ *
+ * If neither ::HWLOC_MEMBIND_PROCESS nor ::HWLOC_MEMBIND_THREAD is
+ * specified, the current process is assumed to be single-threaded.
+ * This is the most portable form as it permits hwloc to use either
+ * process-based OS functions or thread-based OS functions, depending
+ * on which are available.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_membind(hwloc_topology_t topology, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags);
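+
+/* Editor's note: a sketch (illustration only) binding further allocations of
+ * the current thread to the first NUMA node of \c topo, by nodeset as
+ * recommended above.
+ *
+ *   hwloc_obj_t node = hwloc_get_obj_by_type(topo, HWLOC_OBJ_NUMANODE, 0);
+ *   if (node &&
+ *       hwloc_set_membind(topo, node->nodeset, HWLOC_MEMBIND_BIND,
+ *                         HWLOC_MEMBIND_THREAD | HWLOC_MEMBIND_BYNODESET) < 0)
+ *       perror("hwloc_set_membind");
+ */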
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * current process or thread.
+ *
+ * This function has two output parameters: \p set and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the current memory binding policies and nodesets in
+ * the queried target.
+ *
+ * Passing the ::HWLOC_MEMBIND_PROCESS flag specifies that the query
+ * target is the current policies and nodesets for all the threads in
+ * the current process.  Passing ::HWLOC_MEMBIND_THREAD specifies that
+ * the query target is the current policy and nodeset for only the
+ * thread invoking this function.
+ *
+ * If neither of these flags are passed (which is the most portable
+ * method), the process is assumed to be single threaded.  This allows
+ * hwloc to use either process-based OS functions or thread-based OS
+ * functions, depending on which are available.
+ *
+ * ::HWLOC_MEMBIND_STRICT is only meaningful when ::HWLOC_MEMBIND_PROCESS
+ * is also specified.  In this case, hwloc will check the default
+ * memory policies and nodesets for all threads in the process.  If
+ * they are not identical, -1 is returned and errno is set to EXDEV.
+ * If they are identical, the values are returned in \p set and \p
+ * policy.
+ *
+ * Otherwise, if ::HWLOC_MEMBIND_PROCESS is specified (and
+ * ::HWLOC_MEMBIND_STRICT is \em not specified), the default set
+ * from each thread is logically OR'ed together.
+ * If all threads' default policies are the same, \p policy is set to
+ * that policy.  If they are different, \p policy is set to
+ * ::HWLOC_MEMBIND_MIXED.
+ *
+ * In the ::HWLOC_MEMBIND_THREAD case (or when neither
+ * ::HWLOC_MEMBIND_PROCESS nor ::HWLOC_MEMBIND_THREAD is specified), there
+ * is only one set and policy; they are returned in \p set and
+ * \p policy, respectively.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ */
+HWLOC_DECLSPEC int hwloc_get_membind(hwloc_topology_t topology, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Set the default memory binding policy of the specified
+ * process to prefer the NUMA node(s) specified by \p set
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ */
+HWLOC_DECLSPEC int hwloc_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * specified process.
+ *
+ * This function has two output parameters: \p set and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the current memory binding policies and nodesets in
+ * the queried target.
+ *
+ * Passing the ::HWLOC_MEMBIND_PROCESS flag specifies that the query
+ * target is the current policies and nodesets for all the threads in
+ * the specified process.  If ::HWLOC_MEMBIND_PROCESS is not specified
+ * (which is the most portable method), the process is assumed to be
+ * single threaded.  This allows hwloc to use either process-based OS
+ * functions or thread-based OS functions, depending on which are
+ * available.
+ *
+ * Note that it does not make sense to pass ::HWLOC_MEMBIND_THREAD to
+ * this function.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is specified, hwloc will check the default
+ * memory policies and nodesets for all threads in the specified
+ * process.  If they are not identical, -1 is returned and errno is
+ * set to EXDEV.  If they are identical, the values are returned in \p
+ * set and \p policy.
+ *
+ * Otherwise, \p set is set to the logical OR of all threads'
+ * default set.  If all threads' default policies
+ * are the same, \p policy is set to that policy.  If they are
+ * different, \p policy is set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Bind the already-allocated memory identified by (addr, len)
+ * to the NUMA node(s) specified by \p set.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * \return 0 if \p len is 0.
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Query the CPUs near the physical NUMA node(s) and binding policy of
+ * the memory identified by (\p addr, \p len ).
+ *
+ * This function has two output parameters: \p set and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the memory binding policies and nodesets of the pages
+ * in the address range.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is specified, the target pages are first
+ * checked to see if they all have the same memory binding policy and
+ * nodeset.  If they do not, -1 is returned and errno is set to EXDEV.
+ * If they are identical across all pages, the set and policy are
+ * returned in \p set and \p policy, respectively.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is not specified, the union of all NUMA
+ * node(s) containing pages in the address range is calculated.
+ * If all pages in the target have the same policy, it is returned in
+ * \p policy.  Otherwise, \p policy is set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ *
+ * If \p len is 0, -1 is returned and errno is set to EINVAL.
+ */
+HWLOC_DECLSPEC int hwloc_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Get the NUMA nodes where memory identified by (\p addr, \p len ) is physically allocated.
+ *
+ * Fills \p set according to the NUMA nodes where the memory area pages
+ * are physically allocated. If no page is actually allocated yet,
+ * \p set may be empty.
+ *
+ * If pages are spread across multiple nodes, it is not specified whether they are spread
+ * equitably, or whether most of them are on a single node, etc.
+ *
+ * The operating system may move memory pages from one NUMA node
+ * to another at any time according to their binding,
+ * so this function may return something that is already
+ * outdated.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified in \p flags, set is
+ * considered a nodeset. Otherwise it's a cpuset.
+ *
+ * If \p len is 0, \p set is emptied.
+ */
+HWLOC_DECLSPEC int hwloc_get_area_memlocation(hwloc_topology_t topology, const void *addr, size_t len, hwloc_bitmap_t set, int flags);
+
+/** \brief Allocate some memory
+ *
+ * This is equivalent to malloc(), except that it tries to allocate
+ * page-aligned memory from the OS.
+ *
+ * \note The allocated memory should be freed with hwloc_free().
+ */
+HWLOC_DECLSPEC void *hwloc_alloc(hwloc_topology_t topology, size_t len);
+
+/** \brief Allocate some memory on NUMA memory nodes specified by \p set
+ *
+ * \return NULL with errno set to ENOSYS if the action is not supported
+ * and ::HWLOC_MEMBIND_STRICT is given
+ * \return NULL with errno set to EXDEV if the binding cannot be enforced
+ * and ::HWLOC_MEMBIND_STRICT is given
+ * \return NULL with errno set to ENOMEM if the memory allocation failed
+ * even before trying to bind.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
+ * \note The allocated memory should be freed with hwloc_free().
+ */
+HWLOC_DECLSPEC void *hwloc_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
+
+/** \brief Allocate some memory on NUMA memory nodes specified by \p set
+ *
+ * This is similar to hwloc_alloc_membind() except that it is allowed to change
+ * the current memory binding policy, thus providing more binding support, at
+ * the expense of changing the current state.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ */
+static __hwloc_inline void *
+hwloc_alloc_membind_policy(hwloc_topology_t topology, size_t len, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
+
+/** \brief Free memory that was previously allocated by hwloc_alloc()
+ * or hwloc_alloc_membind().
+ */
+HWLOC_DECLSPEC int hwloc_free(hwloc_topology_t topology, void *addr, size_t len);
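+
+/* Editor's note: a sketch (illustration only) allocating a buffer bound to
+ * the first NUMA node of \c topo and releasing it with hwloc_free().
+ *
+ *   size_t len = 1 << 20;
+ *   hwloc_obj_t node = hwloc_get_obj_by_type(topo, HWLOC_OBJ_NUMANODE, 0);
+ *   void *buf = node ? hwloc_alloc_membind(topo, len, node->nodeset,
+ *                                          HWLOC_MEMBIND_BIND,
+ *                                          HWLOC_MEMBIND_BYNODESET) : NULL;
+ *   if (buf) {
+ *       // ... use buf ...
+ *       hwloc_free(topo, buf, len);
+ *   }
+ */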
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_setsource Changing the Source of Topology Discovery
+ *
+ * If none of the functions below is called, the default is to detect all the objects
+ * of the machine that the caller is allowed to access.
+ *
+ * This default behavior may also be modified through environment variables
+ * if the application did not modify it already.
+ * Setting HWLOC_XMLFILE in the environment enforces the discovery from an XML
+ * file as if hwloc_topology_set_xml() had been called.
+ * Setting HWLOC_SYNTHETIC enforces a synthetic topology as if
+ * hwloc_topology_set_synthetic() had been called.
+ *
+ * Finally, HWLOC_THISSYSTEM enforces the return value of
+ * hwloc_topology_is_thissystem().
+ *
+ * @{
+ */
+
+/** \brief Change which process the topology is viewed from.
+ *
+ * On some systems, processes may have different views of the machine, for
+ * instance the set of allowed CPUs. By default, hwloc exposes the view from
+ * the current process. Calling hwloc_topology_set_pid() makes hwloc
+ * expose the topology of the machine from the point of view of another
+ * process.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note -1 is returned and errno is set to ENOSYS on platforms that do not
+ * support this feature.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_pid(hwloc_topology_t __hwloc_restrict topology, hwloc_pid_t pid);
+
+/** \brief Enable synthetic topology.
+ *
+ * Gather topology information from the given \p description,
+ * a space-separated string of <type:number> describing
+ * the object type and arity at each level.
+ * All types may be omitted (space-separated string of numbers) so that
+ * hwloc chooses all types according to usual topologies.
+ * See also the \ref synthetic.
+ *
+ * Setting the environment variable HWLOC_SYNTHETIC
+ * may also result in this behavior.
+ *
+ * If \p description was properly parsed and describes a valid topology
+ * configuration, this function returns 0.
+ * Otherwise -1 is returned and errno is set to EINVAL.
+ *
+ * Note that this function does not actually load topology
+ * information; it just tells hwloc where to load it from.  You'll
+ * still need to invoke hwloc_topology_load() to actually load the
+ * topology information.
+ *
+ * \note For convenience, this backend provides empty binding hooks which just
+ * return success.
+ *
+ * \note On success, the synthetic component replaces the previously enabled
+ * component (if any), but the topology is not actually modified until
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_synthetic(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict description);
+
+/** \brief Enable XML-file based topology.
+ *
+ * Gather topology information from the XML file given at \p xmlpath.
+ * Setting the environment variable HWLOC_XMLFILE may also result in this behavior.
+ * This file may have been generated earlier with hwloc_topology_export_xml() in hwloc/export.h,
+ * or with "lstopo file.xml".
+ *
+ * Note that this function does not actually load topology
+ * information; it just tells hwloc where to load it from.  You'll
+ * still need to invoke hwloc_topology_load() to actually load the
+ * topology information.
+ *
+ * \return -1 with errno set to EINVAL on failure to read the XML file.
+ *
+ * \note See also hwloc_topology_set_userdata_import_callback()
+ * for importing application-specific object userdata.
+ *
+ * \note For convenience, this backend provides empty binding hooks which just
+ * return success.  To have hwloc still actually call OS-specific hooks, the
+ * ::HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM flag has to be set to assert that the loaded
+ * file is really the underlying system.
+ *
+ * \note On success, the XML component replaces the previously enabled
+ * component (if any), but the topology is not actually modified until
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_xml(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict xmlpath);
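+
+/* Editor's note: a sketch (illustration only); the XML source must be chosen
+ * between init and load. "topo.xml" is an arbitrary example path.
+ *
+ *   hwloc_topology_t topo;
+ *   hwloc_topology_init(&topo);
+ *   if (hwloc_topology_set_xml(topo, "topo.xml") < 0)
+ *       perror("hwloc_topology_set_xml");
+ *   hwloc_topology_load(topo);
+ */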
+
+/** \brief Enable XML based topology using a memory buffer (instead of
+ * a file, as with hwloc_topology_set_xml()).
+ *
+ * Gather topology information from the XML memory buffer given at \p
+ * buffer and of length \p size.  This buffer may have been filled
+ * earlier with hwloc_topology_export_xmlbuffer() in hwloc/export.h.
+ *
+ * Note that this function does not actually load topology
+ * information; it just tells hwloc where to load it from.  You'll
+ * still need to invoke hwloc_topology_load() to actually load the
+ * topology information.
+ *
+ * \return -1 with errno set to EINVAL on failure to read the XML buffer.
+ *
+ * \note See also hwloc_topology_set_userdata_import_callback()
+ * for importing application-specific object userdata.
+ *
+ * \note For convenience, this backend provides empty binding hooks which just
+ * return success.  To have hwloc still actually call OS-specific hooks, the
+ * ::HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM flag has to be set to assert that the loaded
+ * file is really the underlying system.
+ *
+ * \note On success, the XML component replaces the previously enabled
+ * component (if any), but the topology is not actually modified until
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_xmlbuffer(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict buffer, int size);
+
+/** \brief Flags to be passed to hwloc_topology_set_components()
+ */
+enum hwloc_topology_components_flag_e {
+  /** \brief Blacklist the target component from being used.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_COMPONENTS_FLAG_BLACKLIST = (1UL<<0)
+};
+
+/** \brief Prevent a discovery component from being used for a topology.
+ *
+ * \p name is the name of the discovery component that should not be used
+ * when loading topology \p topology. The name is a string such as "cuda".
+ *
+ * For components with multiple phases, it may also be suffixed with the name
+ * of a phase, for instance "linux:io".
+ *
+ * \p flags should be ::HWLOC_TOPOLOGY_COMPONENTS_FLAG_BLACKLIST.
+ *
+ * This may be used to avoid expensive parts of the discovery process.
+ * For instance, CUDA-specific discovery may be expensive and unneeded
+ * while generic I/O discovery could still be useful.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_components(hwloc_topology_t __hwloc_restrict topology, unsigned long flags, const char * __hwloc_restrict name);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_configuration Topology Detection Configuration and Query
+ *
+ * Several functions can optionally be called between hwloc_topology_init() and
+ * hwloc_topology_load() to configure how the detection should be performed,
+ * e.g. to ignore some objects types, define a synthetic topology, etc.
+ *
+ * @{
+ */
+
+/** \brief Flags to be set onto a topology context before load.
+ *
+ * Flags should be given to hwloc_topology_set_flags().
+ * They may also be returned by hwloc_topology_get_flags().
+ */
+enum hwloc_topology_flags_e {
+ /** \brief Detect the whole system, ignore reservations, include disallowed objects.
+   *
+   * Gather all resources, even if some were disabled by the administrator.
+   * For instance, ignore Linux Cgroup/Cpusets and gather all processors and memory nodes.
+   *
+   * When this flag is not set, PUs and NUMA nodes that are disallowed are not added to the topology.
+   * Parent objects (package, core, cache, etc.) are added only if some of their children are allowed.
+   * All existing PUs and NUMA nodes in the topology are allowed.
+   * hwloc_topology_get_allowed_cpuset() and hwloc_topology_get_allowed_nodeset()
+   * are equal to the root object cpuset and nodeset.
+   *
+   * When this flag is set, the actual sets of allowed PUs and NUMA nodes are given
+   * by hwloc_topology_get_allowed_cpuset() and hwloc_topology_get_allowed_nodeset().
+   * They may be smaller than the root object cpuset and nodeset.
+   *
+   * If the current topology is exported to XML and reimported later, this flag
+   * should be set again in the reimported topology so that disallowed resources
+   * are reimported as well.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED = (1UL<<0),
+
+ /** \brief Assume that the selected backend provides the topology for the
+   * system on which we are running.
+   *
+   * This forces hwloc_topology_is_thissystem() to return 1, i.e. makes hwloc assume that
+   * the selected backend provides the topology for the system on which we are running,
+   * even if it is not the OS-specific backend but the XML backend for instance.
+   * This means making the binding functions actually call the OS-specific
+   * system calls and really do binding, while the XML backend would otherwise
+   * provide empty hooks just returning success.
+   *
+   * Setting the environment variable HWLOC_THISSYSTEM may also result in the
+   * same behavior.
+   *
+   * This can be used for efficiency reasons to first detect the topology once,
+   * save it to an XML file, and quickly reload it later through the XML
+   * backend, but still having binding functions actually do bind.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM = (1UL<<1),
+
+ /** \brief Get the set of allowed resources from the local operating system even if the topology was loaded from XML or synthetic description.
+   *
+   * If the topology was loaded from XML or from a synthetic string,
+   * restrict it by applying the current process restrictions such as
+   * Linux Cgroup/Cpuset.
+   *
+   * This is useful when the topology is not loaded directly from
+   * the local machine (e.g. for performance reasons) and it comes
+   * with all resources, while the running process is restricted
+   * to only parts of the machine.
+   *
+   * This flag is ignored unless ::HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM is
+   * also set since the loaded topology must match the underlying machine
+   * where restrictions will be gathered from.
+   *
+   * Setting the environment variable HWLOC_THISSYSTEM_ALLOWED_RESOURCES
+   * would result in the same behavior.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_THISSYSTEM_ALLOWED_RESOURCES = (1UL<<2),
+
+  /** \brief Import support from the imported topology.
+   *
+   * When importing an XML topology from a remote machine, binding is
+   * disabled by default (see ::HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM).
+   * This disabling is also marked by putting zeroes in the corresponding
+   * supported feature bits reported by hwloc_topology_get_support().
+   *
+   * The flag ::HWLOC_TOPOLOGY_FLAG_IMPORT_SUPPORT actually imports
+   * support bits from the remote machine. It also sets the flag
+   * \p imported_support in the struct hwloc_topology_misc_support array.
+   * If the imported XML did not contain any support information
+   * (exporter hwloc is too old), this flag is not set.
+   *
+   * Note that these supported features are only relevant for the hwloc
+   * installation that actually exported the XML topology
+   * (it may vary with the operating system, or with how hwloc was compiled).
+   *
+   * Note, however, that setting this flag does not enable binding for the
+   * locally imported hwloc topology; it only reports what the remote
+   * hwloc and machine support.
+   *
+   */
+  HWLOC_TOPOLOGY_FLAG_IMPORT_SUPPORT = (1UL<<3)
+};
+
+/** \brief Set OR'ed flags on a not-yet-loaded topology.
+ *
+ * Set an OR'ed set of ::hwloc_topology_flags_e onto a topology that has not been loaded yet.
+ *
+ * If this function is called multiple times, the last invocation will erase
+ * and replace the set of flags that was previously set.
+ *
+ * The flags set in a topology may be retrieved with hwloc_topology_get_flags().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_flags (hwloc_topology_t topology, unsigned long flags);
+
+/** \brief Get OR'ed flags of a topology.
+ *
+ * Get the OR'ed set of ::hwloc_topology_flags_e of a topology.
+ *
+ * \return the flags previously set with hwloc_topology_set_flags().
+ */
+HWLOC_DECLSPEC unsigned long hwloc_topology_get_flags (hwloc_topology_t topology);
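+
+/* Editor's note: a sketch (illustration only); flags must likewise be set
+ * between init and load, OR'ed together in a single call.
+ *
+ *   hwloc_topology_init(&topo);
+ *   hwloc_topology_set_flags(topo, HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED
+ *                                  | HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM);
+ *   hwloc_topology_load(topo);
+ */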
+
+/** \brief Does the topology context come from this system?
+ *
+ * \return 1 if this topology context was built using the system
+ * running this program.
+ * \return 0 instead (for instance if using another file-system root,
+ * an XML topology file, or a synthetic topology).
+ */
+HWLOC_DECLSPEC int hwloc_topology_is_thissystem(hwloc_topology_t  __hwloc_restrict topology) __hwloc_attribute_pure;
+
+/** \brief Flags describing actual discovery support for this topology. */
+struct hwloc_topology_discovery_support {
+  /** \brief Detecting the number of PU objects is supported. */
+  unsigned char pu;
+  /** \brief Detecting the number of NUMA nodes is supported. */
+  unsigned char numa;
+  /** \brief Detecting the amount of memory in NUMA nodes is supported. */
+  unsigned char numa_memory;
+  /** \brief Detecting and identifying PU objects that are not available to the current process is supported. */
+  unsigned char disallowed_pu;
+  /** \brief Detecting and identifying NUMA nodes that are not available to the current process is supported. */
+  unsigned char disallowed_numa;
+  /** \brief Detecting the efficiency of CPU kinds is supported, see \ref hwlocality_cpukinds. */
+  unsigned char cpukind_efficiency;
+};
+
+/** \brief Flags describing actual PU binding support for this topology.
+ *
+ * A flag may be set even if the feature isn't supported in all cases
+ * (e.g. binding to random sets of non-contiguous objects).
+ */
+struct hwloc_topology_cpubind_support {
+  /** Binding the whole current process is supported.  */
+  unsigned char set_thisproc_cpubind;
+  /** Getting the binding of the whole current process is supported.  */
+  unsigned char get_thisproc_cpubind;
+  /** Binding a whole given process is supported.  */
+  unsigned char set_proc_cpubind;
+  /** Getting the binding of a whole given process is supported.  */
+  unsigned char get_proc_cpubind;
+  /** Binding the current thread only is supported.  */
+  unsigned char set_thisthread_cpubind;
+  /** Getting the binding of the current thread only is supported.  */
+  unsigned char get_thisthread_cpubind;
+  /** Binding a given thread only is supported.  */
+  unsigned char set_thread_cpubind;
+  /** Getting the binding of a given thread only is supported.  */
+  unsigned char get_thread_cpubind;
+  /** Getting the last processors where the whole current process ran is supported */
+  unsigned char get_thisproc_last_cpu_location;
+  /** Getting the last processors where a whole process ran is supported */
+  unsigned char get_proc_last_cpu_location;
+  /** Getting the last processors where the current thread ran is supported */
+  unsigned char get_thisthread_last_cpu_location;
+};
+
+/** \brief Flags describing actual memory binding support for this topology.
+ *
+ * A flag may be set even if the feature isn't supported in all cases
+ * (e.g. binding to random sets of non-contiguous objects).
+ */
+struct hwloc_topology_membind_support {
+  /** Binding the whole current process is supported.  */
+  unsigned char set_thisproc_membind;
+  /** Getting the binding of the whole current process is supported.  */
+  unsigned char get_thisproc_membind;
+  /** Binding a whole given process is supported.  */
+  unsigned char set_proc_membind;
+  /** Getting the binding of a whole given process is supported.  */
+  unsigned char get_proc_membind;
+  /** Binding the current thread only is supported.  */
+  unsigned char set_thisthread_membind;
+  /** Getting the binding of the current thread only is supported.  */
+  unsigned char get_thisthread_membind;
+  /** Binding a given memory area is supported. */
+  unsigned char set_area_membind;
+  /** Getting the binding of a given memory area is supported.  */
+  unsigned char get_area_membind;
+  /** Allocating a bound memory area is supported. */
+  unsigned char alloc_membind;
+  /** First-touch policy is supported. */
+  unsigned char firsttouch_membind;
+  /** Bind policy is supported. */
+  unsigned char bind_membind;
+  /** Interleave policy is supported. */
+  unsigned char interleave_membind;
+  /** Next-touch migration policy is supported. */
+  unsigned char nexttouch_membind;
+  /** Migration flags are supported. */
+  unsigned char migrate_membind;
+  /** Getting the last NUMA nodes where a memory area was allocated is supported */
+  unsigned char get_area_memlocation;
+};
+
+/** \brief Flags describing miscellaneous features.
+ */
+struct hwloc_topology_misc_support {
+  /** Support was imported when importing another topology, see ::HWLOC_TOPOLOGY_FLAG_IMPORT_SUPPORT. */
+  unsigned char imported_support;
+};
+
+/** \brief Set of flags describing actual support for this topology.
+ *
+ * This is retrieved with hwloc_topology_get_support() and will be valid until
+ * the topology object is destroyed.  Note: the values are correct only after
+ * discovery.
+ */
+struct hwloc_topology_support {
+  struct hwloc_topology_discovery_support *discovery;
+  struct hwloc_topology_cpubind_support *cpubind;
+  struct hwloc_topology_membind_support *membind;
+  struct hwloc_topology_misc_support *misc;
+};
+
+/** \brief Retrieve the topology support.
+ *
+ * Each flag indicates whether a feature is supported.
+ * If set to 0, the feature is not supported.
+ * If set to 1, the feature is supported, but the corresponding
+ * call may still fail in some corner cases.
+ *
+ * These features are also listed by hwloc-info \--support.
+ *
+ * The reported features are what the current topology supports
+ * on the current machine. If the topology was exported to XML
+ * from another machine and later imported here, support still
+ * describes what is supported for this imported topology after
+ * import. By default, binding will be reported as unsupported
+ * in this case (see ::HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM).
+ *
+ * Topology flag ::HWLOC_TOPOLOGY_FLAG_IMPORT_SUPPORT may be used
+ * to report the supported features of the original remote machine
+ * instead. If it was successfully imported, \p imported_support
+ * will be set in the struct hwloc_topology_misc_support array.
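+ *
+ * For instance, checking that binding the current thread is supported
+ * before actually binding (a sketch; \p set is assumed to be a cpuset
+ * chosen by the application):
+ * \code
+ * const struct hwloc_topology_support *support = hwloc_topology_get_support(topology);
+ * if (support->cpubind->set_thisthread_cpubind)
+ *   hwloc_set_cpubind(topology, set, HWLOC_CPUBIND_THREAD);
+ * \endcode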
+ */
+HWLOC_DECLSPEC const struct hwloc_topology_support *hwloc_topology_get_support(hwloc_topology_t __hwloc_restrict topology);
+
+/** \brief Type filtering flags.
+ *
+ * By default, most objects are kept (::HWLOC_TYPE_FILTER_KEEP_ALL).
+ * Instruction caches, I/O and Misc objects are ignored by default (::HWLOC_TYPE_FILTER_KEEP_NONE).
+ * Die and Group levels are ignored unless they bring structure (::HWLOC_TYPE_FILTER_KEEP_STRUCTURE).
+ *
+ * Note that group objects are also ignored individually (without the entire level)
+ * when they do not bring structure.
+ */
+enum hwloc_type_filter_e {
+  /** \brief Keep all objects of this type.
+   *
+   * Cannot be set for ::HWLOC_OBJ_GROUP (groups are designed only to add more structure to the topology).
+   * \hideinitializer
+   */
+  HWLOC_TYPE_FILTER_KEEP_ALL = 0,
+
+  /** \brief Ignore all objects of this type.
+   *
+   * The bottom-level type ::HWLOC_OBJ_PU, the ::HWLOC_OBJ_NUMANODE type, and
+   * the top-level type ::HWLOC_OBJ_MACHINE may not be ignored.
+   * \hideinitializer
+   */
+  HWLOC_TYPE_FILTER_KEEP_NONE = 1,
+
+  /** \brief Only ignore objects if their entire level does not bring any structure.
+   *
+   * Keep the entire level of objects if at least one of these objects adds
+   * structure to the topology. An object brings structure when it has multiple
+   * children and it is not the only child of its parent.
+   *
+   * If all objects in the level are the only child of their parent, and if none
+   * of them has multiple children, the entire level is removed.
+   *
+   * Cannot be set for I/O and Misc objects since the topology structure does not matter there.
+   * \hideinitializer
+   */
+  HWLOC_TYPE_FILTER_KEEP_STRUCTURE = 2,
+
+  /** \brief Only keep likely-important objects of the given type.
+   *
+   * It is only useful for I/O object types.
+   * For ::HWLOC_OBJ_PCI_DEVICE and ::HWLOC_OBJ_OS_DEVICE, it means that only objects
+   * of major/common kinds are kept (storage, network, OpenFabrics, CUDA,
+   * OpenCL, RSMI, NVML, and displays).
+   * Also, only OS devices directly attached on PCI (e.g. no USB) are reported.
+   * For ::HWLOC_OBJ_BRIDGE, it means that bridges are kept only if they have children.
+   *
+   * This flag is equivalent to ::HWLOC_TYPE_FILTER_KEEP_ALL for Normal, Memory and Misc types
+   * since they are likely important.
+   * \hideinitializer
+   */
+  HWLOC_TYPE_FILTER_KEEP_IMPORTANT = 3
+};
+
+/** \brief Set the filtering for the given object type.
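+ *
+ * For instance, keeping the I/O objects that are ignored by default
+ * (a sketch; filters are configured between hwloc_topology_init() and
+ * hwloc_topology_load(), and the chosen values are only one possible policy):
+ * \code
+ * hwloc_topology_set_type_filter(topology, HWLOC_OBJ_PCI_DEVICE, HWLOC_TYPE_FILTER_KEEP_IMPORTANT);
+ * hwloc_topology_set_type_filter(topology, HWLOC_OBJ_OS_DEVICE, HWLOC_TYPE_FILTER_KEEP_IMPORTANT);
+ * \endcode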
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_type_filter(hwloc_topology_t topology, hwloc_obj_type_t type, enum hwloc_type_filter_e filter);
+
+/** \brief Get the current filtering for the given object type.
+ */
+HWLOC_DECLSPEC int hwloc_topology_get_type_filter(hwloc_topology_t topology, hwloc_obj_type_t type, enum hwloc_type_filter_e *filter);
+
+/** \brief Set the filtering for all object types.
+ *
+ * If some types do not support this filtering, they are silently ignored.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_all_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter);
+
+/** \brief Set the filtering for all CPU cache object types.
+ *
+ * Memory-side caches are not involved since they are not CPU caches.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_cache_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter);
+
+/** \brief Set the filtering for all CPU instruction cache object types.
+ *
+ * Memory-side caches are not involved since they are not CPU caches.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_icache_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter);
+
+/** \brief Set the filtering for all I/O object types.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_io_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter);
+
+/** \brief Set the topology-specific userdata pointer.
+ *
+ * Each topology may store one application-given private data pointer.
+ * It is initialized to \c NULL.
+ * hwloc will never modify it.
+ *
+ * Use it as you wish, after hwloc_topology_init() and until hwloc_topology_destroy().
+ *
+ * This pointer is not exported to XML.
+ */
+HWLOC_DECLSPEC void hwloc_topology_set_userdata(hwloc_topology_t topology, const void *userdata);
+
+/** \brief Retrieve the topology-specific userdata pointer.
+ *
+ * Retrieve the application-given private data pointer that was
+ * previously set with hwloc_topology_set_userdata().
+ */
+HWLOC_DECLSPEC void * hwloc_topology_get_userdata(hwloc_topology_t topology);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_tinker Modifying a loaded Topology
+ * @{
+ */
+
+/** \brief Flags to be given to hwloc_topology_restrict(). */
+enum hwloc_restrict_flags_e {
+  /** \brief Remove all objects that became CPU-less.
+   * By default, only objects that contain no PU and no memory are removed.
+   * This flag may not be used with ::HWLOC_RESTRICT_FLAG_BYNODESET.
+   * \hideinitializer
+   */
+  HWLOC_RESTRICT_FLAG_REMOVE_CPULESS = (1UL<<0),
+
+  /** \brief Restrict by nodeset instead of CPU set.
+   * Only keep objects whose nodeset is included or partially included in the given set.
+   * This flag may not be used with ::HWLOC_RESTRICT_FLAG_REMOVE_CPULESS.
+   * \hideinitializer
+   */
+  HWLOC_RESTRICT_FLAG_BYNODESET = (1UL<<3),
+
+  /** \brief Remove all objects that became Memory-less.
+   * By default, only objects that contain no PU and no memory are removed.
+   * This flag may only be used with ::HWLOC_RESTRICT_FLAG_BYNODESET.
+   * \hideinitializer
+   */
+  HWLOC_RESTRICT_FLAG_REMOVE_MEMLESS = (1UL<<4),
+
+  /** \brief Move Misc objects to ancestors if their parents are removed during restriction.
+   * If this flag is not set, Misc objects are removed when their parents are removed.
+   * \hideinitializer
+   */
+  HWLOC_RESTRICT_FLAG_ADAPT_MISC = (1UL<<1),
+
+  /** \brief Move I/O objects to ancestors if their parents are removed during restriction.
+   * If this flag is not set, I/O devices and bridges are removed when their parents are removed.
+   * \hideinitializer
+   */
+  HWLOC_RESTRICT_FLAG_ADAPT_IO = (1UL<<2)
+};
+
+/** \brief Restrict the topology to the given CPU set or nodeset.
+ *
+ * Topology \p topology is modified so as to remove all objects that
+ * are not included (or partially included) in the CPU set \p set.
+ * All objects' CPU and node sets are restricted accordingly.
+ *
+ * If ::HWLOC_RESTRICT_FLAG_BYNODESET is passed in \p flags,
+ * \p set is considered a nodeset instead of a CPU set.
+ *
+ * \p flags is an OR'ed set of ::hwloc_restrict_flags_e.
+ *
+ * \note This call may not be reverted by restricting back to a larger
+ * set. Once dropped during restriction, objects may not be brought
+ * back, except by loading another topology with hwloc_topology_load().
+ *
+ * \return 0 on success.
+ *
+ * \return -1 with errno set to EINVAL if the input set is invalid.
+ * The topology is not modified in this case.
+ *
+ * \return -1 with errno set to ENOMEM on failure to allocate internal data.
+ * The topology is reinitialized in this case. It should be either
+ * destroyed with hwloc_topology_destroy() or configured and loaded again.
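+ *
+ * For instance, restricting the topology to the current process binding
+ * (a minimal sketch; error checks omitted):
+ * \code
+ * hwloc_bitmap_t set = hwloc_bitmap_alloc();
+ * hwloc_get_cpubind(topology, set, HWLOC_CPUBIND_PROCESS);
+ * hwloc_topology_restrict(topology, set, 0);
+ * hwloc_bitmap_free(set);
+ * \endcode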
+ */
+HWLOC_DECLSPEC int hwloc_topology_restrict(hwloc_topology_t __hwloc_restrict topology, hwloc_const_bitmap_t set, unsigned long flags);
+
+/** \brief Flags to be given to hwloc_topology_allow(). */
+enum hwloc_allow_flags_e {
+  /** \brief Mark all objects as allowed in the topology.
+   *
+   * \p cpuset and \p nodeset given to hwloc_topology_allow() must be \c NULL.
+   * \hideinitializer */
+  HWLOC_ALLOW_FLAG_ALL = (1UL<<0),
+
+  /** \brief Only allow objects that are available to the current process.
+   *
+   * The topology must have ::HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM so that the set
+   * of available resources can actually be retrieved from the operating system.
+   *
+   * \p cpuset and \p nodeset given to hwloc_topology_allow() must be \c NULL.
+   * \hideinitializer */
+  HWLOC_ALLOW_FLAG_LOCAL_RESTRICTIONS = (1UL<<1),
+
+  /** \brief Allow a custom set of objects, given to hwloc_topology_allow() as \p cpuset and/or \p nodeset parameters.
+   * \hideinitializer */
+  HWLOC_ALLOW_FLAG_CUSTOM = (1UL<<2)
+};
+
+/** \brief Change the sets of allowed PUs and NUMA nodes in the topology.
+ *
+ * This function only works if the ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED
+ * flag was set on the topology. It does not modify any object, it only changes
+ * the sets returned by hwloc_topology_get_allowed_cpuset() and
+ * hwloc_topology_get_allowed_nodeset().
+ *
+ * It is notably useful when importing a topology from another process
+ * running in a different Linux Cgroup.
+ *
+ * \p flags must be set to one flag among ::hwloc_allow_flags_e.
+ *
+ * \note Removing objects from a topology should rather be performed with
+ * hwloc_topology_restrict().
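+ *
+ * For instance, marking all objects as allowed again (a sketch):
+ * \code
+ * hwloc_topology_allow(topology, NULL, NULL, HWLOC_ALLOW_FLAG_ALL);
+ * \endcode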
+ */
+HWLOC_DECLSPEC int hwloc_topology_allow(hwloc_topology_t __hwloc_restrict topology, hwloc_const_cpuset_t cpuset, hwloc_const_nodeset_t nodeset, unsigned long flags);
+
+/** \brief Add a MISC object as a leaf of the topology
+ *
+ * A new MISC object will be created and inserted into the topology at the
+ * position given by parent. It is appended to the list of existing Misc children,
+ * without ever adding any intermediate hierarchy level. This is useful for
+ * annotating the topology without actually changing the hierarchy.
+ *
+ * \p name is supposed to be unique across all Misc objects in the topology.
+ * It will be duplicated to set up the new object attributes.
+ *
+ * The new leaf object will not have any \p cpuset.
+ *
+ * \return the newly-created object
+ *
+ * \return \c NULL on error.
+ *
+ * \return \c NULL if Misc objects are filtered-out of the topology (::HWLOC_TYPE_FILTER_KEEP_NONE).
+ *
+ * \note If \p name contains some non-printable characters, they will
+ * be dropped when exporting to XML, see hwloc_topology_export_xml() in hwloc/export.h.
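+ *
+ * For instance (a sketch; it assumes Misc objects were not filtered out
+ * before loading, and "MyAnnotation" is an arbitrary name):
+ * \code
+ * hwloc_obj_t root = hwloc_get_root_obj(topology);
+ * hwloc_topology_insert_misc_object(topology, root, "MyAnnotation");
+ * \endcode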
+ */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_insert_misc_object(hwloc_topology_t topology, hwloc_obj_t parent, const char *name);
+
+/** \brief Allocate a Group object to insert later with hwloc_topology_insert_group_object().
+ *
+ * This function returns a new Group object.
+ *
+ * The caller should (at least) initialize its sets before inserting
+ * the object in the topology. See hwloc_topology_insert_group_object().
+ */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_alloc_group_object(hwloc_topology_t topology);
+
+/** \brief Add more structure to the topology by adding an intermediate Group
+ *
+ * The caller should first allocate a new Group object with hwloc_topology_alloc_group_object().
+ * Then it must set up at least one of its CPU or node sets to specify
+ * the final location of the Group in the topology.
+ * Then the object can be passed to this function for actual insertion in the topology.
+ *
+ * Either the cpuset or nodeset field (or both, if compatible) must be set
+ * to a non-empty bitmap. The complete_cpuset or complete_nodeset may be set
+ * instead if inserting with respect to the complete topology
+ * (including disallowed, offline or unknown objects).
+ * If grouping several objects, hwloc_obj_add_other_obj_sets() is an easy way
+ * to build the Group sets iteratively.
+ * These sets cannot be larger than the current topology, or they would get
+ * restricted silently.
+ * The hwloc core will set up the other sets after actual insertion.
+ *
+ * The \p subtype object attribute may be defined (to a dynamically
+ * allocated string) to display something other than "Group" as the
+ * type name for this object in lstopo.
+ * Custom name/value info pairs may be added with hwloc_obj_add_info() after
+ * insertion.
+ *
+ * The group \p dont_merge attribute may be set to \c 1 to prevent
+ * the hwloc core from ever merging this object with another
+ * hierarchically-identical object.
+ * This is useful when the Group itself describes an important feature
+ * that cannot be exposed anywhere else in the hierarchy.
+ *
+ * The group \p kind attribute may be set to a high value such
+ * as \c 0xffffffff to tell hwloc that this new Group should always
+ * be discarded in favor of any existing Group with the same locality.
+ *
+ * \return The inserted object if it was properly inserted.
+ *
+ * \return An existing object if the Group was merged or discarded
+ * because the topology already contained an object at the same
+ * location (the Group did not add any hierarchy information).
+ *
+ * \return \c NULL if the insertion failed because of conflicting sets in the topology tree.
+ *
+ * \return \c NULL if Group objects are filtered-out of the topology (::HWLOC_TYPE_FILTER_KEEP_NONE).
+ *
+ * \return \c NULL if the object was discarded because no set was
+ * initialized in the Group before insertion, or all of them were empty.
+ */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_insert_group_object(hwloc_topology_t topology, hwloc_obj_t group);
+
+/** \brief Set up object cpusets/nodesets by OR'ing another object's sets.
+ *
+ * For each defined cpuset or nodeset in \p src, allocate the corresponding set
+ * in \p dst and add \p src to it by OR'ing sets.
+ *
+ * This function is convenient between hwloc_topology_alloc_group_object()
+ * and hwloc_topology_insert_group_object(). It builds the sets of the new Group
+ * that will be inserted as a new intermediate parent of several objects.
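+ *
+ * A minimal grouping sketch (it assumes \p core0 and \p core1 are two Core
+ * objects of the topology; error checks omitted):
+ * \code
+ * hwloc_obj_t group = hwloc_topology_alloc_group_object(topology);
+ * hwloc_obj_add_other_obj_sets(group, core0);
+ * hwloc_obj_add_other_obj_sets(group, core1);
+ * hwloc_topology_insert_group_object(topology, group);
+ * \endcode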
+ */
+HWLOC_DECLSPEC int hwloc_obj_add_other_obj_sets(hwloc_obj_t dst, hwloc_obj_t src);
+
+/** \brief Refresh internal structures after topology modification.
+ *
+ * Modifying the topology (by restricting, adding objects, modifying structures
+ * such as distances or memory attributes, etc.) may cause some internal caches
+ * to become invalid. These caches are automatically refreshed when accessed
+ * but this refreshing is not thread-safe.
+ *
+ * This function is not thread-safe either, but it is a good way to end a
+ * non-thread-safe phase of topology modification. Once this refresh is done,
+ * multiple threads may concurrently consult the topology, objects, distances,
+ * attributes, etc.
+ *
+ * See also \ref threadsafety
+ */
+HWLOC_DECLSPEC int hwloc_topology_refresh(hwloc_topology_t topology);
+
+/** @} */
+
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+/* high-level helpers */
+#include "hwloc/helper.h"
+
+/* inline code of some functions above */
+#include "hwloc/inlines.h"
+
+/* memory attributes */
+#include "hwloc/memattrs.h"
+
+/* kinds of CPU cores */
+#include "hwloc/cpukinds.h"
+
+/* exporting to XML or synthetic */
+#include "hwloc/export.h"
+
+/* distances */
+#include "hwloc/distances.h"
+
+/* topology diffs */
+#include "hwloc/diff.h"
+
+/* deprecated headers */
+#include "hwloc/deprecated.h"
+
+#endif /* HWLOC_H */
diff --git a/deps/hwloc/include/hwloc/autogen/config.h b/deps/hwloc/include/hwloc/autogen/config.h
new file mode 100644
index 000000000..951fec8c2
--- /dev/null
+++ b/deps/hwloc/include/hwloc/autogen/config.h
@@ -0,0 +1,233 @@
+/* include/hwloc/autogen/config.h.  Generated from config.h.in by configure.  */
+/* -*- c -*-
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2020 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* The configuration file */
+
+#ifndef HWLOC_CONFIG_H
+#define HWLOC_CONFIG_H
+
+#define HWLOC_VERSION "2.5.0a1-git"
+#define HWLOC_VERSION_MAJOR 2
+#define HWLOC_VERSION_MINOR 5
+#define HWLOC_VERSION_RELEASE 0
+#define HWLOC_VERSION_GREEK "a1"
+
+#if (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95))
+# define __hwloc_restrict __restrict
+#else
+# if __STDC_VERSION__ >= 199901L
+#  define __hwloc_restrict restrict
+# else
+#  define __hwloc_restrict
+# endif
+#endif
+
+/* Note that if we're compiling C++, then just use the "inline"
+   keyword, since it's part of C++ */
+#if defined(c_plusplus) || defined(__cplusplus)
+#  define __hwloc_inline inline
+#elif defined(_MSC_VER) || defined(__HP_cc)
+#  define __hwloc_inline __inline
+#else
+#  define __hwloc_inline __inline__
+#endif
+
+/*
+ * Note: this is public.  We can not assume anything from the compiler used
+ * by the application and thus the HWLOC_HAVE_* macros below are not
+ * fetched from the autoconf result here. We only automatically use a few
+ * well-known easy cases.
+ */
+
+/* Some handy constants to make the logic below a little more readable */
+#if defined(__cplusplus) && \
+    (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+#define GXX_ABOVE_3_4 1
+#else
+#define GXX_ABOVE_3_4 0
+#endif
+
+#if !defined(__cplusplus) && \
+    (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95))
+#define GCC_ABOVE_2_95 1
+#else
+#define GCC_ABOVE_2_95 0
+#endif
+
+#if !defined(__cplusplus) && \
+    (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96))
+#define GCC_ABOVE_2_96 1
+#else
+#define GCC_ABOVE_2_96 0
+#endif
+
+#if !defined(__cplusplus) && \
+    (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3))
+#define GCC_ABOVE_3_3 1
+#else
+#define GCC_ABOVE_3_3 0
+#endif
+
+#if !defined(__cplusplus) &&					\
+    (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+#define GCC_ABOVE_3_4 1
+#else
+#define GCC_ABOVE_3_4 0
+#endif
+
+/* Maybe before gcc 2.95 too */
+#ifdef HWLOC_HAVE_ATTRIBUTE_UNUSED
+#define __HWLOC_HAVE_ATTRIBUTE_UNUSED HWLOC_HAVE_ATTRIBUTE_UNUSED 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_UNUSED (GXX_ABOVE_3_4 || GCC_ABOVE_2_95)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_UNUSED 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_UNUSED
+# define __hwloc_attribute_unused __attribute__((__unused__))
+#else
+# define __hwloc_attribute_unused
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_MALLOC
+#define __HWLOC_HAVE_ATTRIBUTE_MALLOC HWLOC_HAVE_ATTRIBUTE_MALLOC 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_MALLOC (GXX_ABOVE_3_4 || GCC_ABOVE_2_96)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_MALLOC 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_MALLOC
+# define __hwloc_attribute_malloc __attribute__((__malloc__))
+#else
+# define __hwloc_attribute_malloc
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_CONST
+#define __HWLOC_HAVE_ATTRIBUTE_CONST HWLOC_HAVE_ATTRIBUTE_CONST 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_CONST (GXX_ABOVE_3_4 || GCC_ABOVE_2_95)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_CONST 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_CONST
+# define __hwloc_attribute_const __attribute__((__const__))
+#else
+# define __hwloc_attribute_const
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_PURE
+#define __HWLOC_HAVE_ATTRIBUTE_PURE HWLOC_HAVE_ATTRIBUTE_PURE 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_PURE (GXX_ABOVE_3_4 || GCC_ABOVE_2_96)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_PURE 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_PURE
+# define __hwloc_attribute_pure __attribute__((__pure__))
+#else
+# define __hwloc_attribute_pure
+#endif
+
+#ifndef __hwloc_attribute_deprecated /* allow the user to disable these warnings by defining this macro to nothing */
+#ifdef HWLOC_HAVE_ATTRIBUTE_DEPRECATED
+#define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED HWLOC_HAVE_ATTRIBUTE_DEPRECATED 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED (GXX_ABOVE_3_4 || GCC_ABOVE_3_3)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_DEPRECATED
+# define __hwloc_attribute_deprecated __attribute__((__deprecated__))
+#else
+# define __hwloc_attribute_deprecated
+#endif
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+#define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS (GXX_ABOVE_3_4 || GCC_ABOVE_3_3)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+# define __hwloc_attribute_may_alias __attribute__((__may_alias__))
+#else
+# define __hwloc_attribute_may_alias
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT
+#define __HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT (GXX_ABOVE_3_4 || GCC_ABOVE_3_4)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT
+# define __hwloc_attribute_warn_unused_result __attribute__((__warn_unused_result__))
+#else
+# define __hwloc_attribute_warn_unused_result
+#endif
+
+#ifdef HWLOC_C_HAVE_VISIBILITY
+# if HWLOC_C_HAVE_VISIBILITY
+#  define HWLOC_DECLSPEC __attribute__((__visibility__("default")))
+# else
+#  define HWLOC_DECLSPEC
+# endif
+#else
+# define HWLOC_DECLSPEC
+#endif
+
+/* Defined to 1 on Linux */
+#define HWLOC_LINUX_SYS 1
+
+/* Defined to 1 if the CPU_SET macro works */
+#define HWLOC_HAVE_CPU_SET 1
+
+/* Defined to 1 if you have the `windows.h' header. */
+/* #undef HWLOC_HAVE_WINDOWS_H */
+#define hwloc_pid_t pid_t
+#define hwloc_thread_t pthread_t
+
+#ifdef HWLOC_HAVE_WINDOWS_H
+
+#  include <windows.h>
+typedef DWORDLONG hwloc_uint64_t;
+
+#else /* HWLOC_HAVE_WINDOWS_H */
+
+#  ifdef hwloc_thread_t
+#    include <pthread.h>
+#  endif /* hwloc_thread_t */
+
+/* Defined to 1 if you have the <stdint.h> header file. */
+#  define HWLOC_HAVE_STDINT_H 1
+
+#  include <unistd.h>
+#  ifdef HWLOC_HAVE_STDINT_H
+#    include <stdint.h>
+#  endif
+typedef uint64_t hwloc_uint64_t;
+
+#endif /* HWLOC_HAVE_WINDOWS_H */
+
+/* Define to 1 if --enable-32bits-pci-domain is called. */
+/* #undef HWLOC_HAVE_32BITS_PCI_DOMAIN */
+
+/* Whether we need to re-define all the hwloc public symbols or not */
+#define HWLOC_SYM_TRANSFORM 0
+
+/* The hwloc symbol prefix */
+#define HWLOC_SYM_PREFIX hwloc_
+
+/* The hwloc symbol prefix in all caps */
+#define HWLOC_SYM_PREFIX_CAPS HWLOC_
+
+#endif /* HWLOC_CONFIG_H */
diff --git a/deps/hwloc/include/hwloc/bitmap.h b/deps/hwloc/include/hwloc/bitmap.h
new file mode 100644
index 000000000..8d9bb9c88
--- /dev/null
+++ b/deps/hwloc/include/hwloc/bitmap.h
@@ -0,0 +1,494 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2020 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief The bitmap API, for use in hwloc itself.
+ */
+
+#ifndef HWLOC_BITMAP_H
+#define HWLOC_BITMAP_H
+
+#include "hwloc/autogen/config.h"
+
+#include <assert.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_bitmap The bitmap API
+ *
+ * The ::hwloc_bitmap_t type represents a set of integers (positive or zero).
+ * A bitmap may be of infinite size (all bits are set after some point).
+ * A bitmap may even be full if all bits are set.
+ *
+ * Bitmaps are used by hwloc to represent sets of OS processors
+ * (which may actually be hardware threads), as ::hwloc_cpuset_t
+ * (a typedef for ::hwloc_bitmap_t), or sets of NUMA memory nodes,
+ * as ::hwloc_nodeset_t (also a typedef for ::hwloc_bitmap_t).
+ * Those are used for cpuset and nodeset fields in the ::hwloc_obj structure,
+ * see \ref hwlocality_object_sets.
+ *
+ * <em>Both CPU and node sets are always indexed by OS physical number.</em>
+ * However users should usually not build CPU and node sets manually
+ * (e.g. with hwloc_bitmap_set()).
+ * One should rather use existing object sets and combine them with
+ * hwloc_bitmap_or(), etc.
+ * For instance, binding the current thread on a pair of cores may be performed with:
+ * \code
+ * hwloc_obj_t core1 = ... , core2 = ... ;
+ * hwloc_bitmap_t set = hwloc_bitmap_alloc();
+ * hwloc_bitmap_or(set, core1->cpuset, core2->cpuset);
+ * hwloc_set_cpubind(topology, set, HWLOC_CPUBIND_THREAD);
+ * hwloc_bitmap_free(set);
+ * \endcode
+ *
+ * \note Most functions below return an int that may be negative in case of
+ * error. The usual error case would be an internal failure to realloc/extend
+ * the storage of the bitmap (\p errno would be set to \c ENOMEM).
+ *
+ * \note Several examples of using the bitmap API are available under the
+ * doc/examples/ directory in the source tree.
+ * Regression tests such as tests/hwloc/hwloc_bitmap*.c also make intensive use
+ * of this API.
+ * @{
+ */
+
+
+/** \brief
+ * Set of bits represented as an opaque pointer to an internal bitmap.
+ */
+typedef struct hwloc_bitmap_s * hwloc_bitmap_t;
+/** \brief a non-modifiable ::hwloc_bitmap_t */
+typedef const struct hwloc_bitmap_s * hwloc_const_bitmap_t;
+
+
+/*
+ * Bitmap allocation, freeing and copying.
+ */
+
+/** \brief Allocate a new empty bitmap.
+ *
+ * \returns A valid bitmap or \c NULL.
+ *
+ * The bitmap should be freed by a corresponding call to
+ * hwloc_bitmap_free().
+ */
+HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_alloc(void) __hwloc_attribute_malloc;
+
+/** \brief Allocate a new full bitmap. */
+HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_alloc_full(void) __hwloc_attribute_malloc;
+
+/** \brief Free bitmap \p bitmap.
+ *
+ * If \p bitmap is \c NULL, no operation is performed.
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_free(hwloc_bitmap_t bitmap);
+
+/** \brief Duplicate bitmap \p bitmap by allocating a new bitmap and copying \p bitmap contents.
+ *
+ * If \p bitmap is \c NULL, \c NULL is returned.
+ */
+HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_dup(hwloc_const_bitmap_t bitmap) __hwloc_attribute_malloc;
+
+/** \brief Copy the contents of bitmap \p src into the already allocated bitmap \p dst */
+HWLOC_DECLSPEC int hwloc_bitmap_copy(hwloc_bitmap_t dst, hwloc_const_bitmap_t src);
+
+
+/*
+ * Bitmap/String Conversion
+ */
+
+/** \brief Stringify a bitmap.
+ *
+ * Up to \p buflen characters may be written in buffer \p buf.
+ *
+ * If \p buflen is 0, \p buf may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
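+ *
+ * For instance (a sketch; the buffer size and \p obj are arbitrary):
+ * \code
+ * char buf[128];
+ * hwloc_bitmap_snprintf(buf, sizeof(buf), obj->cpuset);
+ * \endcode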
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
+
+/** \brief Stringify a bitmap into a newly allocated string.
+ *
+ * \return -1 on error.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
+
+/** \brief Parse a bitmap string and store it in bitmap \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string);
+
+/** \brief Stringify a bitmap in the list format.
+ *
+ * Lists are comma-separated indexes or ranges.
+ * Ranges are dash-separated indexes.
+ * The last range may not have an ending index if the bitmap is infinitely set.
+ *
+ * Up to \p buflen characters may be written in buffer \p buf.
+ *
+ * If \p buflen is 0, \p buf may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_list_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
+
+/** \brief Stringify a bitmap into a newly allocated list string.
+ *
+ * \return -1 on error.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_list_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
+
+/** \brief Parse a list string and store it in bitmap \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_list_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string);
+
+/** \brief Stringify a bitmap in the taskset-specific format.
+ *
+ * The taskset command manipulates bitmap strings that contain a single
+ * (possibly very long) hexadecimal number starting with 0x.
+ *
+ * Up to \p buflen characters may be written in buffer \p buf.
+ *
+ * If \p buflen is 0, \p buf may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_taskset_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
+
+/** \brief Stringify a bitmap into a newly allocated taskset-specific string.
+ *
+ * \return -1 on error.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_taskset_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
+
+/** \brief Parse a taskset-specific bitmap string and store it in bitmap \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_taskset_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string);
+
+
+/*
+ * Building bitmaps.
+ */
+
+/** \brief Empty the bitmap \p bitmap */
+HWLOC_DECLSPEC void hwloc_bitmap_zero(hwloc_bitmap_t bitmap);
+
+/** \brief Fill bitmap \p bitmap with all possible indexes (even if those objects don't exist or are otherwise unavailable) */
+HWLOC_DECLSPEC void hwloc_bitmap_fill(hwloc_bitmap_t bitmap);
+
+/** \brief Empty the bitmap \p bitmap and add bit \p id */
+HWLOC_DECLSPEC int hwloc_bitmap_only(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Fill the bitmap \p bitmap and clear the index \p id */
+HWLOC_DECLSPEC int hwloc_bitmap_allbut(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Setup bitmap \p bitmap from unsigned long \p mask */
+HWLOC_DECLSPEC int hwloc_bitmap_from_ulong(hwloc_bitmap_t bitmap, unsigned long mask);
+
+/** \brief Setup bitmap \p bitmap from unsigned long \p mask used as \p i -th subset */
+HWLOC_DECLSPEC int hwloc_bitmap_from_ith_ulong(hwloc_bitmap_t bitmap, unsigned i, unsigned long mask);
+
+/** \brief Setup bitmap \p bitmap from unsigned longs \p masks used as first \p nr subsets */
+HWLOC_DECLSPEC int hwloc_bitmap_from_ulongs(hwloc_bitmap_t bitmap, unsigned nr, const unsigned long *masks);
+
+
+/*
+ * Modifying bitmaps.
+ */
+
+/** \brief Add index \p id in bitmap \p bitmap */
+HWLOC_DECLSPEC int hwloc_bitmap_set(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Add indexes from \p begin to \p end in bitmap \p bitmap.
+ *
+ * If \p end is \c -1, the range is infinite.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_set_range(hwloc_bitmap_t bitmap, unsigned begin, int end);
+
+/** \brief Replace \p i -th subset of bitmap \p bitmap with unsigned long \p mask */
+HWLOC_DECLSPEC int hwloc_bitmap_set_ith_ulong(hwloc_bitmap_t bitmap, unsigned i, unsigned long mask);
+
+/** \brief Remove index \p id from bitmap \p bitmap */
+HWLOC_DECLSPEC int hwloc_bitmap_clr(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Remove indexes from \p begin to \p end in bitmap \p bitmap.
+ *
+ * If \p end is \c -1, the range is infinite.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_clr_range(hwloc_bitmap_t bitmap, unsigned begin, int end);
+
+/** \brief Keep a single index among those set in bitmap \p bitmap
+ *
+ * May be useful before binding so that the process does not
+ * have a chance of migrating between multiple processors
+ * in the original mask.
+ * Instead of running the task on any PU inside the given CPU set,
+ * the operating system scheduler will be forced to run it on a single
+ * one of these PUs.
+ * It avoids a migration overhead and cache-line ping-pongs between PUs.
+ *
+ * \note This function is NOT meant to distribute multiple processes
+ * within a single CPU set. It always returns the same single bit when
+ * called multiple times on the same input set. hwloc_distrib() may
+ * be used for generating CPU sets to distribute multiple tasks below
+ * a single multi-PU object.
+ *
+ * \note This function cannot be applied to an object set directly. It
+ * should be applied to a copy (which may be obtained with hwloc_bitmap_dup()).
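+ *
+ * Typical use before binding (a sketch; \p obj is an arbitrary object and
+ * error checks are omitted):
+ * \code
+ * hwloc_bitmap_t single = hwloc_bitmap_dup(obj->cpuset);
+ * hwloc_bitmap_singlify(single);
+ * hwloc_set_cpubind(topology, single, HWLOC_CPUBIND_THREAD);
+ * hwloc_bitmap_free(single);
+ * \endcode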
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_singlify(hwloc_bitmap_t bitmap);
+
+
+/*
+ * Consulting bitmaps.
+ */
+
+/** \brief Convert the beginning part of bitmap \p bitmap into an unsigned long mask */
+HWLOC_DECLSPEC unsigned long hwloc_bitmap_to_ulong(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Convert the \p i -th subset of bitmap \p bitmap into an unsigned long mask */
+HWLOC_DECLSPEC unsigned long hwloc_bitmap_to_ith_ulong(hwloc_const_bitmap_t bitmap, unsigned i) __hwloc_attribute_pure;
+
+/** \brief Convert the first \p nr subsets of bitmap \p bitmap into the array of \p nr unsigned long \p masks
+ *
+ * \p nr may be determined earlier with hwloc_bitmap_nr_ulongs().
+ *
+ * \return 0
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_to_ulongs(hwloc_const_bitmap_t bitmap, unsigned nr, unsigned long *masks);
+
+/** \brief Return the number of unsigned longs required for storing bitmap \p bitmap entirely
+ *
+ * This is the number of contiguous unsigned longs from the very first bit of the bitmap
+ * (even if unset) up to the last set bit.
+ * This is useful for knowing the \p nr parameter to pass to hwloc_bitmap_to_ulongs()
+ * (or which calls to hwloc_bitmap_to_ith_ulong() are needed)
+ * to entirely convert a bitmap into multiple unsigned longs.
+ *
+ * When called on the output of hwloc_topology_get_topology_cpuset(),
+ * the returned number is large enough for all cpusets of the topology.
+ *
+ * \return -1 if \p bitmap is infinite.
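+ *
+ * For instance, fully converting a finite bitmap (a sketch; error checks
+ * and the <stdlib.h> inclusion are omitted):
+ * \code
+ * int nr = hwloc_bitmap_nr_ulongs(bitmap);
+ * unsigned long *masks = malloc(nr * sizeof(*masks));
+ * hwloc_bitmap_to_ulongs(bitmap, (unsigned) nr, masks);
+ * \endcode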
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_nr_ulongs(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Test whether index \p id is part of bitmap \p bitmap.
+ *
+ * \return 1 if the bit at index \p id is set in bitmap \p bitmap, 0 otherwise.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_isset(hwloc_const_bitmap_t bitmap, unsigned id) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p bitmap is empty
+ *
+ * \return 1 if bitmap is empty, 0 otherwise.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_iszero(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p bitmap is completely full
+ *
+ * \return 1 if bitmap is full, 0 otherwise.
+ *
+ * \note A full bitmap is always infinitely set.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_isfull(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the first index (least significant bit) in bitmap \p bitmap
+ *
+ * \return -1 if no index is set in \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_first(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the next index in bitmap \p bitmap which is after index \p prev
+ *
+ * If \p prev is -1, the first index is returned.
+ *
+ * \return -1 if no index with higher index is set in \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_next(hwloc_const_bitmap_t bitmap, int prev) __hwloc_attribute_pure;
+
+/** \brief Compute the last index (most significant bit) in bitmap \p bitmap
+ *
+ * \return -1 if no index is set in \p bitmap, or if \p bitmap is infinitely set.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_last(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the "weight" of bitmap \p bitmap (i.e., number of
+ * indexes that are in the bitmap).
+ *
+ * \return the number of indexes that are in the bitmap.
+ *
+ * \return -1 if \p bitmap is infinitely set.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_weight(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the first unset index (least significant bit) in bitmap \p bitmap
+ *
+ * \return -1 if no index is unset in \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_first_unset(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the next unset index in bitmap \p bitmap which is after index \p prev
+ *
+ * If \p prev is -1, the first unset index is returned.
+ *
+ * \return -1 if no index with higher index is unset in \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_next_unset(hwloc_const_bitmap_t bitmap, int prev) __hwloc_attribute_pure;
+
+/** \brief Compute the last unset index (most significant bit) in bitmap \p bitmap
+ *
+ * \return -1 if no index is unset in \p bitmap, or if \p bitmap is infinitely set.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_last_unset(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Loop macro iterating on bitmap \p bitmap
+ *
+ * The loop must start with hwloc_bitmap_foreach_begin() and end
+ * with hwloc_bitmap_foreach_end() followed by a terminating ';'.
+ *
+ * \p index is the loop variable; it should be an unsigned int.  The
+ * first iteration will set \p index to the lowest index in the bitmap.
+ * Successive iterations will iterate through, in order, all remaining
+ * indexes set in the bitmap.  To be specific: each iteration will return a
+ * value for \p index such that hwloc_bitmap_isset(bitmap, index) is true.
+ *
+ * The assert prevents the loop from being infinite if the bitmap is infinitely set.
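+ *
+ * For instance (a sketch; do_work() stands for arbitrary application code):
+ * \code
+ * unsigned id;
+ * hwloc_bitmap_foreach_begin(id, set)
+ *   do_work(id);
+ * hwloc_bitmap_foreach_end();
+ * \endcode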
+ *
+ * \hideinitializer
+ */
+#define hwloc_bitmap_foreach_begin(id, bitmap) \
+do { \
+        assert(hwloc_bitmap_weight(bitmap) != -1); \
+        for (id = hwloc_bitmap_first(bitmap); \
+             (unsigned) id != (unsigned) -1; \
+             id = hwloc_bitmap_next(bitmap, id)) {
+
+/** \brief End of loop macro iterating on a bitmap.
+ *
+ * Needs a terminating ';'.
+ *
+ * \sa hwloc_bitmap_foreach_begin()
+ * \hideinitializer
+ */
+#define hwloc_bitmap_foreach_end()		\
+        } \
+} while (0)
+
+
+/*
+ * Combining bitmaps.
+ */
+
+/** \brief Or bitmaps \p bitmap1 and \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_or (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief And bitmaps \p bitmap1 and \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_and (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief And bitmap \p bitmap1 and the negation of \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_andnot (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief Xor bitmaps \p bitmap1 and \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_xor (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief Negate bitmap \p bitmap and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_not (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap);
+
+
+/*
+ * Comparing bitmaps.
+ */
+
+/** \brief Test whether bitmaps \p bitmap1 and \p bitmap2 intersect.
+ *
+ * \return 1 if bitmaps intersect, 0 otherwise.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_intersects (hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p sub_bitmap is part of bitmap \p super_bitmap.
+ *
+ * \return 1 if \p sub_bitmap is included in \p super_bitmap, 0 otherwise.
+ *
+ * \note The empty bitmap is considered included in any other bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_isincluded (hwloc_const_bitmap_t sub_bitmap, hwloc_const_bitmap_t super_bitmap) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p bitmap1 is equal to bitmap \p bitmap2.
+ *
+ * \return 1 if bitmaps are equal, 0 otherwise.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_isequal (hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** \brief Compare bitmaps \p bitmap1 and \p bitmap2 using their lowest index.
+ *
+ * A bitmap is considered smaller if its least significant bit is smaller.
+ * The empty bitmap is considered higher than anything (because its least significant bit does not exist).
+ *
+ * \return -1 if \p bitmap1 is considered smaller than \p bitmap2.
+ * \return 1 if \p bitmap1 is considered larger than \p bitmap2.
+ *
+ * For instance comparing binary bitmaps 0011 and 0110 returns -1
+ * (hence 0011 is considered smaller than 0110)
+ * because least significant bit of 0011 (0001) is smaller than least significant bit of 0110 (0010).
+ * Comparing 01001 and 00110 would also return -1 for the same reason.
+ *
+ * \return 0 if bitmaps are considered equal, even if they are not strictly equal.
+ * They just need to have the same least significant bit.
+ * For instance, comparing binary bitmaps 0010 and 0110 returns 0 because they have the same least significant bit.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_compare_first(hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** \brief Compare bitmaps \p bitmap1 and \p bitmap2 in lexicographic order.
+ *
+ * Lexicographic comparison of bitmaps, starting from their highest indexes.
+ * Compare last indexes first, then second, etc.
+ * The empty bitmap is considered lower than anything.
+ *
+ * \return -1 if \p bitmap1 is considered smaller than \p bitmap2.
+ * \return 1 if \p bitmap1 is considered larger than \p bitmap2.
+ * \return 0 if bitmaps are equal (contrary to hwloc_bitmap_compare_first()).
+ *
+ * For instance comparing binary bitmaps 0011 and 0110 returns -1
+ * (hence 0011 is considered smaller than 0110).
+ * Comparing 00101 and 01010 returns -1 too.
+ *
+ * \note This is different from the non-existing hwloc_bitmap_compare_last()
+ * which would only compare the highest index of each bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_compare(hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_BITMAP_H */
diff --git a/deps/hwloc/include/hwloc/cpukinds.h b/deps/hwloc/include/hwloc/cpukinds.h
new file mode 100644
index 000000000..f240baf39
--- /dev/null
+++ b/deps/hwloc/include/hwloc/cpukinds.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright © 2020 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Kinds of CPU cores.
+ */
+
+#ifndef HWLOC_CPUKINDS_H
+#define HWLOC_CPUKINDS_H
+
+#include "hwloc.h"
+
+#ifdef __cplusplus
+extern "C" {
+#elif 0
+}
+#endif
+
+/** \defgroup hwlocality_cpukinds Kinds of CPU cores
+ *
+ * Platforms with heterogeneous CPUs may have some cores with
+ * different features or frequencies.
+ * This API exposes identical PUs in sets called CPU kinds.
+ * Each PU of the topology may only be in a single kind.
+ *
+ * The number of kinds may be obtained with hwloc_cpukinds_get_nr().
+ * If the platform is homogeneous, there may be a single kind
+ * with all PUs.
+ * If the platform or operating system does not expose any
+ * information about CPU cores, there may be no kind at all.
+ *
+ * The index of the kind that describes a given CPU set
+ * (if any, and not partially)
+ * may be obtained with hwloc_cpukinds_get_by_cpuset().
+ *
+ * From the index of a kind, it is possible to retrieve information
+ * with hwloc_cpukinds_get_info():
+ * an abstracted efficiency value,
+ * and an array of info attributes
+ * (for instance the "CoreType" and "FrequencyMaxMHz",
+ *  see \ref topoattrs_cpukinds).
+ *
+ * A higher efficiency value means intrinsically greater performance
+ * (and possibly less performance/power efficiency).
+ * Kinds with lower efficiency are ranked first:
+ * passing 0 as \p kind_index to hwloc_cpukinds_get_info() will
+ * return information about the least efficient CPU kind.
+ *
+ * When available, efficiency values are gathered from the operating
+ * system (when \p cpukind_efficiency is set in the
+ * struct hwloc_topology_discovery_support array, only on Windows 10 for now).
+ * Otherwise hwloc tries to compute efficiencies
+ * by comparing CPU kinds using frequencies (on ARM),
+ * or core types and frequencies (on other architectures).
+ * The environment variable HWLOC_CPUKINDS_RANKING may be used
+ * to change this heuristic, see \ref envvar.
+ *
+ * If hwloc fails to rank any kind, for instance because the operating
+ * system does not expose efficiencies and core frequencies,
+ * all kinds will have an unknown efficiency (\c -1),
+ * and they are not indexed/ordered in any specific way.
+ *
+ * @{
+ */
+
+/** \brief Get the number of different kinds of CPU cores in the topology.
+ *
+ * \p flags must be \c 0 for now.
+ *
+ * \return The number of CPU kinds (positive integer) on success.
+ * \return \c 0 if no information about kinds was found.
+ * \return \c -1 with \p errno set to \c EINVAL if \p flags is invalid.
+ */
+HWLOC_DECLSPEC int
+hwloc_cpukinds_get_nr(hwloc_topology_t topology,
+                      unsigned long flags);
+
+/** \brief Get the index of the CPU kind that contains CPUs listed in \p cpuset.
+ *
+ * \p flags must be \c 0 for now.
+ *
+ * \return The index of the CPU kind (positive integer or 0) on success.
+ * \return \c -1 with \p errno set to \c EXDEV if \p cpuset is
+ * only partially included in some kind.
+ * \return \c -1 with \p errno set to \c ENOENT if \p cpuset is
+ * not included in any kind, even partially.
+ * \return \c -1 with \p errno set to \c EINVAL if parameters are invalid.
+ */
+HWLOC_DECLSPEC int
+hwloc_cpukinds_get_by_cpuset(hwloc_topology_t topology,
+                             hwloc_const_bitmap_t cpuset,
+                             unsigned long flags);
+
+/** \brief Get the CPU set and infos about a CPU kind in the topology.
+ *
+ * \p kind_index identifies one kind of CPU between 0 and the number
+ * of kinds returned by hwloc_cpukinds_get_nr() minus 1.
+ *
+ * If not \c NULL, the bitmap \p cpuset will be filled with
+ * the set of PUs of this kind.
+ *
+ * The integer pointed to by \p efficiency, if not \c NULL, will be filled
+ * with the ranking of this kind of CPU in terms of efficiency (see above).
+ * It ranges from \c 0 to the number of kinds
+ * (as reported by hwloc_cpukinds_get_nr()) minus 1.
+ *
+ * Kinds with lower efficiency are reported first.
+ *
+ * If there is a single kind in the topology, its efficiency is \c 0.
+ * If the efficiency of some kinds of cores is unknown,
+ * the efficiency of all kinds is set to \c -1,
+ * and kinds are reported in no specific order.
+ *
+ * The array of info attributes (for instance the "CoreType",
+ * "FrequencyMaxMHz" or "FrequencyBaseMHz", see \ref topoattrs_cpukinds)
+ * and its length are returned in \p infos and \p nr_infos.
+ * The array belongs to the topology, it should not be freed or modified.
+ *
+ * If \p nr_infos or \p infos is \c NULL, no info is returned.
+ *
+ * \p flags must be \c 0 for now.
+ *
+ * \return \c 0 on success.
+ * \return \c -1 with \p errno set to \c ENOENT if \p kind_index does not match any CPU kind.
+ * \return \c -1 with \p errno set to \c EINVAL if parameters are invalid.
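+ *
+ * Enumerating all kinds (a sketch; error checks omitted):
+ * \code
+ * hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();
+ * int i, nr = hwloc_cpukinds_get_nr(topology, 0);
+ * for (i = 0; i < nr; i++) {
+ *   int efficiency;
+ *   hwloc_cpukinds_get_info(topology, (unsigned) i, cpuset, &efficiency, NULL, NULL, 0);
+ *   // cpuset now contains the PUs of kind i
+ * }
+ * hwloc_bitmap_free(cpuset);
+ * \endcode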
+ */
+HWLOC_DECLSPEC int
+hwloc_cpukinds_get_info(hwloc_topology_t topology,
+                        unsigned kind_index,
+                        hwloc_bitmap_t cpuset,
+                        int *efficiency,
+                        unsigned *nr_infos, struct hwloc_info_s **infos,
+                        unsigned long flags);
+
+/** \brief Register a kind of CPU in the topology.
+ *
+ * Mark the PUs listed in \p cpuset as being of the same kind
+ * with respect to the given attributes.
+ *
+ * \p forced_efficiency should be \c -1 if unknown.
+ * Otherwise it is an abstracted efficiency value to enforce
+ * the ranking of all kinds if all of them have valid (and
+ * different) efficiencies.
+ *
+ * The array \p infos of size \p nr_infos may be used to provide
+ * info names and values describing this kind of PUs.
+ *
+ * \p flags must be \c 0 for now.
+ *
+ * Parameters \p cpuset and \p infos will be duplicated internally,
+ * the caller is responsible for freeing them.
+ *
+ * If \p cpuset overlaps with some existing kinds, those might get
+ * modified or split. For instance if existing kind A contains
+ * PUs 0 and 1, and one registers another kind for PU 1 and 2,
+ * there will be 3 resulting kinds:
+ * existing kind A is restricted to only PU 0;
+ * new kind B contains only PU 1 and combines information from A
+ * and from the newly-registered kind;
+ * new kind C contains only PU 2 and only gets information from
+ * the newly-registered kind.
+ *
+ * \note The efficiency \p forced_efficiency provided to this function
+ * may be different from the one reported later by hwloc_cpukinds_get_info()
+ * because hwloc will scale efficiency values down to
+ * between 0 and the number of kinds minus 1.
+ *
+ * \return \c 0 on success.
+ * \return \c -1 with \p errno set to \c EINVAL if some parameters are invalid,
+ * for instance if \p cpuset is \c NULL or empty.
+ */
+HWLOC_DECLSPEC int
+hwloc_cpukinds_register(hwloc_topology_t topology,
+                        hwloc_bitmap_t cpuset,
+                        int forced_efficiency,
+                        unsigned nr_infos, struct hwloc_info_s *infos,
+                        unsigned long flags);
+
+/** @} */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_CPUKINDS_H */
diff --git a/deps/hwloc/include/hwloc/deprecated.h b/deps/hwloc/include/hwloc/deprecated.h
new file mode 100644
index 000000000..f2419dd48
--- /dev/null
+++ b/deps/hwloc/include/hwloc/deprecated.h
@@ -0,0 +1,217 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2021 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2010 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Deprecated API kept for backward compatibility.
+ */
+
+#ifndef HWLOC_DEPRECATED_H
+#define HWLOC_DEPRECATED_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* backward compat with v2.0 before WHOLE_SYSTEM renaming */
+#define HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED
+/* backward compat with v1.11 before System removal */
+#define HWLOC_OBJ_SYSTEM HWLOC_OBJ_MACHINE
+/* backward compat with v1.10 before Socket->Package renaming */
+#define HWLOC_OBJ_SOCKET HWLOC_OBJ_PACKAGE
+/* backward compat with v1.10 before Node->NUMANode clarification */
+#define HWLOC_OBJ_NODE HWLOC_OBJ_NUMANODE
+
+/** \brief Add a distances structure.
+ *
+ * Superseded by hwloc_distances_add_create()+hwloc_distances_add_values()+hwloc_distances_add_commit()
+ * in v2.5.
+ */
+HWLOC_DECLSPEC int hwloc_distances_add(hwloc_topology_t topology,
+				       unsigned nbobjs, hwloc_obj_t *objs, hwloc_uint64_t *values,
+				       unsigned long kind, unsigned long flags) __hwloc_attribute_deprecated;
+
+/** \brief Insert a misc object by parent.
+ *
+ * Identical to hwloc_topology_insert_misc_object().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_topology_insert_misc_object_by_parent(hwloc_topology_t topology, hwloc_obj_t parent, const char *name) __hwloc_attribute_deprecated;
+static __hwloc_inline hwloc_obj_t
+hwloc_topology_insert_misc_object_by_parent(hwloc_topology_t topology, hwloc_obj_t parent, const char *name)
+{
+  return hwloc_topology_insert_misc_object(topology, parent, name);
+}
+
+/** \brief Stringify the cpuset containing a set of objects.
+ *
+ * If \p size is 0, \p string may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+static __hwloc_inline int
+hwloc_obj_cpuset_snprintf(char *str, size_t size, size_t nobj, struct hwloc_obj * const *objs) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_obj_cpuset_snprintf(char *str, size_t size, size_t nobj, struct hwloc_obj * const *objs)
+{
+  hwloc_bitmap_t set = hwloc_bitmap_alloc();
+  int res;
+  unsigned i;
+
+  hwloc_bitmap_zero(set);
+  for(i=0; i<nobj; i++)
+    if (objs[i]->cpuset)
+      hwloc_bitmap_or(set, set, objs[i]->cpuset);
+
+  res = hwloc_bitmap_snprintf(str, size, set);
+  hwloc_bitmap_free(set);
+  return res;
+}
+
+/** \brief Convert a type string into a type and some attributes.
+ *
+ * Deprecated by hwloc_type_sscanf()
+ */
+static __hwloc_inline int
+hwloc_obj_type_sscanf(const char *string, hwloc_obj_type_t *typep, int *depthattrp, void *typeattrp, size_t typeattrsize) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_obj_type_sscanf(const char *string, hwloc_obj_type_t *typep, int *depthattrp, void *typeattrp, size_t typeattrsize)
+{
+  union hwloc_obj_attr_u attr;
+  int err = hwloc_type_sscanf(string, typep, &attr, sizeof(attr));
+  if (err < 0)
+    return err;
+  if (hwloc_obj_type_is_cache(*typep)) {
+    if (depthattrp)
+      *depthattrp = (int) attr.cache.depth;
+    if (typeattrp && typeattrsize >= sizeof(hwloc_obj_cache_type_t))
+      memcpy(typeattrp, &attr.cache.type, sizeof(hwloc_obj_cache_type_t));
+  } else if (*typep == HWLOC_OBJ_GROUP) {
+    if (depthattrp)
+      *depthattrp = (int) attr.group.depth;
+  }
+  return 0;
+}
+
+/** \brief Set the default memory binding policy of the current
+ * process or thread to prefer the NUMA node(s) specified by physical \p nodeset.
+ */
+static __hwloc_inline int
+hwloc_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  return hwloc_set_membind(topology, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * current process or thread.
+ */
+static __hwloc_inline int
+hwloc_get_membind_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_get_membind_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  return hwloc_get_membind(topology, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Set the default memory binding policy of the specified
+ * process to prefer the NUMA node(s) specified by physical \p nodeset.
+ */
+static __hwloc_inline int
+hwloc_set_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_set_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  return hwloc_set_proc_membind(topology, pid, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * specified process.
+ */
+static __hwloc_inline int
+hwloc_get_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_get_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  return hwloc_get_proc_membind(topology, pid, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Bind the already-allocated memory identified by (addr, len)
+ * to the NUMA node(s) in physical \p nodeset.
+ */
+static __hwloc_inline int
+hwloc_set_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_set_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  return hwloc_set_area_membind(topology, addr, len, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Query the physical NUMA node(s) and binding policy of the memory
+ * identified by (\p addr, \p len).
+ */
+static __hwloc_inline int
+hwloc_get_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_get_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  return hwloc_get_area_membind(topology, addr, len, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Allocate some memory on the given physical nodeset \p nodeset.
+ */
+static __hwloc_inline void *
+hwloc_alloc_membind_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc __hwloc_attribute_deprecated;
+static __hwloc_inline void *
+hwloc_alloc_membind_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  return hwloc_alloc_membind(topology, len, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Allocate some memory on the given nodeset \p nodeset.
+ */
+static __hwloc_inline void *
+hwloc_alloc_membind_policy_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc __hwloc_attribute_deprecated;
+static __hwloc_inline void *
+hwloc_alloc_membind_policy_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  return hwloc_alloc_membind_policy(topology, len, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Convert a CPU set into a NUMA node set and handle non-NUMA cases.
+ */
+static __hwloc_inline void
+hwloc_cpuset_to_nodeset_strict(hwloc_topology_t topology, hwloc_const_cpuset_t _cpuset, hwloc_nodeset_t nodeset) __hwloc_attribute_deprecated;
+static __hwloc_inline void
+hwloc_cpuset_to_nodeset_strict(hwloc_topology_t topology, hwloc_const_cpuset_t _cpuset, hwloc_nodeset_t nodeset)
+{
+  hwloc_cpuset_to_nodeset(topology, _cpuset, nodeset);
+}
+
+/** \brief Convert a NUMA node set into a CPU set and handle non-NUMA cases.
+ */
+static __hwloc_inline void
+hwloc_cpuset_from_nodeset_strict(hwloc_topology_t topology, hwloc_cpuset_t _cpuset, hwloc_const_nodeset_t nodeset) __hwloc_attribute_deprecated;
+static __hwloc_inline void
+hwloc_cpuset_from_nodeset_strict(hwloc_topology_t topology, hwloc_cpuset_t _cpuset, hwloc_const_nodeset_t nodeset)
+{
+  hwloc_cpuset_from_nodeset(topology, _cpuset, nodeset);
+}
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_DEPRECATED_H */
diff --git a/deps/hwloc/include/hwloc/diff.h b/deps/hwloc/include/hwloc/diff.h
new file mode 100644
index 000000000..0ad0486be
--- /dev/null
+++ b/deps/hwloc/include/hwloc/diff.h
@@ -0,0 +1,289 @@
+/*
+ * Copyright © 2013-2020 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Topology differences.
+ */
+
+#ifndef HWLOC_DIFF_H
+#define HWLOC_DIFF_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#elif 0
+}
+#endif
+
+
+/** \defgroup hwlocality_diff Topology differences
+ *
+ * Applications that manipulate many similar topologies, for instance
+ * one for each node of a homogeneous cluster, may want to compress
+ * topologies to reduce the memory footprint.
+ *
+ * This file offers a way to manipulate the difference between topologies
+ * and export/import it to/from XML.
+ * Compression may therefore be achieved by storing one topology
+ * entirely while the others are only described by their differences
+ * with the former.
+ * The actual topology can be reconstructed when actually needed by
+ * applying the precomputed difference to the reference topology.
+ *
+ * This interface targets very similar nodes.
+ * Only very simple differences between topologies are actually
+ * supported, for instance a change in the memory size, the name
+ * of the object, or some info attribute.
+ * More complex differences such as adding or removing objects cannot
+ * be represented in the difference structures and therefore return
+ * errors.
+ * Differences between object sets or topology-wide allowed sets
+ * cannot be represented either.
+ *
+ * This means that there is no need to apply the difference when
+ * looking at the tree organization (how many levels, how many
+ * objects per level, what kind of objects, CPU and node sets, etc.)
+ * and when binding to objects.
+ * However the difference must be applied when looking at object
+ * attributes such as the name, the memory size or info attributes.
+ *
+ * @{
+ */
+
+
+/** \brief Type of one object attribute difference.
+ */
+typedef enum hwloc_topology_diff_obj_attr_type_e {
+  /** \brief The object local memory is modified.
+   * The union is a hwloc_topology_diff_obj_attr_u::hwloc_topology_diff_obj_attr_uint64_s
+   * (and the index field is ignored).
+   */
+  HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE,
+
+  /** \brief The object name is modified.
+   * The union is a hwloc_topology_diff_obj_attr_u::hwloc_topology_diff_obj_attr_string_s
+   * (and the name field is ignored).
+   */
+  HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME,
+
+  /** \brief The value of an info attribute is modified.
+   * The union is a hwloc_topology_diff_obj_attr_u::hwloc_topology_diff_obj_attr_string_s.
+   */
+  HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO
+} hwloc_topology_diff_obj_attr_type_t;
+
+/** \brief One object attribute difference.
+ */
+union hwloc_topology_diff_obj_attr_u {
+  struct hwloc_topology_diff_obj_attr_generic_s {
+    /* each part of the union must start with these */
+    hwloc_topology_diff_obj_attr_type_t type;
+  } generic;
+
+  /** \brief Integer attribute modification with an optional index. */
+  struct hwloc_topology_diff_obj_attr_uint64_s {
+    /* used for storing integer attributes */
+    hwloc_topology_diff_obj_attr_type_t type;
+    hwloc_uint64_t index; /* not used for SIZE */
+    hwloc_uint64_t oldvalue;
+    hwloc_uint64_t newvalue;
+  } uint64;
+
+  /** \brief String attribute modification with an optional name */
+  struct hwloc_topology_diff_obj_attr_string_s {
+    /* used for storing name and info pairs */
+    hwloc_topology_diff_obj_attr_type_t type;
+    char *name; /* not used for NAME */
+    char *oldvalue;
+    char *newvalue;
+  } string;
+};
+
+
+/** \brief Type of one element of a difference list.
+ */
+typedef enum hwloc_topology_diff_type_e {
+  /** \brief An object attribute was changed.
+   * The union is a hwloc_topology_diff_u::hwloc_topology_diff_obj_attr_s.
+   */
+  HWLOC_TOPOLOGY_DIFF_OBJ_ATTR,
+
+  /** \brief The difference is too complex,
+   * it cannot be represented. The difference below
+   * this object has not been checked.
+   * hwloc_topology_diff_build() will return 1.
+   *
+   * The union is a hwloc_topology_diff_u::hwloc_topology_diff_too_complex_s.
+   */
+  HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX
+} hwloc_topology_diff_type_t;
+
+/** \brief One element of a difference list between two topologies.
+ */
+typedef union hwloc_topology_diff_u {
+  struct hwloc_topology_diff_generic_s {
+    /* each part of the union must start with these */
+    hwloc_topology_diff_type_t type;
+    union hwloc_topology_diff_u * next; /* pointer to the next element of the list, or NULL */
+  } generic;
+
+  /* A difference in an object attribute. */
+  struct hwloc_topology_diff_obj_attr_s {
+    hwloc_topology_diff_type_t type; /* must be ::HWLOC_TOPOLOGY_DIFF_OBJ_ATTR */
+    union hwloc_topology_diff_u * next;
+    /* List of attribute differences for a single object */
+    int obj_depth;
+    unsigned obj_index;
+    union hwloc_topology_diff_obj_attr_u diff;
+  } obj_attr;
+
+  /* A difference that is too complex. */
+  struct hwloc_topology_diff_too_complex_s {
+    hwloc_topology_diff_type_t type; /* must be ::HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX */
+    union hwloc_topology_diff_u * next;
+    /* Where we had to stop computing the diff in the first topology */
+    int obj_depth;
+    unsigned obj_index;
+  } too_complex;
+} * hwloc_topology_diff_t;
+
+
+/** \brief Compute the difference between 2 topologies.
+ *
+ * The difference is stored as a list of ::hwloc_topology_diff_t entries
+ * starting at \p diff.
+ * It is computed by doing a depth-first traversal of both topology trees
+ * simultaneously.
+ *
+ * If the difference between 2 objects is too complex to be represented
+ * (for instance if some objects have different types, or different numbers
+ * of children), a special diff entry of type ::HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX
+ * is queued.
+ * The computation of the diff does not continue below these objects.
+ * So each such diff entry means that the difference between two subtrees
+ * could not be computed.
+ *
+ * \return 0 if the difference can be represented properly.
+ *
+ * \return 0 with \p diff pointing to NULL if there is no difference
+ * between the topologies.
+ *
+ * \return 1 if the difference is too complex (see above). Some entries in
+ * the list will be of type ::HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX.
+ *
+ * \return -1 on any other error.
+ *
+ * \note \p flags is currently not used. It should be 0.
+ *
+ * \note The output diff has to be freed with hwloc_topology_diff_destroy().
+ *
+ * \note The output diff can only be exported to XML or passed to
+ * hwloc_topology_diff_apply() if 0 was returned, i.e. if no entry of type
+ * ::HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX is listed.
+ *
+ * \note The output diff may be modified by removing some entries from
+ * the list. The removed entries should be freed by passing them
+ * to hwloc_topology_diff_destroy() (possibly as another list).
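+ *
+ * A minimal usage sketch (assuming two already-loaded topologies
+ * topo1 and topo2; error handling elided):
+ * \code
+ * hwloc_topology_diff_t diff;
+ * if (hwloc_topology_diff_build(topo1, topo2, 0, &diff) == 0 && diff) {
+ *   hwloc_topology_diff_export_xml(diff, "topo1", "diff.xml");
+ *   hwloc_topology_diff_destroy(diff);
+ * }
+ * \endcode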
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_build(hwloc_topology_t topology, hwloc_topology_t newtopology, unsigned long flags, hwloc_topology_diff_t *diff);
+
+/** \brief Flags to be given to hwloc_topology_diff_apply().
+ */
+enum hwloc_topology_diff_apply_flags_e {
+  /** \brief Apply topology diff in reverse direction.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE = (1UL<<0)
+};
+
+/** \brief Apply a topology diff to an existing topology.
+ *
+ * \p flags is an OR'ed set of ::hwloc_topology_diff_apply_flags_e.
+ *
+ * The new topology is modified in place. hwloc_topology_dup()
+ * may be used to duplicate it before patching.
+ *
+ * If the difference cannot be applied entirely, all previously applied
+ * elements are unapplied before returning.
+ *
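+ * For instance (a sketch, assuming \p diff was built against a
+ * reference topology reftopo):
+ * \code
+ * hwloc_topology_t newtopo;
+ * hwloc_topology_dup(&newtopo, reftopo);
+ * hwloc_topology_diff_apply(newtopo, diff, 0);
+ * \endcode
+ *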
+ * \return 0 on success.
+ *
+ * \return -N if applying the difference failed while trying
+ * to apply the N-th part of the difference. For instance -1
+ * is returned if the very first difference element could not
+ * be applied.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_apply(hwloc_topology_t topology, hwloc_topology_diff_t diff, unsigned long flags);
+
+/** \brief Destroy a list of topology differences.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_destroy(hwloc_topology_diff_t diff);
+
+/** \brief Load a list of topology differences from a XML file.
+ *
+ * If not \c NULL, \p refname will be filled with the identifier
+ * string of the reference topology for the difference file,
+ * if any was specified in the XML file.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ *
+ * \note The pointer returned in \p refname should later be freed
+ * by the caller.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_load_xml(const char *xmlpath, hwloc_topology_diff_t *diff, char **refname);
+
+/** \brief Export a list of topology differences to a XML file.
+ *
+ * If not \c NULL, \p refname defines an identifier string
+ * for the reference topology which was used as a base when
+ * computing this difference.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ * This attribute is given back when reading the diff from XML.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_export_xml(hwloc_topology_diff_t diff, const char *refname, const char *xmlpath);
+
+/** \brief Load a list of topology differences from a XML buffer.
+ *
+ * If not \c NULL, \p refname will be filled with the identifier
+ * string of the reference topology for the difference file,
+ * if any was specified in the XML file.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ *
+ * \note The pointer returned in \p refname should later be freed
+ * by the caller.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_load_xmlbuffer(const char *xmlbuffer, int buflen, hwloc_topology_diff_t *diff, char **refname);
+
+/** \brief Export a list of topology differences to a XML buffer.
+ *
+ * If not \c NULL, \p refname defines an identifier string
+ * for the reference topology which was used as a base when
+ * computing this difference.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ * This attribute is given back when reading the diff from XML.
+ *
+ * The returned buffer ends with a \0 that is included in the returned
+ * length.
+ *
+ * \note The XML buffer should later be freed with hwloc_free_xmlbuffer().
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_export_xmlbuffer(hwloc_topology_diff_t diff, const char *refname, char **xmlbuffer, int *buflen);
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_DIFF_H */
diff --git a/deps/hwloc/include/hwloc/distances.h b/deps/hwloc/include/hwloc/distances.h
new file mode 100644
index 000000000..c12856cd8
--- /dev/null
+++ b/deps/hwloc/include/hwloc/distances.h
@@ -0,0 +1,447 @@
+/*
+ * Copyright © 2010-2021 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Object distances.
+ */
+
+#ifndef HWLOC_DISTANCES_H
+#define HWLOC_DISTANCES_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#elif 0
+}
+#endif
+
+
+/** \defgroup hwlocality_distances_get Retrieve distances between objects
+ * @{
+ */
+
+/** \brief Matrix of distances between a set of objects.
+ *
+ * This matrix often contains latencies between NUMA nodes
+ * (as reported in the System Locality Distance Information Table (SLIT)
+ * in the ACPI specification), which may or may not be physically accurate.
+ * It corresponds to the latency for accessing the memory of one node
+ * from a core in another node.
+ * The corresponding kind is ::HWLOC_DISTANCES_KIND_FROM_OS | ::HWLOC_DISTANCES_KIND_FROM_USER.
+ * The name of this distances structure is "NUMALatency".
+ * Other distances structures include "XGMIBandwidth" and "NVLinkBandwidth".
+ *
+ * The matrix may also contain bandwidths between arbitrary sets of objects,
+ * possibly provided by the user, as specified in the \p kind attribute.
+ *
+ * Pointers \p objs and \p values should not be replaced, reallocated, freed, etc.
+ * However callers are allowed to modify \p kind as well as the contents
+ * of \p objs and \p values arrays.
+ * For instance, if there is a single NUMA node per Package,
+ * hwloc_get_obj_with_same_locality() may be used to convert between them
+ * and replace NUMA nodes in the \p objs array with the corresponding Packages.
+ */
+struct hwloc_distances_s {
+  unsigned nbobjs;		/**< \brief Number of objects described by the distance matrix. */
+  hwloc_obj_t *objs;		/**< \brief Array of objects described by the distance matrix.
+				 * These objects are not in any particular order,
+				 * see hwloc_distances_obj_index() and hwloc_distances_obj_pair_values()
+				 * for easy ways to find objects in this array and their corresponding values.
+				 */
+  unsigned long kind;		/**< \brief OR'ed set of ::hwloc_distances_kind_e. */
+  hwloc_uint64_t *values;	/**< \brief Matrix of distances between objects, stored as a one-dimension array.
+				 *
+				 * Distance from i-th to j-th object is stored in slot i*nbobjs+j.
+				 * The meaning of the value depends on the \p kind attribute.
+				 */
+};
+
+/** \brief Kinds of distance matrices.
+ *
+ * The \p kind attribute of struct hwloc_distances_s is an OR'ed set
+ * of kinds.
+ *
+ * A kind of format HWLOC_DISTANCES_KIND_FROM_* specifies where the
+ * distance information comes from, if known.
+ *
+ * A kind of format HWLOC_DISTANCES_KIND_MEANS_* specifies whether
+ * values are latencies or bandwidths, if applicable.
+ */
+enum hwloc_distances_kind_e {
+  /** \brief These distances were obtained from the operating system or hardware.
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_KIND_FROM_OS = (1UL<<0),
+  /** \brief These distances were provided by the user.
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_KIND_FROM_USER = (1UL<<1),
+
+  /** \brief Distance values are similar to latencies between objects.
+   * Values are smaller for closer objects, hence minimal on the diagonal
+   * of the matrix (distance between an object and itself).
+   * It could also be the number of network hops between objects, etc.
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_KIND_MEANS_LATENCY = (1UL<<2),
+  /** \brief Distance values are similar to bandwidths between objects.
+   * Values are higher for closer objects, hence maximal on the diagonal
+   * of the matrix (distance between an object and itself).
+   * Such values are currently ignored for distance-based grouping.
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_KIND_MEANS_BANDWIDTH = (1UL<<3),
+
+  /** \brief This distances structure covers objects of different types.
+   * This may apply to the "NVLinkBandwidth" structure in presence
+   * of a NVSwitch or POWER processor NVLink port.
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_KIND_HETEROGENEOUS_TYPES = (1UL<<4)
+};
+
+/** \brief Retrieve distance matrices.
+ *
+ * Retrieve distance matrices from the topology into the \p distances array.
+ *
+ * \p flags is currently unused, should be \c 0.
+ *
+ * \p kind serves as a filter. If \c 0, all distance matrices are returned.
+ * If it contains some HWLOC_DISTANCES_KIND_FROM_*, only distance matrices
+ * whose kind matches one of these are returned.
+ * If it contains some HWLOC_DISTANCES_KIND_MEANS_*, only distance matrices
+ * whose kind matches one of these are returned.
+ *
+ * On input, \p nr points to the number of distance matrices that may be stored
+ * in \p distances.
+ * On output, \p nr points to the number of distance matrices that were actually
+ * found, even if some of them couldn't be stored in \p distances.
+ * Distance matrices that couldn't be stored are ignored, but the function still
+ * returns success (\c 0). The caller may find out by comparing the value pointed
+ * to by \p nr before and after the function call.
+ *
+ * Each distance matrix returned in the \p distances array should be released
+ * by the caller using hwloc_distances_release().
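+ *
+ * A common calling pattern (a sketch; the array size is arbitrary):
+ * \code
+ * struct hwloc_distances_s *dist[8];
+ * unsigned i, nr = 8;
+ * if (hwloc_distances_get(topology, &nr, dist, 0, 0) == 0) {
+ *   for(i=0; i<nr && i<8; i++) {
+ *     // consult dist[i]->objs and dist[i]->values here
+ *     hwloc_distances_release(topology, dist[i]);
+ *   }
+ * }
+ * \endcode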
+ */
+HWLOC_DECLSPEC int
+hwloc_distances_get(hwloc_topology_t topology,
+		    unsigned *nr, struct hwloc_distances_s **distances,
+		    unsigned long kind, unsigned long flags);
+
+/** \brief Retrieve distance matrices for object at a specific depth in the topology.
+ *
+ * Identical to hwloc_distances_get() with the additional \p depth filter.
+ */
+HWLOC_DECLSPEC int
+hwloc_distances_get_by_depth(hwloc_topology_t topology, int depth,
+			     unsigned *nr, struct hwloc_distances_s **distances,
+			     unsigned long kind, unsigned long flags);
+
+/** \brief Retrieve distance matrices for object of a specific type.
+ *
+ * Identical to hwloc_distances_get() with the additional \p type filter.
+ */
+HWLOC_DECLSPEC int
+hwloc_distances_get_by_type(hwloc_topology_t topology, hwloc_obj_type_t type,
+			    unsigned *nr, struct hwloc_distances_s **distances,
+			    unsigned long kind, unsigned long flags);
+
+/** \brief Retrieve a distance matrix with the given name.
+ *
+ * Usually only one distances structure may match a given name.
+ *
+ * The name of the most common structure is "NUMALatency".
+ * Others include "XGMIBandwidth" and "NVLinkBandwidth".
+ */
+HWLOC_DECLSPEC int
+hwloc_distances_get_by_name(hwloc_topology_t topology, const char *name,
+			    unsigned *nr, struct hwloc_distances_s **distances,
+			    unsigned long flags);
+
+/** \brief Get a description of what a distances structure contains.
+ *
+ * For instance "NUMALatency" for hardware-provided NUMA distances (ACPI SLIT),
+ * or NULL if unknown.
+ */
+HWLOC_DECLSPEC const char *
+hwloc_distances_get_name(hwloc_topology_t topology, struct hwloc_distances_s *distances);
+
+/** \brief Release a distance matrix structure previously returned by hwloc_distances_get().
+ *
+ * \note This function is not required if the structure is removed with hwloc_distances_release_remove().
+ */
+HWLOC_DECLSPEC void
+hwloc_distances_release(hwloc_topology_t topology, struct hwloc_distances_s *distances);
+
+/** \brief Transformations of distances structures. */
+enum hwloc_distances_transform_e {
+  /** \brief Remove \c NULL objects from the distances structure.
+   *
+   * Every object that was replaced with \c NULL in the \p objs array
+   * is removed and the \p values array is updated accordingly.
+   *
+   * At least \c 2 objects must remain, otherwise hwloc_distances_transform()
+   * will return \c -1 with \p errno set to \c EINVAL.
+   *
+   * \p kind will be updated with or without ::HWLOC_DISTANCES_KIND_HETEROGENEOUS_TYPES
+   * according to the remaining objects.
+   *
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_TRANSFORM_REMOVE_NULL = 0,
+
+  /** \brief Replace bandwidth values with a number of links.
+   *
+   * Usually all values will be either \c 0 (no link) or \c 1 (one link).
+   * However some matrices could get larger values if some pairs of
+   * peers are connected by different numbers of links.
+   *
+   * Values on the diagonal are set to \c 0.
+   *
+   * This transformation only applies to bandwidth matrices.
+   *
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_TRANSFORM_LINKS = 1
+};
+
+/** \brief Apply a transformation to a distances structure.
+ *
+ * Modify a distances structure that was previously obtained with
+ * hwloc_distances_get() or one of its variants.
+ *
+ * This modifies the local copy of the distances structures but does
+ * not modify the distances information stored inside the topology
+ * (retrieved by another call to hwloc_distances_get() or exported to XML).
+ * To do so, one should add a new distances structure with same
+ * name, kind, objects and values (see \ref hwlocality_distances_add)
+ * and then remove this old one with hwloc_distances_release_remove().
+ *
+ * \p transform must be one of the transformations listed
+ * in ::hwloc_distances_transform_e.
+ *
+ * These transformations may modify the contents of the \p objs or \p values arrays.
+ *
+ * \p transform_attr must be \c NULL for now.
+ *
+ * \p flags must be \c 0 for now.
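+ *
+ * For instance, dropping one object from a structure previously obtained
+ * with hwloc_distances_get() (a sketch; the index is arbitrary):
+ * \code
+ * distances->objs[2] = NULL; // mark the third object for removal
+ * hwloc_distances_transform(topology, distances,
+ *                           HWLOC_DISTANCES_TRANSFORM_REMOVE_NULL,
+ *                           NULL, 0);
+ * \endcode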
+ */
+HWLOC_DECLSPEC int hwloc_distances_transform(hwloc_topology_t topology, struct hwloc_distances_s *distances,
+                                             enum hwloc_distances_transform_e transform,
+                                             void *transform_attr,
+                                             unsigned long flags);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_distances_consult Helpers for consulting distance matrices
+ * @{
+ */
+
+/** \brief Find the index of an object in a distances structure.
+ *
+ * \return -1 if object \p obj is not involved in structure \p distances.
+ */
+static __hwloc_inline int
+hwloc_distances_obj_index(struct hwloc_distances_s *distances, hwloc_obj_t obj)
+{
+  unsigned i;
+  for(i=0; i<distances->nbobjs; i++)
+    if (distances->objs[i] == obj)
+      return (int)i;
+  return -1;
+}
+
+/** \brief Find the values between two objects in a distances structure.
+ *
+ * The distance from \p obj1 to \p obj2 is stored in the value pointed by
+ * \p value1to2 and reciprocally.
+ *
+ * \return -1 if object \p obj1 or \p obj2 is not involved in structure \p distances.
+ */
+static __hwloc_inline int
+hwloc_distances_obj_pair_values(struct hwloc_distances_s *distances,
+				hwloc_obj_t obj1, hwloc_obj_t obj2,
+				hwloc_uint64_t *value1to2, hwloc_uint64_t *value2to1)
+{
+  int i1 = hwloc_distances_obj_index(distances, obj1);
+  int i2 = hwloc_distances_obj_index(distances, obj2);
+  if (i1 < 0 || i2 < 0)
+    return -1;
+  *value1to2 = distances->values[i1 * distances->nbobjs + i2];
+  *value2to1 = distances->values[i2 * distances->nbobjs + i1];
+  return 0;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_distances_add Add distances between objects
+ *
+ * The usual way to add distances is:
+ * \code
+ * hwloc_distances_add_handle_t handle;
+ * int err = -1;
+ * handle = hwloc_distances_add_create(topology, "name", kind, 0);
+ * if (handle) {
+ *   err = hwloc_distances_add_values(topology, handle, nbobjs, objs, values, 0);
+ *   if (!err)
+ *     err = hwloc_distances_add_commit(topology, handle, flags);
+ * }
+ * \endcode
+ * If \p err is \c 0 at the end, then addition was successful.
+ *
+ * @{
+ */
+
+/** \brief Handle to a new distances structure during its addition to the topology. */
+typedef void * hwloc_distances_add_handle_t;
+
+/** \brief Create a new empty distances structure.
+ *
+ * Create an empty distances structure
+ * to be filled with hwloc_distances_add_values()
+ * and then committed with hwloc_distances_add_commit().
+ *
+ * Parameter \p name is optional, it may be \c NULL.
+ * Otherwise, it will be copied internally and may later be freed by the caller.
+ *
+ * \p kind specifies the kind of distance as an OR'ed set of ::hwloc_distances_kind_e.
+ * Kind ::HWLOC_DISTANCES_KIND_HETEROGENEOUS_TYPES will be automatically set
+ * according to objects having different types in hwloc_distances_add_values().
+ *
+ * \p flags must be \c 0 for now.
+ *
+ * \return A hwloc_distances_add_handle_t that should then be passed
+ * to hwloc_distances_add_values() and hwloc_distances_add_commit().
+ *
+ * \return \c NULL on error.
+ */
+HWLOC_DECLSPEC hwloc_distances_add_handle_t
+hwloc_distances_add_create(hwloc_topology_t topology,
+                           const char *name, unsigned long kind,
+                           unsigned long flags);
+
+/** \brief Specify the objects and values in a new empty distances structure.
+ *
+ * Specify the objects and values for a new distances structure
+ * that was returned as a handle by hwloc_distances_add_create().
+ * The structure must then be committed with hwloc_distances_add_commit().
+ *
+ * The number of objects is \p nbobjs and the array of objects is \p objs.
+ * Distance values are stored as a one-dimension array in \p values.
+ * The distance from object i to object j is in slot i*nbobjs+j.
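+ * For instance, with \p nbobjs equal to 2, the distance from \p objs[0]
+ * to \p objs[1] is stored in \p values[1] (slot 0*2+1).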
+ *
+ * \p nbobjs must be at least 2.
+ *
+ * Arrays \p objs and \p values will be copied internally,
+ * they may later be freed by the caller.
+ *
+ * On error, the temporary distances structure and its content are destroyed.
+ *
+ * \p flags must be \c 0 for now.
+ *
+ * \return \c 0 on success.
+ * \return \c -1 on error.
+ */
+HWLOC_DECLSPEC int hwloc_distances_add_values(hwloc_topology_t topology,
+                                              hwloc_distances_add_handle_t handle,
+                                              unsigned nbobjs, hwloc_obj_t *objs,
+                                              hwloc_uint64_t *values,
+                                              unsigned long flags);
+
+/** \brief Flags for adding a new distances to a topology. */
+enum hwloc_distances_add_flag_e {
+  /** \brief Try to group objects based on the newly provided distance information.
+   * This is ignored for distances between objects of different types.
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_ADD_FLAG_GROUP = (1UL<<0),
+  /** \brief If grouping, consider the distance values as inaccurate and relax the
+   * comparisons during the grouping algorithms. The actual accuracy may be modified
+   * through the HWLOC_GROUPING_ACCURACY environment variable (see \ref envvar).
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_ADD_FLAG_GROUP_INACCURATE = (1UL<<1)
+};
+
+/** \brief Commit a new distances structure.
+ *
+ * This function finalizes the distances structure and inserts it in the topology.
+ *
+ * Parameter \p handle was previously returned by hwloc_distances_add_create().
+ * Then objects and values were specified with hwloc_distances_add_values().
+ *
+ * \p flags configures the behavior of the function using an optional OR'ed set of
+ * ::hwloc_distances_add_flag_e.
+ * It may be used to request the grouping of existing objects based on distances.
+ *
+ * On error, the temporary distances structure and its content are destroyed.
+ *
+ * \return \c 0 on success.
+ * \return \c -1 on error.
+ */
+HWLOC_DECLSPEC int hwloc_distances_add_commit(hwloc_topology_t topology,
+                                              hwloc_distances_add_handle_t handle,
+                                              unsigned long flags);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_distances_remove Remove distances between objects
+ * @{
+ */
+
+/** \brief Remove all distance matrices from a topology.
+ *
+ * Remove all distance matrices, either provided by the user or
+ * gathered through the OS.
+ *
+ * If these distances were used to group objects, these additional
+ * Group objects are not removed from the topology.
+ */
+HWLOC_DECLSPEC int hwloc_distances_remove(hwloc_topology_t topology);
+
+/** \brief Remove distance matrices for objects at a specific depth in the topology.
+ *
+ * Identical to hwloc_distances_remove() but only applies to one level of the topology.
+ */
+HWLOC_DECLSPEC int hwloc_distances_remove_by_depth(hwloc_topology_t topology, int depth);
+
+/** \brief Remove distance matrices for objects of a specific type in the topology.
+ *
+ * Identical to hwloc_distances_remove() but only applies to one level of the topology.
+ */
+static __hwloc_inline int
+hwloc_distances_remove_by_type(hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return 0;
+  return hwloc_distances_remove_by_depth(topology, depth);
+}
+
+/** \brief Release and remove the given distance matrix from the topology.
+ *
+ * This function includes a call to hwloc_distances_release().
+ */
+HWLOC_DECLSPEC int hwloc_distances_release_remove(hwloc_topology_t topology, struct hwloc_distances_s *distances);
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_DISTANCES_H */
diff --git a/deps/hwloc/include/hwloc/export.h b/deps/hwloc/include/hwloc/export.h
new file mode 100644
index 000000000..b178b77e5
--- /dev/null
+++ b/deps/hwloc/include/hwloc/export.h
@@ -0,0 +1,278 @@
+/*
+ * Copyright © 2009-2018 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Exporting Topologies to XML or to Synthetic strings.
+ */
+
+#ifndef HWLOC_EXPORT_H
+#define HWLOC_EXPORT_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#elif 0
+}
+#endif
+
+
+/** \defgroup hwlocality_xmlexport Exporting Topologies to XML
+ * @{
+ */
+
+/** \brief Flags for exporting XML topologies.
+ *
+ * Flags to be given as an OR'ed set to hwloc_topology_export_xml().
+ */
+enum hwloc_topology_export_xml_flags_e {
+ /** \brief Export XML that is loadable by hwloc v1.x.
+  * However, the export may miss some details about the topology.
+  * \hideinitializer
+  */
+ HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1 = (1UL<<0)
+};
+
+/** \brief Export the topology into an XML file.
+ *
+ * This file may be loaded later through hwloc_topology_set_xml().
+ *
+ * By default, the latest export format is used, which means older hwloc
+ * releases (e.g. v1.x) will not be able to import it.
+ * Exporting to v1.x specific XML format is possible using flag
+ * ::HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1 but it may miss some details
+ * about the topology.
+ * If there is any chance that the exported file may ever be imported
+ * back by a process using hwloc 1.x, one should consider detecting
+ * it at runtime and using the corresponding export format.
+ *
+ * \p flags is an OR'ed set of ::hwloc_topology_export_xml_flags_e.
+ *
+ * \return -1 if a failure occurred.
+ *
+ * \note See also hwloc_topology_set_userdata_export_callback()
+ * for exporting application-specific object userdata.
+ *
+ * \note The topology-specific userdata pointer is ignored when exporting to XML.
+ *
+ * \note Only printable characters may be exported to XML string attributes.
+ * Any other character, especially any non-ASCII character, will be silently
+ * dropped.
+ *
+ * \note If \p xmlpath is "-", the XML output is sent to the standard output.
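+ *
+ * For instance (a sketch; file names chosen arbitrarily):
+ * \code
+ * hwloc_topology_export_xml(topology, "topo.xml", 0);
+ * hwloc_topology_export_xml(topology, "topo-v1.xml",
+ *                           HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1);
+ * \endcode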
+ */
+HWLOC_DECLSPEC int hwloc_topology_export_xml(hwloc_topology_t topology, const char *xmlpath, unsigned long flags);
+
+/** \brief Export the topology into a newly-allocated XML memory buffer.
+ *
+ * \p xmlbuffer is allocated by the callee and should be freed with
+ * hwloc_free_xmlbuffer() later in the caller.
+ *
+ * This memory buffer may be loaded later through hwloc_topology_set_xmlbuffer().
+ *
+ * By default, the latest export format is used, which means older hwloc
+ * releases (e.g. v1.x) will not be able to import it.
+ * Exporting to v1.x specific XML format is possible using flag
+ * ::HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1 but it may miss some details
+ * about the topology.
+ * If there is any chance that the exported buffer may ever be imported
+ * back by a process using hwloc 1.x, one should consider detecting
+ * it at runtime and using the corresponding export format.
+ *
+ * The returned buffer ends with a \0 that is included in the returned
+ * length.
+ *
+ * \p flags is an OR'ed set of ::hwloc_topology_export_xml_flags_e.
+ *
+ * \return -1 if a failure occurred.
+ *
+ * \note See also hwloc_topology_set_userdata_export_callback()
+ * for exporting application-specific object userdata.
+ *
+ * \note The topology-specific userdata pointer is ignored when exporting to XML.
+ *
+ * \note Only printable characters may be exported to XML string attributes.
+ * Any other character, especially any non-ASCII character, will be silently
+ * dropped.
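+ *
+ * A minimal sketch:
+ * \code
+ * char *buf; int len;
+ * if (hwloc_topology_export_xmlbuffer(topology, &buf, &len, 0) == 0) {
+ *   // use buf; its ending \0 is included in len
+ *   hwloc_free_xmlbuffer(topology, buf);
+ * }
+ * \endcode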
+ */
+HWLOC_DECLSPEC int hwloc_topology_export_xmlbuffer(hwloc_topology_t topology, char **xmlbuffer, int *buflen, unsigned long flags);
+
+/** \brief Free a buffer allocated by hwloc_topology_export_xmlbuffer() */
+HWLOC_DECLSPEC void hwloc_free_xmlbuffer(hwloc_topology_t topology, char *xmlbuffer);
+
+/** \brief Set the application-specific callback for exporting object userdata
+ *
+ * The object userdata pointer is not exported to XML by default because hwloc
+ * does not know what it contains.
+ *
+ * This function lets applications set \p export_cb to a callback function
+ * that converts this opaque userdata into an exportable string.
+ *
+ * \p export_cb is invoked during XML export for each object whose
+ * \p userdata pointer is not \c NULL.
+ * The callback should use hwloc_export_obj_userdata() or
+ * hwloc_export_obj_userdata_base64() to actually export
+ * something to XML (possibly multiple times per object).
+ *
+ * \p export_cb may be set to \c NULL if userdata should not be exported to XML.
+ *
+ * \note The topology-specific userdata pointer is ignored when exporting to XML.
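+ *
+ * A sketch of such a callback, assuming each \p userdata points to a
+ * NUL-terminated string (the "mydata" name is arbitrary):
+ * \code
+ * static void export_cb(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj)
+ * {
+ *   const char *s = (const char *) obj->userdata;
+ *   hwloc_export_obj_userdata(reserved, topology, obj, "mydata", s, strlen(s));
+ * }
+ * \endcode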
+ */
+HWLOC_DECLSPEC void hwloc_topology_set_userdata_export_callback(hwloc_topology_t topology,
+								void (*export_cb)(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj));
+
+/** \brief Export some object userdata to XML
+ *
+ * This function may only be called from within the export() callback passed
+ * to hwloc_topology_set_userdata_export_callback().
+ * It may be invoked one or multiple times to export some userdata to XML.
+ * The \p buffer content of length \p length is stored with optional name
+ * \p name.
+ *
+ * When importing this XML file, the import() callback (if set) will be
+ * called exactly as many times as hwloc_export_obj_userdata() was called
+ * during export(). It will receive the corresponding \p name, \p buffer
+ * and \p length arguments.
+ *
+ * \p reserved, \p topology and \p obj must be the first three parameters
+ * that were given to the export callback.
+ *
+ * Only printable characters may be exported to XML string attributes.
+ * If a non-printable character is passed in \p name or \p buffer,
+ * the function returns -1 with errno set to EINVAL.
+ *
+ * If exporting binary data, the application should first encode into
+ * printable characters only (or use hwloc_export_obj_userdata_base64()).
+ * It should also take care of portability issues if the export may
+ * be reimported on a different architecture.
+ */
+HWLOC_DECLSPEC int hwloc_export_obj_userdata(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj, const char *name, const void *buffer, size_t length);
+
+/** \brief Encode and export some object userdata to XML
+ *
+ * This function is similar to hwloc_export_obj_userdata() but it encodes
+ * the input buffer into printable characters before exporting.
+ * On import, decoding is automatically performed before the data is given
+ * to the import() callback if any.
+ *
+ * This function may only be called from within the export() callback passed
+ * to hwloc_topology_set_userdata_export_callback().
+ *
+ * The function does not take care of portability issues if the export
+ * may be reimported on a different architecture.
+ */
+HWLOC_DECLSPEC int hwloc_export_obj_userdata_base64(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj, const char *name, const void *buffer, size_t length);
+
+/** \brief Set the application-specific callback for importing userdata
+ *
+ * On XML import, userdata is ignored by default because hwloc does not know
+ * how to store it in memory.
+ *
+ * This function lets applications set \p import_cb to a callback function
+ * that will get the XML-stored userdata and store it in the object as expected
+ * by the application.
+ *
+ * \p import_cb is called during hwloc_topology_load() as many times as
+ * hwloc_export_obj_userdata() was called during export. The topology
+ * is not entirely set up yet. Object attributes are ready to consult,
+ * but links between objects are not.
+ *
+ * \p import_cb may be \c NULL if userdata should be ignored during import.
+ *
+ * \note \p buffer contains \p length characters followed by a null byte ('\0').
+ *
+ * \note This function should be called before hwloc_topology_load().
+ *
+ * \note The topology-specific userdata pointer is ignored when importing from XML.
+ */
+HWLOC_DECLSPEC void hwloc_topology_set_userdata_import_callback(hwloc_topology_t topology,
+								void (*import_cb)(hwloc_topology_t topology, hwloc_obj_t obj, const char *name, const void *buffer, size_t length));
+
+/** @} */
+
+
+/** \defgroup hwlocality_syntheticexport Exporting Topologies to Synthetic
+ * @{
+ */
+
+/** \brief Flags for exporting synthetic topologies.
+ *
+ * Flags to be given as a OR'ed set to hwloc_topology_export_synthetic().
+ */
+enum hwloc_topology_export_synthetic_flags_e {
+ /** \brief Export extended types such as L2dcache as basic types such as Cache.
+  *
+  * This is required if loading the synthetic description with hwloc < 1.9.
+  * \hideinitializer
+  */
+ HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES = (1UL<<0),
+
+ /** \brief Do not export level attributes.
+  *
+  * Ignore level attributes such as memory/cache sizes or PU indexes.
+  * This is required if loading the synthetic description with hwloc < 1.10.
+  * \hideinitializer
+  */
+ HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS = (1UL<<1),
+
+ /** \brief Export the memory hierarchy as expected in hwloc 1.x.
+  *
+  * Instead of attaching memory children to levels, export a single NUMA node child
+  * as a normal intermediate level, when possible.
+  * This is required if loading the synthetic description with hwloc 1.x.
+  * However this may fail if some objects have multiple local NUMA nodes.
+  * \hideinitializer
+  */
+ HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_V1 = (1UL<<2),
+
+ /** \brief Do not export memory information.
+  *
+  * Only export the actual hierarchy of normal CPU-side objects and ignore
+  * where memory is attached.
+  * This is useful when the hierarchy of CPUs is what really matters,
+  * but the resulting topology behaves as if there was a single machine-wide NUMA node.
+  * \hideinitializer
+  */
+ HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_IGNORE_MEMORY = (1UL<<3)
+};
+
+/** \brief Export the topology as a synthetic string.
+ *
+ * At most \p buflen characters will be written in \p buffer,
+ * including the terminating \0.
+ *
+ * This exported string may be given back to hwloc_topology_set_synthetic().
+ *
+ * \p flags is an OR'ed set of ::hwloc_topology_export_synthetic_flags_e.
+ *
+ * \return The number of characters that were written,
+ * not including the terminating \0.
+ *
+ * \return -1 if the topology could not be exported,
+ * for instance if it is not symmetric.
+ *
+ * \note I/O and Misc children are ignored, the synthetic string only
+ * describes normal children.
+ *
+ * \note A 1024-byte buffer should be large enough for exporting
+ * topologies in the vast majority of cases.
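+ *
+ * For instance (a sketch):
+ * \code
+ * char buf[1024];
+ * if (hwloc_topology_export_synthetic(topology, buf, sizeof(buf), 0) >= 0)
+ *   printf("%s\n", buf);
+ * \endcode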
+ */
+HWLOC_DECLSPEC int hwloc_topology_export_synthetic(hwloc_topology_t topology, char *buffer, size_t buflen, unsigned long flags);
+
+/** @} */
+
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_EXPORT_H */
diff --git a/deps/hwloc/include/hwloc/helper.h b/deps/hwloc/include/hwloc/helper.h
new file mode 100644
index 000000000..f918d8163
--- /dev/null
+++ b/deps/hwloc/include/hwloc/helper.h
@@ -0,0 +1,1231 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2021 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2010 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief High-level hwloc traversal helpers.
+ */
+
+#ifndef HWLOC_HELPER_H
+#define HWLOC_HELPER_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+#include <stdlib.h>
+#include <errno.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_helper_find_inside Finding Objects inside a CPU set
+ * @{
+ */
+
+/** \brief Get the first largest object included in the given cpuset \p set.
+ *
+ * \return the first object that is included in \p set and whose parent is not.
+ *
+ * This is convenient for iterating over all largest objects within a CPU set
+ * by doing a loop getting the first largest object and clearing its CPU set
+ * from the remaining CPU set.
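+ *
+ * A sketch of that loop (\p set is duplicated so the caller's copy is
+ * left untouched):
+ * \code
+ * hwloc_bitmap_t remaining = hwloc_bitmap_dup(set);
+ * hwloc_obj_t obj;
+ * while ((obj = hwloc_get_first_largest_obj_inside_cpuset(topology, remaining)) != NULL) {
+ *   // ... use obj, then remove its PUs from the remaining set
+ *   hwloc_bitmap_andnot(remaining, remaining, obj->cpuset);
+ * }
+ * hwloc_bitmap_free(remaining);
+ * \endcode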
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_first_largest_obj_inside_cpuset(hwloc_topology_t topology, hwloc_const_cpuset_t set)
+{
+  hwloc_obj_t obj = hwloc_get_root_obj(topology);
+  if (!hwloc_bitmap_intersects(obj->cpuset, set))
+    return NULL;
+  while (!hwloc_bitmap_isincluded(obj->cpuset, set)) {
+    /* while the object intersects without being included, look at its children */
+    hwloc_obj_t child = obj->first_child;
+    while (child) {
+      if (hwloc_bitmap_intersects(child->cpuset, set))
+	break;
+      child = child->next_sibling;
+    }
+    if (!child)
+      /* no child intersects, return their father */
+      return obj;
+    /* found one intersecting child, look at its children */
+    obj = child;
+  }
+  /* obj is included, return it */
+  return obj;
+}
+
+/** \brief Get the set of largest objects covering exactly a given cpuset \p set
+ *
+ * \return the number of objects returned in \p objs.
+ */
+HWLOC_DECLSPEC int hwloc_get_largest_objs_inside_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+						 hwloc_obj_t * __hwloc_restrict objs, int max);
+
+/** \brief Return the next object at depth \p depth included in CPU set \p set.
+ *
+ * If \p prev is \c NULL, return the first object at depth \p depth
+ * included in \p set.  The next invocation should pass the previous
+ * return value in \p prev so as to obtain the next object in \p set.
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					   int depth, hwloc_obj_t prev)
+{
+  hwloc_obj_t next = hwloc_get_next_obj_by_depth(topology, depth, prev);
+  if (!next)
+    return NULL;
+  while (next && (hwloc_bitmap_iszero(next->cpuset) || !hwloc_bitmap_isincluded(next->cpuset, set)))
+    next = next->next_cousin;
+  return next;
+}
+
+/** \brief Return the next object of type \p type included in CPU set \p set.
+ *
+ * If there are multiple or no depths for the given type, return \c NULL
+ * and let the caller fallback to
+ * hwloc_get_next_obj_inside_cpuset_by_depth().
+ *
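+ * A typical iteration over all PUs included in \p set (a sketch):
+ * \code
+ * hwloc_obj_t pu = NULL;
+ * while ((pu = hwloc_get_next_obj_inside_cpuset_by_type(topology, set,
+ *                                                       HWLOC_OBJ_PU, pu)) != NULL)
+ *   printf("PU L#%u\n", pu->logical_index);
+ * \endcode
+ *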
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					  hwloc_obj_type_t type, hwloc_obj_t prev)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return NULL;
+  return hwloc_get_next_obj_inside_cpuset_by_depth(topology, set, depth, prev);
+}
+
+/** \brief Return the (logically) \p idx -th object at depth \p depth included in CPU set \p set.
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+				      int depth, unsigned idx) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+				      int depth, unsigned idx)
+{
+  hwloc_obj_t obj = hwloc_get_obj_by_depth (topology, depth, 0);
+  unsigned count = 0;
+  if (!obj)
+    return NULL;
+  while (obj) {
+    if (!hwloc_bitmap_iszero(obj->cpuset) && hwloc_bitmap_isincluded(obj->cpuset, set)) {
+      if (count == idx)
+	return obj;
+      count++;
+    }
+    obj = obj->next_cousin;
+  }
+  return NULL;
+}
+
+/** \brief Return the \p idx -th object of type \p type included in CPU set \p set.
+ *
+ * If there are multiple or no depths for the given type, return \c NULL
+ * and let the caller fallback to
+ * hwloc_get_obj_inside_cpuset_by_depth().
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+				     hwloc_obj_type_t type, unsigned idx) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+				     hwloc_obj_type_t type, unsigned idx)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return NULL;
+  return hwloc_get_obj_inside_cpuset_by_depth(topology, set, depth, idx);
+}
+
+/** \brief Return the number of objects at depth \p depth included in CPU set \p set.
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline unsigned
+hwloc_get_nbobjs_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					 int depth) __hwloc_attribute_pure;
+static __hwloc_inline unsigned
+hwloc_get_nbobjs_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					 int depth)
+{
+  hwloc_obj_t obj = hwloc_get_obj_by_depth (topology, depth, 0);
+  unsigned count = 0;
+  if (!obj)
+    return 0;
+  while (obj) {
+    if (!hwloc_bitmap_iszero(obj->cpuset) && hwloc_bitmap_isincluded(obj->cpuset, set))
+      count++;
+    obj = obj->next_cousin;
+  }
+  return count;
+}
+
+/** \brief Return the number of objects of type \p type included in CPU set \p set.
+ *
+ * If no object for that type exists inside CPU set \p set, 0 is
+ * returned.  If there are several levels with objects of that type
+ * inside CPU set \p set, -1 is returned.
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline int
+hwloc_get_nbobjs_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					hwloc_obj_type_t type) __hwloc_attribute_pure;
+static __hwloc_inline int
+hwloc_get_nbobjs_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					hwloc_obj_type_t type)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+    return 0;
+  if (depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return -1; /* FIXME: aggregate nbobjs from different levels? */
+  return (int) hwloc_get_nbobjs_inside_cpuset_by_depth(topology, set, depth);
+}
+
+/** \brief Return the logical index among the objects included in CPU set \p set.
+ *
+ * Consult all objects in the same level as \p obj and inside CPU set \p set
+ * in the logical order, and return the index of \p obj within them.
+ * If \p set covers the entire topology, this is the logical index of \p obj.
+ * Otherwise, this is similar to a logical index within the part of the topology
+ * defined by CPU set \p set.
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
+ * \note This function cannot work if obj does not have CPU sets (I/O objects).
+ */
+static __hwloc_inline int
+hwloc_get_obj_index_inside_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+				   hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline int
+hwloc_get_obj_index_inside_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+				   hwloc_obj_t obj)
+{
+  int idx = 0;
+  if (!hwloc_bitmap_isincluded(obj->cpuset, set))
+    return -1;
+  /* count how many objects are inside the cpuset on the way from us to the beginning of the level */
+  while ((obj = obj->prev_cousin) != NULL)
+    if (!hwloc_bitmap_iszero(obj->cpuset) && hwloc_bitmap_isincluded(obj->cpuset, set))
+      idx++;
+  return idx;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_find_covering Finding Objects covering at least CPU set
+ * @{
+ */
+
+/** \brief Get the child covering at least CPU set \p set.
+ *
+ * \return \c NULL if no child matches or if \p set is empty.
+ *
+ * \note This function cannot work if parent does not have a CPU set (I/O or Misc objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_child_covering_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+				hwloc_obj_t parent) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_child_covering_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+				hwloc_obj_t parent)
+{
+  hwloc_obj_t child;
+  if (hwloc_bitmap_iszero(set))
+    return NULL;
+  child = parent->first_child;
+  while (child) {
+    if (child->cpuset && hwloc_bitmap_isincluded(set, child->cpuset))
+      return child;
+    child = child->next_sibling;
+  }
+  return NULL;
+}
+
+/** \brief Get the lowest object covering at least CPU set \p set
+ *
+ * \return \c NULL if no object matches or if \p set is empty.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set)
+{
+  struct hwloc_obj *current = hwloc_get_root_obj(topology);
+  if (hwloc_bitmap_iszero(set) || !hwloc_bitmap_isincluded(set, current->cpuset))
+    return NULL;
+  while (1) {
+    hwloc_obj_t child = hwloc_get_child_covering_cpuset(topology, set, current);
+    if (!child)
+      return current;
+    current = child;
+  }
+}
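+
+/* Editor's sketch, not part of upstream hwloc: find the smallest object
+ * spanning two PUs by OR'ing their cpusets and asking for the covering
+ * object. Guarded by #if 0. */
+#if 0
+static hwloc_obj_t smallest_obj_over_two_pus(hwloc_topology_t topology,
+                                             hwloc_obj_t pu1, hwloc_obj_t pu2)
+{
+  hwloc_obj_t result = NULL;
+  hwloc_bitmap_t set = hwloc_bitmap_alloc();
+  if (set) {
+    hwloc_bitmap_or(set, pu1->cpuset, pu2->cpuset);
+    result = hwloc_get_obj_covering_cpuset(topology, set);
+    hwloc_bitmap_free(set);
+  }
+  return result;
+}
+#endif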
+
+/** \brief Iterate through same-depth objects covering at least CPU set \p set
+ *
+ * If object \p prev is \c NULL, return the first object at depth \p
+ * depth covering at least part of CPU set \p set.  The next
+ * invocation should pass the previous return value in \p prev so as
+ * to obtain the next object covering at least another part of \p set.
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_covering_cpuset_by_depth(hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					    int depth, hwloc_obj_t prev)
+{
+  hwloc_obj_t next = hwloc_get_next_obj_by_depth(topology, depth, prev);
+  if (!next)
+    return NULL;
+  while (next && !hwloc_bitmap_intersects(set, next->cpuset))
+    next = next->next_cousin;
+  return next;
+}
+
+/** \brief Iterate through same-type objects covering at least CPU set \p set
+ *
+ * If object \p prev is \c NULL, return the first object of type \p
+ * type covering at least part of CPU set \p set.  The next invocation
+ * should pass the previous return value in \p prev so as to obtain
+ * the next object of type \p type covering at least another part of
+ * \p set.
+ *
+ * If there are no or multiple depths for type \p type, \c NULL is returned.
+ * The caller may fallback to hwloc_get_next_obj_covering_cpuset_by_depth()
+ * for each depth.
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_covering_cpuset_by_type(hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					   hwloc_obj_type_t type, hwloc_obj_t prev)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return NULL;
+  return hwloc_get_next_obj_covering_cpuset_by_depth(topology, set, depth, prev);
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_ancestors Looking at Ancestor and Child Objects
+ * @{
+ *
+ * Be sure to see the figure in \ref termsanddefs that shows a
+ * complete topology tree, including depths, child/sibling/cousin
+ * relationships, and an example of an asymmetric topology where one
+ * package has fewer caches than its peers.
+ */
+
+/** \brief Returns the ancestor object of \p obj at depth \p depth.
+ *
+ * \note \p depth should not be the depth of PU or NUMA objects
+ * since they are ancestors of no objects (except Misc or I/O).
+ * This function rather expects an intermediate level depth,
+ * such as the depth of Packages, Cores, or Caches.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_depth (hwloc_topology_t topology __hwloc_attribute_unused, int depth, hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_depth (hwloc_topology_t topology __hwloc_attribute_unused, int depth, hwloc_obj_t obj)
+{
+  hwloc_obj_t ancestor = obj;
+  if (obj->depth < depth)
+    return NULL;
+  while (ancestor && ancestor->depth > depth)
+    ancestor = ancestor->parent;
+  return ancestor;
+}
+
+/** \brief Returns the ancestor object of \p obj with type \p type.
+ *
+ * \note \p type should not be ::HWLOC_OBJ_PU or ::HWLOC_OBJ_NUMANODE
+ * since these objects are ancestors of no objects (except Misc or I/O).
+ * This function rather expects an intermediate object type,
+ * such as ::HWLOC_OBJ_PACKAGE, ::HWLOC_OBJ_CORE, etc.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_type (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_type_t type, hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_type (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_type_t type, hwloc_obj_t obj)
+{
+  hwloc_obj_t ancestor = obj->parent;
+  while (ancestor && ancestor->type != type)
+    ancestor = ancestor->parent;
+  return ancestor;
+}
+
+/** \brief Returns the common parent object to objects \p obj1 and \p obj2 */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_common_ancestor_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj1, hwloc_obj_t obj2) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_common_ancestor_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+  /* the loop isn't so easy since intermediate ancestors may have
+   * different depth, causing us to alternate between using obj1->parent
+   * and obj2->parent. Also, even if at some point we find ancestors
+   * of the same depth, their ancestors may have different depths again.
+   */
+  while (obj1 != obj2) {
+    while (obj1->depth > obj2->depth)
+      obj1 = obj1->parent;
+    while (obj2->depth > obj1->depth)
+      obj2 = obj2->parent;
+    if (obj1 != obj2 && obj1->depth == obj2->depth) {
+      obj1 = obj1->parent;
+      obj2 = obj2->parent;
+    }
+  }
+  return obj1;
+}
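+
+/* Editor's sketch, not part of upstream hwloc: use the common ancestor
+ * to check whether two PUs sit in the same package. Guarded by #if 0. */
+#if 0
+static int share_a_package(hwloc_topology_t topology, hwloc_obj_t pu1, hwloc_obj_t pu2)
+{
+  hwloc_obj_t anc = hwloc_get_common_ancestor_obj(topology, pu1, pu2);
+  /* walk up from the common ancestor until a Package (or the root) */
+  while (anc && anc->type != HWLOC_OBJ_PACKAGE)
+    anc = anc->parent;
+  return anc != NULL;
+}
+#endif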
+
+/** \brief Returns true if \p obj is inside the subtree beginning with ancestor object \p subtree_root.
+ *
+ * \note This function cannot work if \p obj and \p subtree_root objects do
+ * not have CPU sets (I/O or Misc objects).
+ */
+static __hwloc_inline int
+hwloc_obj_is_in_subtree (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj, hwloc_obj_t subtree_root) __hwloc_attribute_pure;
+static __hwloc_inline int
+hwloc_obj_is_in_subtree (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj, hwloc_obj_t subtree_root)
+{
+  return obj->cpuset && subtree_root->cpuset && hwloc_bitmap_isincluded(obj->cpuset, subtree_root->cpuset);
+}
+
+/** \brief Return the next child.
+ *
+ * Return the next child among the normal children list,
+ * then among the memory children list, then among the I/O
+ * children list, then among the Misc children list.
+ *
+ * If \p prev is \c NULL, return the first child.
+ *
+ * Return \c NULL when there is no next child.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_child (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t parent, hwloc_obj_t prev)
+{
+  hwloc_obj_t obj;
+  int state = 0;
+  if (prev) {
+    if (prev->type == HWLOC_OBJ_MISC)
+      state = 3;
+    else if (prev->type == HWLOC_OBJ_BRIDGE || prev->type == HWLOC_OBJ_PCI_DEVICE || prev->type == HWLOC_OBJ_OS_DEVICE)
+      state = 2;
+    else if (prev->type == HWLOC_OBJ_NUMANODE)
+      state = 1;
+    obj = prev->next_sibling;
+  } else {
+    obj = parent->first_child;
+  }
+  if (!obj && state == 0) {
+    obj = parent->memory_first_child;
+    state = 1;
+  }
+  if (!obj && state == 1) {
+    obj = parent->io_first_child;
+    state = 2;
+  }
+  if (!obj && state == 2) {
+    obj = parent->misc_first_child;
+    state = 3;
+  }
+  return obj;
+}
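+
+/* Editor's sketch, not part of upstream hwloc: walk all four children
+ * lists of an object with the iterator above. Assumes <stdio.h>.
+ * Guarded by #if 0. */
+#if 0
+static void print_all_children(hwloc_topology_t topology, hwloc_obj_t parent)
+{
+  hwloc_obj_t child = NULL;
+  while ((child = hwloc_get_next_child(topology, parent, child)) != NULL)
+    printf("child of type %s\n", hwloc_obj_type_string(child->type));
+}
+#endif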
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_types Kinds of object Type
+ * @{
+ *
+ * Each object type is
+ * either Normal (i.e. hwloc_obj_type_is_normal() returns 1),
+ * or Memory (i.e. hwloc_obj_type_is_memory() returns 1)
+ * or I/O (i.e. hwloc_obj_type_is_io() returns 1)
+ * or Misc (i.e. equal to ::HWLOC_OBJ_MISC).
+ * It cannot be of more than one of these kinds.
+ */
+
+/** \brief Check whether an object type is Normal.
+ *
+ * Normal objects are objects of the main CPU hierarchy
+ * (Machine, Package, Core, PU, CPU caches, etc.),
+ * but they are not NUMA nodes, I/O devices or Misc objects.
+ *
+ * They are attached to parent as Normal children,
+ * not as Memory, I/O or Misc children.
+ *
+ * \return 1 if an object of type \p type is a Normal object, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_normal(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is I/O.
+ *
+ * I/O objects are objects attached to their parents
+ * in the I/O children list.
+ * This currently includes Bridges, PCI and OS devices.
+ *
+ * \return 1 if an object of type \p type is an I/O object, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_io(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is Memory.
+ *
+ * Memory objects are objects attached to their parents
+ * in the Memory children list.
+ * This currently includes NUMA nodes and Memory-side caches.
+ *
+ * \return 1 if an object of type \p type is a Memory object, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_memory(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is a CPU Cache (Data, Unified or Instruction).
+ *
+ * Memory-side caches are not CPU caches.
+ *
+ * \return 1 if an object of type \p type is a Cache, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_cache(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is a CPU Data or Unified Cache.
+ *
+ * Memory-side caches are not CPU caches.
+ *
+ * \return 1 if an object of type \p type is a CPU Data or Unified Cache, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_dcache(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is a CPU Instruction Cache.
+ *
+ * Memory-side caches are not CPU caches.
+ *
+ * \return 1 if an object of type \p type is a CPU Instruction Cache, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_icache(hwloc_obj_type_t type);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_find_cache Looking at Cache Objects
+ * @{
+ */
+
+/** \brief Find the depth of cache objects matching cache level and type.
+ *
+ * Return the depth of the topology level that contains cache objects
+ * whose attributes match \p cachelevel and \p cachetype.
+ *
+ * This function is identical to calling hwloc_get_type_depth() with the
+ * corresponding type such as ::HWLOC_OBJ_L1ICACHE, except that it may
+ * also return a Unified cache when looking for an instruction cache.
+ *
+ * If no cache level matches, ::HWLOC_TYPE_DEPTH_UNKNOWN is returned.
+ *
+ * If \p cachetype is ::HWLOC_OBJ_CACHE_UNIFIED, the depth of the
+ * unique matching unified cache level is returned.
+ *
+ * If \p cachetype is ::HWLOC_OBJ_CACHE_DATA or ::HWLOC_OBJ_CACHE_INSTRUCTION,
+ * either a matching cache, or a unified cache is returned.
+ *
+ * If \p cachetype is \c -1, it is ignored and multiple levels may
+ * match. The function returns either the depth of a uniquely matching
+ * level or ::HWLOC_TYPE_DEPTH_MULTIPLE.
+ */
+static __hwloc_inline int
+hwloc_get_cache_type_depth (hwloc_topology_t topology,
+			    unsigned cachelevel, hwloc_obj_cache_type_t cachetype)
+{
+  int depth;
+  int found = HWLOC_TYPE_DEPTH_UNKNOWN;
+  for (depth=0; ; depth++) {
+    hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, 0);
+    if (!obj)
+      break;
+    if (!hwloc_obj_type_is_dcache(obj->type) || obj->attr->cache.depth != cachelevel)
+      /* doesn't match, try next depth */
+      continue;
+    if (cachetype == (hwloc_obj_cache_type_t) -1) {
+      if (found != HWLOC_TYPE_DEPTH_UNKNOWN) {
+	/* second match, return MULTIPLE */
+        return HWLOC_TYPE_DEPTH_MULTIPLE;
+      }
+      /* first match, mark it as found */
+      found = depth;
+      continue;
+    }
+    if (obj->attr->cache.type == cachetype || obj->attr->cache.type == HWLOC_OBJ_CACHE_UNIFIED)
+      /* exact match (either unified is alone, or we match instruction or data), return immediately */
+      return depth;
+  }
+  /* went to the bottom, return what we found */
+  return found;
+}
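+
+/* Editor's sketch, not part of upstream hwloc: count L2 data (or
+ * unified) caches using the depth lookup above. Guarded by #if 0. */
+#if 0
+static int count_l2_caches(hwloc_topology_t topology)
+{
+  int depth = hwloc_get_cache_type_depth(topology, 2, HWLOC_OBJ_CACHE_DATA);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+    return 0;
+  return (int) hwloc_get_nbobjs_by_depth(topology, depth);
+}
+#endif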
+
+/** \brief Get the first data (or unified) cache covering a cpuset \p set
+ *
+ * \return \c NULL if no cache matches.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_cache_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_cache_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set)
+{
+  hwloc_obj_t current = hwloc_get_obj_covering_cpuset(topology, set);
+  while (current) {
+    if (hwloc_obj_type_is_dcache(current->type))
+      return current;
+    current = current->parent;
+  }
+  return NULL;
+}
+
+/** \brief Get the first data (or unified) cache shared between an object and somebody else.
+ *
+ * \return \c NULL if no cache matches or if an invalid object is given.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_shared_cache_covering_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_shared_cache_covering_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj)
+{
+  hwloc_obj_t current = obj->parent;
+  if (!obj->cpuset)
+    return NULL;
+  while (current) {
+    if (!hwloc_bitmap_isequal(current->cpuset, obj->cpuset)
+        && hwloc_obj_type_is_dcache(current->type))
+      return current;
+    current = current->parent;
+  }
+  return NULL;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_find_misc Finding objects, miscellaneous helpers
+ * @{
+ *
+ * Be sure to see the figure in \ref termsanddefs that shows a
+ * complete topology tree, including depths, child/sibling/cousin
+ * relationships, and an example of an asymmetric topology where one
+ * package has fewer caches than its peers.
+ */
+
+/** \brief Remove simultaneous multithreading PUs from a CPU set.
+ *
+ * For each core in \p topology, if \p cpuset contains some PUs of that core,
+ * modify \p cpuset to only keep a single PU for that core.
+ *
+ * \p which specifies which PU will be kept.
+ * PUs are considered in physical index order.
+ * If 0, for each core, the function keeps the first PU that was originally set in \p cpuset.
+ *
+ * If \p which is larger than the number of PUs in a core that were originally set in \p cpuset,
+ * no PU is kept for that core.
+ *
+ * \note PUs that are not below a Core object are ignored
+ * (for instance if the topology does not contain any Core object).
+ * None of them is removed from \p cpuset.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_singlify_per_core(hwloc_topology_t topology, hwloc_bitmap_t cpuset, unsigned which);
+
+/** \brief Returns the object of type ::HWLOC_OBJ_PU with \p os_index.
+ *
+ * This function is useful for converting a CPU set into the PU
+ * objects it contains.
+ * When retrieving the current binding (e.g. with hwloc_get_cpubind()),
+ * one may iterate over the bits of the resulting CPU set with
+ * hwloc_bitmap_foreach_begin(), and find the corresponding PUs
+ * with this function.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pu_obj_by_os_index(hwloc_topology_t topology, unsigned os_index) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pu_obj_by_os_index(hwloc_topology_t topology, unsigned os_index)
+{
+  hwloc_obj_t obj = NULL;
+  while ((obj = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PU, obj)) != NULL)
+    if (obj->os_index == os_index)
+      return obj;
+  return NULL;
+}
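+
+/* Editor's sketch, not part of upstream hwloc: the conversion described
+ * above, from the current thread binding to PU objects. Assumes
+ * <stdio.h> and the hwloc_bitmap_foreach_begin()/end() macros from
+ * hwloc/bitmap.h. Guarded by #if 0. */
+#if 0
+static void print_binding_pus(hwloc_topology_t topology)
+{
+  hwloc_bitmap_t set = hwloc_bitmap_alloc();
+  unsigned i;
+  if (set && !hwloc_get_cpubind(topology, set, HWLOC_CPUBIND_THREAD)) {
+    hwloc_bitmap_foreach_begin(i, set) {
+      hwloc_obj_t pu = hwloc_get_pu_obj_by_os_index(topology, i);
+      if (pu)
+        printf("bound to PU L#%u (P#%u)\n", pu->logical_index, pu->os_index);
+    } hwloc_bitmap_foreach_end();
+  }
+  hwloc_bitmap_free(set);
+}
+#endif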
+
+/** \brief Returns the object of type ::HWLOC_OBJ_NUMANODE with \p os_index.
+ *
+ * This function is useful for converting a nodeset into the NUMA node
+ * objects it contains.
+ * When retrieving the current binding (e.g. with hwloc_get_membind() with HWLOC_MEMBIND_BYNODESET),
+ * one may iterate over the bits of the resulting nodeset with
+ * hwloc_bitmap_foreach_begin(), and find the corresponding NUMA nodes
+ * with this function.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_numanode_obj_by_os_index(hwloc_topology_t topology, unsigned os_index) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_numanode_obj_by_os_index(hwloc_topology_t topology, unsigned os_index)
+{
+  hwloc_obj_t obj = NULL;
+  while ((obj = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, obj)) != NULL)
+    if (obj->os_index == os_index)
+      return obj;
+  return NULL;
+}
+
+/** \brief Do a depth-first traversal of the topology to find and sort
+ * all objects that are at the same depth as \p src.
+ *
+ * Report in \p objs up to \p max physically closest ones to \p src.
+ *
+ * \return the number of objects returned in \p objs.
+ *
+ * \return 0 if \p src is an I/O object.
+ *
+ * \note This function requires the \p src object to have a CPU set.
+ */
+/* TODO: rather provide an iterator? Provide a way to know how much should be allocated? By returning the total number of objects instead? */
+HWLOC_DECLSPEC unsigned hwloc_get_closest_objs (hwloc_topology_t topology, hwloc_obj_t src, hwloc_obj_t * __hwloc_restrict objs, unsigned max);
+
+/** \brief Find an object below another object, both specified by types and indexes.
+ *
+ * Start from the top system object and find object of type \p type1
+ * and logical index \p idx1.  Then look below this object and find another
+ * object of type \p type2 and logical index \p idx2.  Indexes are specified
+ * within the parent, not within the entire system.
+ *
+ * For instance, if type1 is PACKAGE, idx1 is 2, type2 is CORE and idx2
+ * is 3, return the fourth core object below the third package.
+ *
+ * \note This function requires these objects to have a CPU set.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_below_by_type (hwloc_topology_t topology,
+			     hwloc_obj_type_t type1, unsigned idx1,
+			     hwloc_obj_type_t type2, unsigned idx2) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_below_by_type (hwloc_topology_t topology,
+			     hwloc_obj_type_t type1, unsigned idx1,
+			     hwloc_obj_type_t type2, unsigned idx2)
+{
+  hwloc_obj_t obj;
+  obj = hwloc_get_obj_by_type (topology, type1, idx1);
+  if (!obj)
+    return NULL;
+  return hwloc_get_obj_inside_cpuset_by_type(topology, obj->cpuset, type2, idx2);
+}
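+
+/* Editor's sketch, not part of upstream hwloc: the instance from the
+ * comment above, the fourth Core below the third Package. Guarded by
+ * #if 0. */
+#if 0
+static hwloc_obj_t fourth_core_in_third_package(hwloc_topology_t topology)
+{
+  return hwloc_get_obj_below_by_type(topology, HWLOC_OBJ_PACKAGE, 2,
+                                     HWLOC_OBJ_CORE, 3);
+}
+#endif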
+
+/** \brief Find an object below a chain of objects specified by types and indexes.
+ *
+ * This is a generalized version of hwloc_get_obj_below_by_type().
+ *
+ * Arrays \p typev and \p idxv must contain \p nr types and indexes.
+ *
+ * Start from the top system object and walk the arrays \p typev and \p idxv.
+ * For each type and logical index couple in the arrays, look under the previously found
+ * object to find the index-th object of the given type.
+ * Indexes are specified within the parent, not within the entire system.
+ *
+ * For instance, if nr is 3, typev contains NODE, PACKAGE and CORE,
+ * and idxv contains 0, 1 and 2, return the third core object below
+ * the second package below the first NUMA node.
+ *
+ * \note This function requires all these objects and the root object
+ * to have a CPU set.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_below_array_by_type (hwloc_topology_t topology, int nr, hwloc_obj_type_t *typev, unsigned *idxv) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_below_array_by_type (hwloc_topology_t topology, int nr, hwloc_obj_type_t *typev, unsigned *idxv)
+{
+  hwloc_obj_t obj = hwloc_get_root_obj(topology);
+  int i;
+  for(i=0; i<nr; i++) {
+    if (!obj)
+      return NULL;
+    obj = hwloc_get_obj_inside_cpuset_by_type(topology, obj->cpuset, typev[i], idxv[i]);
+  }
+  return obj;
+}
+
+/** \brief Return an object of a different type with same locality.
+ *
+ * If the source object \p src is a normal or memory type,
+ * this function returns an object of type \p type with same
+ * CPU and node sets, either below or above in the hierarchy.
+ *
+ * If the source object \p src is a PCI or an OS device within a PCI
+ * device, the function may either return that PCI device, or another
+ * OS device in the same PCI parent.
+ * This may for instance be useful for converting OS devices
+ * such as "nvml0" or "rsmi1" used in distance structures into the
+ * PCI device or the CUDA or OpenCL OS device that corresponds
+ * to the same physical card.
+ *
+ * If not \c NULL, parameter \p subtype only selects objects whose
+ * subtype attribute exists and is \p subtype (case-insensitively),
+ * for instance "OpenCL" or "CUDA".
+ *
+ * If not \c NULL, parameter \p nameprefix only selects objects whose
+ * name attribute exists and starts with \p nameprefix (case-insensitively),
+ * for instance "rsmi" for matching "rsmi0".
+ *
+ * If multiple objects match, the first one is returned.
+ *
+ * This function will not walk the hierarchy across bridges since
+ * the PCI locality may become different.
+ * This function also cannot convert between normal/memory objects
+ * and I/O or Misc objects.
+ *
+ * \p flags must be \c 0 for now.
+ *
+ * \return An object with identical locality,
+ * matching \p subtype and \p nameprefix if any.
+ *
+ * \return \c NULL if no matching object could be found,
+ * or if the source object and target type are incompatible,
+ * for instance if converting between CPU and I/O objects.
+ */
+HWLOC_DECLSPEC hwloc_obj_t
+hwloc_get_obj_with_same_locality(hwloc_topology_t topology, hwloc_obj_t src,
+                                 hwloc_obj_type_t type, const char *subtype, const char *nameprefix,
+                                 unsigned long flags);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_distribute Distributing items over a topology
+ * @{
+ */
+
+/** \brief Flags to be given to hwloc_distrib().
+ */
+enum hwloc_distrib_flags_e {
+  /** \brief Distrib in reverse order, starting from the last objects.
+   * \hideinitializer
+   */
+  HWLOC_DISTRIB_FLAG_REVERSE = (1UL<<0)
+};
+
+/** \brief Distribute \p n items over the topology under \p roots
+ *
+ * Array \p set will be filled with \p n cpusets recursively distributed
+ * linearly over the topology under objects \p roots, down to depth \p until
+ * (which can be INT_MAX to distribute down to the finest level).
+ *
+ * \p n_roots is usually 1 and \p roots only contains the topology root object
+ * so as to distribute over the entire topology.
+ *
+ * This is typically useful when an application wants to distribute \p n
+ * threads over a machine, giving each of them as much private cache as
+ * possible and keeping them locally in number order.
+ *
+ * The caller may typically want to also call hwloc_bitmap_singlify()
+ * before binding a thread so that it does not move at all.
+ *
+ * \p flags should be 0 or an OR'ed set of ::hwloc_distrib_flags_e.
+ *
+ * \note This function requires the \p roots objects to have a CPU set.
+ *
+ * \note This function replaces the now deprecated hwloc_distribute()
+ * and hwloc_distributev() functions.
+ */
+static __hwloc_inline int
+hwloc_distrib(hwloc_topology_t topology,
+	      hwloc_obj_t *roots, unsigned n_roots,
+	      hwloc_cpuset_t *set,
+	      unsigned n,
+	      int until, unsigned long flags)
+{
+  unsigned i;
+  unsigned tot_weight;
+  unsigned given, givenweight;
+  hwloc_cpuset_t *cpusetp = set;
+
+  if (flags & ~HWLOC_DISTRIB_FLAG_REVERSE) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  tot_weight = 0;
+  for (i = 0; i < n_roots; i++)
+    tot_weight += (unsigned) hwloc_bitmap_weight(roots[i]->cpuset);
+
+  for (i = 0, given = 0, givenweight = 0; i < n_roots; i++) {
+    unsigned chunk, weight;
+    hwloc_obj_t root = roots[flags & HWLOC_DISTRIB_FLAG_REVERSE ? n_roots-1-i : i];
+    hwloc_cpuset_t cpuset = root->cpuset;
+    while (!hwloc_obj_type_is_normal(root->type))
+      /* If memory/io/misc, walk up to normal parent */
+      root = root->parent;
+    weight = (unsigned) hwloc_bitmap_weight(cpuset);
+    if (!weight)
+      continue;
+    /* Give to root a chunk proportional to its weight.
+     * If previous chunks got rounded-up, we may get a bit less. */
+    chunk = (( (givenweight+weight) * n  + tot_weight-1) / tot_weight)
+          - ((  givenweight         * n  + tot_weight-1) / tot_weight);
+    if (!root->arity || chunk <= 1 || root->depth >= until) {
+      /* We can't split any more, put everything there.  */
+      if (chunk) {
+	/* Fill cpusets with ours */
+	unsigned j;
+	for (j=0; j < chunk; j++)
+	  cpusetp[j] = hwloc_bitmap_dup(cpuset);
+      } else {
+	/* We got no chunk, just merge our cpuset to a previous one
+	 * (the first chunk cannot be empty)
+	 * so that this root doesn't get ignored.
+	 */
+	assert(given);
+	hwloc_bitmap_or(cpusetp[-1], cpusetp[-1], cpuset);
+      }
+    } else {
+      /* Still more to distribute, recurse into children */
+      hwloc_distrib(topology, root->children, root->arity, cpusetp, chunk, until, flags);
+    }
+    cpusetp += chunk;
+    given += chunk;
+    givenweight += weight;
+  }
+
+  return 0;
+}
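+
+/* Editor's sketch, not part of upstream hwloc: distribute four threads
+ * over the whole machine, then singlify each cpuset before binding, as
+ * suggested above. Assumes <limits.h> for INT_MAX. Guarded by #if 0. */
+#if 0
+static void distribute_four_threads(hwloc_topology_t topology)
+{
+  hwloc_obj_t root = hwloc_get_root_obj(topology);
+  hwloc_cpuset_t sets[4];
+  unsigned i;
+  if (hwloc_distrib(topology, &root, 1, sets, 4, INT_MAX, 0) < 0)
+    return;
+  for (i = 0; i < 4; i++) {
+    hwloc_bitmap_singlify(sets[i]);
+    /* bind thread i to sets[i] here, e.g. with hwloc_set_cpubind() */
+    hwloc_bitmap_free(sets[i]);
+  }
+}
+#endif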
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_topology_sets CPU and node sets of entire topologies
+ * @{
+ */
+
+/** \brief Get complete CPU set
+ *
+ * \return the complete CPU set of processors of the system.
+ *
+ * \note The returned cpuset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ *
+ * \note This is equivalent to retrieving the root object complete CPU-set.
+ */
+HWLOC_DECLSPEC hwloc_const_cpuset_t
+hwloc_topology_get_complete_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure;
+
+/** \brief Get topology CPU set
+ *
+ * \return the CPU set of processors of the system for which hwloc
+ * provides topology information. This is equivalent to the cpuset of the
+ * system object.
+ *
+ * \note The returned cpuset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ *
+ * \note This is equivalent to retrieving the root object CPU-set.
+ */
+HWLOC_DECLSPEC hwloc_const_cpuset_t
+hwloc_topology_get_topology_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure;
+
+/** \brief Get allowed CPU set
+ *
+ * \return the CPU set of allowed processors of the system.
+ *
+ * \note If the topology flag ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED was not set,
+ * this is identical to hwloc_topology_get_topology_cpuset(), which means
+ * all PUs are allowed.
+ *
+ * \note If ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED was set, applying
+ * hwloc_bitmap_intersects() on the result of this function and on an object
+ * cpuset checks whether there are allowed PUs inside that object.
+ * Applying hwloc_bitmap_and() returns the list of these allowed PUs.
+ *
+ * \note The returned cpuset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+HWLOC_DECLSPEC hwloc_const_cpuset_t
+hwloc_topology_get_allowed_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure;
+
+/** \brief Get complete node set
+ *
+ * \return the complete node set of memory of the system.
+ *
+ * \note The returned nodeset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ *
+ * \note This is equivalent to retrieving the root object complete nodeset.
+ */
+HWLOC_DECLSPEC hwloc_const_nodeset_t
+hwloc_topology_get_complete_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure;
+
+/** \brief Get topology node set
+ *
+ * \return the node set of memory of the system for which hwloc
+ * provides topology information. This is equivalent to the nodeset of the
+ * system object.
+ *
+ * \note The returned nodeset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ *
+ * \note This is equivalent to retrieving the root object nodeset.
+ */
+HWLOC_DECLSPEC hwloc_const_nodeset_t
+hwloc_topology_get_topology_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure;
+
+/** \brief Get allowed node set
+ *
+ * \return the node set of allowed memory of the system.
+ *
+ * \note If the topology flag ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED was not set,
+ * this is identical to hwloc_topology_get_topology_nodeset(), which means
+ * all NUMA nodes are allowed.
+ *
+ * \note If ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED was set, applying
+ * hwloc_bitmap_intersects() on the result of this function and on an object
+ * nodeset checks whether there are allowed NUMA nodes inside that object.
+ * Applying hwloc_bitmap_and() returns the list of these allowed NUMA nodes.
+ *
+ * \note The returned nodeset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+HWLOC_DECLSPEC hwloc_const_nodeset_t
+hwloc_topology_get_allowed_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure;
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_nodeset_convert Converting between CPU sets and node sets
+ *
+ * @{
+ */
+
+/** \brief Convert a CPU set into a NUMA node set
+ *
+ * For each PU included in the input \p _cpuset, set the corresponding
+ * local NUMA node(s) in the output \p nodeset.
+ *
+ * If some NUMA nodes have no CPUs at all, this function never sets their
+ * indexes in the output node set, even if a full CPU set is given in input.
+ *
+ * Hence the entire topology CPU set is converted into the set of all nodes
+ * that have some local CPUs.
+ */
+static __hwloc_inline int
+hwloc_cpuset_to_nodeset(hwloc_topology_t topology, hwloc_const_cpuset_t _cpuset, hwloc_nodeset_t nodeset)
+{
+	int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+	hwloc_obj_t obj = NULL;
+	assert(depth != HWLOC_TYPE_DEPTH_UNKNOWN);
+	hwloc_bitmap_zero(nodeset);
+	while ((obj = hwloc_get_next_obj_covering_cpuset_by_depth(topology, _cpuset, depth, obj)) != NULL)
+		if (hwloc_bitmap_set(nodeset, obj->os_index) < 0)
+			return -1;
+	return 0;
+}
+
+/** \brief Convert a NUMA node set into a CPU set
+ *
+ * For each NUMA node included in the input \p nodeset, set the corresponding
+ * local PUs in the output \p _cpuset.
+ *
+ * If some CPUs have no local NUMA nodes, this function never sets their
+ * indexes in the output CPU set, even if a full node set is given in input.
+ *
+ * Hence the entire topology node set is converted into the set of all CPUs
+ * that have some local NUMA nodes.
+ */
+static __hwloc_inline int
+hwloc_cpuset_from_nodeset(hwloc_topology_t topology, hwloc_cpuset_t _cpuset, hwloc_const_nodeset_t nodeset)
+{
+	int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+	hwloc_obj_t obj = NULL;
+	assert(depth != HWLOC_TYPE_DEPTH_UNKNOWN);
+	hwloc_bitmap_zero(_cpuset);
+	while ((obj = hwloc_get_next_obj_by_depth(topology, depth, obj)) != NULL) {
+		if (hwloc_bitmap_isset(nodeset, obj->os_index))
+			/* no need to check obj->cpuset because objects in levels always have a cpuset */
+			if (hwloc_bitmap_or(_cpuset, _cpuset, obj->cpuset) < 0)
+				return -1;
+	}
+	return 0;
+}
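+
+/* Editor's sketch, not part of upstream hwloc: report how many NUMA
+ * nodes are local to a given binding, using the conversion above.
+ * Assumes <stdio.h>. Guarded by #if 0. */
+#if 0
+static void print_local_node_count(hwloc_topology_t topology, hwloc_const_cpuset_t binding)
+{
+  hwloc_nodeset_t nodes = hwloc_bitmap_alloc();
+  if (nodes && !hwloc_cpuset_to_nodeset(topology, binding, nodes))
+    printf("%d local NUMA node(s)\n", hwloc_bitmap_weight(nodes));
+  hwloc_bitmap_free(nodes);
+}
+#endif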
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_advanced_io Finding I/O objects
+ * @{
+ */
+
+/** \brief Get the first non-I/O ancestor object.
+ *
+ * Given the I/O object \p ioobj, find the smallest non-I/O ancestor
+ * object. This object (normal or memory) may then be used for binding
+ * because it has non-NULL CPU and node sets
+ * and because its locality is the same as \p ioobj.
+ *
+ * \note The resulting object is usually a normal object but it could also
+ * be a memory object (e.g. NUMA node) in future platforms if I/O objects
+ * ever get attached to memory instead of CPUs.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_non_io_ancestor_obj(hwloc_topology_t topology __hwloc_attribute_unused,
+			      hwloc_obj_t ioobj)
+{
+  hwloc_obj_t obj = ioobj;
+  while (obj && !obj->cpuset) {
+    obj = obj->parent;
+  }
+  return obj;
+}
+
+/** \brief Get the next PCI device in the system.
+ *
+ * \return the first PCI device if \p prev is \c NULL.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_pcidev(hwloc_topology_t topology, hwloc_obj_t prev)
+{
+  return hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PCI_DEVICE, prev);
+}
+
+/** \brief Find the PCI device object matching the PCI bus id
+ * given by its domain, bus, device and function numbers.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pcidev_by_busid(hwloc_topology_t topology,
+			  unsigned domain, unsigned bus, unsigned dev, unsigned func)
+{
+  hwloc_obj_t obj = NULL;
+  while ((obj = hwloc_get_next_pcidev(topology, obj)) != NULL) {
+    if (obj->attr->pcidev.domain == domain
+	&& obj->attr->pcidev.bus == bus
+	&& obj->attr->pcidev.dev == dev
+	&& obj->attr->pcidev.func == func)
+      return obj;
+  }
+  return NULL;
+}
+
+/** \brief Find the PCI device object matching the PCI bus id
+ * given as a string xxxx:yy:zz.t or yy:zz.t.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pcidev_by_busidstring(hwloc_topology_t topology, const char *busid)
+{
+  unsigned domain = 0; /* default */
+  unsigned bus, dev, func;
+
+  if (sscanf(busid, "%x:%x.%x", &bus, &dev, &func) != 3
+      && sscanf(busid, "%x:%x:%x.%x", &domain, &bus, &dev, &func) != 4) {
+    errno = EINVAL;
+    return NULL;
+  }
+
+  return hwloc_get_pcidev_by_busid(topology, domain, bus, dev, func);
+}
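+
+/* Editor's sketch, not part of upstream hwloc: look up a PCI device by
+ * bus id string; "0000:03:00.0" is a made-up id for illustration.
+ * Guarded by #if 0. */
+#if 0
+static hwloc_obj_t find_example_pcidev(hwloc_topology_t topology)
+{
+  return hwloc_get_pcidev_by_busidstring(topology, "0000:03:00.0");
+}
+#endif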
+
+/** \brief Get the next OS device in the system.
+ *
+ * \return the first OS device if \p prev is \c NULL.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_osdev(hwloc_topology_t topology, hwloc_obj_t prev)
+{
+  return hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_OS_DEVICE, prev);
+}
+
+/** \brief Get the next bridge in the system.
+ *
+ * \return the first bridge if \p prev is \c NULL.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_bridge(hwloc_topology_t topology, hwloc_obj_t prev)
+{
+  return hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_BRIDGE, prev);
+}
+
+/** \brief Checks whether a given bridge covers a given PCI bus.
+ */
+static __hwloc_inline int
+hwloc_bridge_covers_pcibus(hwloc_obj_t bridge,
+			   unsigned domain, unsigned bus)
+{
+  return bridge->type == HWLOC_OBJ_BRIDGE
+    && bridge->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI
+    && bridge->attr->bridge.downstream.pci.domain == domain
+    && bridge->attr->bridge.downstream.pci.secondary_bus <= bus
+    && bridge->attr->bridge.downstream.pci.subordinate_bus >= bus;
+}
+
+/** @} */
+
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_HELPER_H */
diff --git a/deps/hwloc/include/hwloc/inlines.h b/deps/hwloc/include/hwloc/inlines.h
new file mode 100644
index 000000000..494209ea6
--- /dev/null
+++ b/deps/hwloc/include/hwloc/inlines.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2018 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2010 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/**
+ * This file contains the inline code of functions declared in hwloc.h
+ */
+
+#ifndef HWLOC_INLINES_H
+#define HWLOC_INLINES_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+#include <stdlib.h>
+#include <errno.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static __hwloc_inline int
+hwloc_get_type_or_below_depth (hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+
+  if (depth != HWLOC_TYPE_DEPTH_UNKNOWN)
+    return depth;
+
+  /* find the highest existing level with type order >= */
+  for(depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU); ; depth--)
+    if (hwloc_compare_types(hwloc_get_depth_type(topology, depth), type) < 0)
+      return depth+1;
+
+  /* Shouldn't ever happen, as there is always a Machine level with lower order and known depth.  */
+  /* abort(); */
+}
+
+static __hwloc_inline int
+hwloc_get_type_or_above_depth (hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+
+  if (depth != HWLOC_TYPE_DEPTH_UNKNOWN)
+    return depth;
+
+  /* find the lowest existing level with type order <= */
+  for(depth = 0; ; depth++)
+    if (hwloc_compare_types(hwloc_get_depth_type(topology, depth), type) > 0)
+      return depth-1;
+
+  /* Shouldn't ever happen, as there is always a PU level with higher order and known depth.  */
+  /* abort(); */
+}
+
+static __hwloc_inline int
+hwloc_get_nbobjs_by_type (hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+    return 0;
+  if (depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return -1; /* FIXME: aggregate nbobjs from different levels? */
+  return (int) hwloc_get_nbobjs_by_depth(topology, depth);
+}
+
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type, unsigned idx)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+    return NULL;
+  if (depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return NULL;
+  return hwloc_get_obj_by_depth(topology, depth, idx);
+}
+
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_depth (hwloc_topology_t topology, int depth, hwloc_obj_t prev)
+{
+  if (!prev)
+    return hwloc_get_obj_by_depth (topology, depth, 0);
+  if (prev->depth != depth)
+    return NULL;
+  return prev->next_cousin;
+}
+
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type,
+			    hwloc_obj_t prev)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return NULL;
+  return hwloc_get_next_obj_by_depth (topology, depth, prev);
+}
+
+static __hwloc_inline hwloc_obj_t
+hwloc_get_root_obj (hwloc_topology_t topology)
+{
+  return hwloc_get_obj_by_depth (topology, 0, 0);
+}
+
+static __hwloc_inline const char *
+hwloc_obj_get_info_by_name(hwloc_obj_t obj, const char *name)
+{
+  unsigned i;
+  for(i=0; i<obj->infos_count; i++) {
+    struct hwloc_info_s *info = &obj->infos[i];
+    if (!strcmp(info->name, name))
+      return info->value;
+  }
+  return NULL;
+}
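+
+/* Editor's sketch, not part of upstream hwloc: read an info attribute;
+ * "CPUModel" is a common key but its presence is platform-dependent.
+ * Guarded by #if 0. */
+#if 0
+static const char *root_cpu_model(hwloc_topology_t topology)
+{
+  /* may return NULL when the key is absent */
+  return hwloc_obj_get_info_by_name(hwloc_get_root_obj(topology), "CPUModel");
+}
+#endif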
+
+static __hwloc_inline void *
+hwloc_alloc_membind_policy(hwloc_topology_t topology, size_t len, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+{
+  void *p = hwloc_alloc_membind(topology, len, set, policy, flags);
+  if (p)
+    return p;
+
+  if (hwloc_set_membind(topology, set, policy, flags) < 0)
+    /* hwloc_set_membind() takes care of ignoring errors if non-STRICT */
+    return NULL;
+
+  p = hwloc_alloc(topology, len);
+  if (p && policy != HWLOC_MEMBIND_FIRSTTOUCH)
+    /* Enforce the binding by touching the data */
+    memset(p, 0, len);
+  return p;
+}
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_INLINES_H */
diff --git a/deps/hwloc/include/hwloc/memattrs.h b/deps/hwloc/include/hwloc/memattrs.h
new file mode 100644
index 000000000..2494abb08
--- /dev/null
+++ b/deps/hwloc/include/hwloc/memattrs.h
@@ -0,0 +1,455 @@
+/*
+ * Copyright © 2019-2020 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Memory node attributes.
+ */
+
+#ifndef HWLOC_MEMATTR_H
+#define HWLOC_MEMATTR_H
+
+#include "hwloc.h"
+
+#ifdef __cplusplus
+extern "C" {
+#elif 0
+}
+#endif
+
+/** \defgroup hwlocality_memattrs Comparing memory node attributes for finding where to allocate on
+ *
+ * Platforms with heterogeneous memory require ways to decide whether
+ * a buffer should be allocated on "fast" memory (such as HBM),
+ * "normal" memory (DDR) or even "slow" but large-capacity memory
+ * (non-volatile memory).
+ * These memory nodes are called "Targets" while the CPU accessing them
+ * is called the "Initiator". Access performance depends on their
+ * locality (NUMA platforms) as well as the intrinsic performance
+ * of the targets (heterogeneous platforms).
+ *
+ * The following attributes describe the performance of memory accesses
+ * from an Initiator to a memory Target, for instance their latency
+ * or bandwidth.
+ * Initiators performing these memory accesses are usually some PUs or Cores
+ * (described as a CPU set).
+ * Hence a Core may choose where to allocate a memory buffer by comparing
+ * the attributes of different target memory nodes nearby.
+ *
+ * There are also some attributes that are system-wide.
+ * Their value does not depend on a specific initiator performing
+ * an access.
+ * The memory node Capacity is an example of such an attribute without
+ * an initiator.
+ *
+ * One way to use this API is to start with a cpuset describing the Cores where
+ * a program is bound. The best target NUMA node for allocating memory in this
+ * program on these Cores may be obtained by passing this cpuset as an initiator
+ * to hwloc_memattr_get_best_target() with the relevant memory attribute.
+ * For instance, if the code is latency limited, use the Latency attribute.
+ *
+ * A more flexible approach consists in getting the list of local NUMA nodes
+ * by passing this cpuset to hwloc_get_local_numanode_objs().
+ * Attribute values for these nodes, if any, may then be obtained with
+ * hwloc_memattr_get_value() and manually compared with the desired criteria.
+ *
+ * \note The API also supports specific objects as initiator,
+ * but it is currently not used internally by hwloc.
+ * Users may for instance use it to provide custom performance
+ * values for host memory accesses performed by GPUs.
+ *
+ * \note The interface actually also accepts targets that are not NUMA nodes.
+ * @{
+ */
+
+/** \brief Memory node attributes. */
+enum hwloc_memattr_id_e {
+  /** \brief "Capacity".
+   * The capacity is returned in bytes
+   * (local_memory attribute in objects).
+   *
+   * Best capacity nodes are nodes with <b>higher capacity</b>.
+   *
+   * No initiator is involved when looking at this attribute.
+   * The corresponding attribute flags are ::HWLOC_MEMATTR_FLAG_HIGHER_FIRST.
+   */
+  HWLOC_MEMATTR_ID_CAPACITY = 0,
+
+  /** \brief "Locality".
+   * The locality is returned as the number of PUs in that locality
+   * (e.g. the weight of its cpuset).
+   *
+   * Best locality nodes are nodes with <b>smaller locality</b>
+   * (nodes that are local to very few PUs).
+   * Poor locality nodes are nodes with larger locality
+   * (nodes that are local to the entire machine).
+   *
+   * No initiator is involved when looking at this attribute.
+   * The corresponding attribute flags are ::HWLOC_MEMATTR_FLAG_LOWER_FIRST.
+   */
+  HWLOC_MEMATTR_ID_LOCALITY = 1,
+
+  /** \brief "Bandwidth".
+   * The bandwidth is returned in MiB/s, as seen from the given initiator location.
+   * Best bandwidth nodes are nodes with <b>higher bandwidth</b>.
+   * The corresponding attribute flags are ::HWLOC_MEMATTR_FLAG_HIGHER_FIRST
+   * and ::HWLOC_MEMATTR_FLAG_NEED_INITIATOR.
+   */
+  HWLOC_MEMATTR_ID_BANDWIDTH = 2,
+
+  /** \brief "Latency".
+   * The latency is returned as nanoseconds, as seen from the given initiator location.
+   * Best latency nodes are nodes with <b>smaller latency</b>.
+   * The corresponding attribute flags are ::HWLOC_MEMATTR_FLAG_LOWER_FIRST
+   * and ::HWLOC_MEMATTR_FLAG_NEED_INITIATOR.
+   */
+  HWLOC_MEMATTR_ID_LATENCY = 3
+
+  /* TODO read vs write, persistence? */
+};
+
+/** \brief A memory attribute identifier.
+ * May be either one of ::hwloc_memattr_id_e or a new id returned by hwloc_memattr_register().
+ */
+typedef unsigned hwloc_memattr_id_t;
+
+/** \brief Return the identifier of the memory attribute with the given name.
+ */
+HWLOC_DECLSPEC int
+hwloc_memattr_get_by_name(hwloc_topology_t topology,
+                          const char *name,
+                          hwloc_memattr_id_t *id);
+
+
+/** \brief Type of location. */
+enum hwloc_location_type_e {
+  /** \brief Location is given as a cpuset, in the location cpuset union field. \hideinitializer */
+  HWLOC_LOCATION_TYPE_CPUSET = 1,
+  /** \brief Location is given as an object, in the location object union field. \hideinitializer */
+  HWLOC_LOCATION_TYPE_OBJECT = 0
+};
+
+/** \brief Where to measure attributes from. */
+struct hwloc_location {
+  /** \brief Type of location. */
+  enum hwloc_location_type_e type;
+  /** \brief Actual location. */
+  union hwloc_location_u {
+    /** \brief Location as a cpuset, when the location type is ::HWLOC_LOCATION_TYPE_CPUSET. */
+    hwloc_cpuset_t cpuset;
+    /** \brief Location as an object, when the location type is ::HWLOC_LOCATION_TYPE_OBJECT. */
+    hwloc_obj_t object;
+  } location;
+};
+
+
+/** \brief Flags for selecting target NUMA nodes. */
+enum hwloc_local_numanode_flag_e {
+  /** \brief Select NUMA nodes whose locality is larger than the given cpuset.
+   * For instance, if a single PU (or its cpuset) is given in \p initiator,
+   * select all nodes close to the package that contains this PU.
+   * \hideinitializer
+   */
+  HWLOC_LOCAL_NUMANODE_FLAG_LARGER_LOCALITY = (1UL<<0),
+
+  /** \brief Select NUMA nodes whose locality is smaller than the given cpuset.
+   * For instance, if a package (or its cpuset) is given in \p initiator,
+   * also select nodes that are attached to only a half of that package.
+   * \hideinitializer
+   */
+  HWLOC_LOCAL_NUMANODE_FLAG_SMALLER_LOCALITY = (1UL<<1),
+
+  /** \brief Select all NUMA nodes in the topology.
+   * The initiator \p initiator is ignored.
+   * \hideinitializer
+   */
+  HWLOC_LOCAL_NUMANODE_FLAG_ALL = (1UL<<2)
+};
+
+/** \brief Return an array of local NUMA nodes.
+ *
+ * By default only select the NUMA nodes whose locality is exactly
+ * the given \p location. More nodes may be selected if additional flags
+ * are given as an OR'ed set of ::hwloc_local_numanode_flag_e.
+ *
+ * If \p location is given as an explicit object, its CPU set is used
+ * to find NUMA nodes with the corresponding locality.
+ * If the object does not have a CPU set (e.g. I/O object), the CPU
+ * parent (where the I/O object is attached) is used.
+ *
+ * On input, \p nr points to the number of nodes that may be stored
+ * in the \p nodes array.
+ * On output, \p nr will be changed to the number of stored nodes,
+ * or the number of nodes that would have been stored if there were
+ * enough room.
+ *
+ * \note Some of these NUMA nodes may not have any memory attribute
+ * values and hence not be reported as actual targets in other functions.
+ *
+ * \note The number of NUMA nodes in the topology (obtained by
+ * hwloc_bitmap_weight() on the root object nodeset) may be used
+ * to allocate the \p nodes array.
+ *
+ * \note When an object CPU set is given as locality, for instance a Package,
+ * and when flags contain both ::HWLOC_LOCAL_NUMANODE_FLAG_LARGER_LOCALITY
+ * and ::HWLOC_LOCAL_NUMANODE_FLAG_SMALLER_LOCALITY,
+ * the returned array corresponds to the nodeset of that object.
+ */
+HWLOC_DECLSPEC int
+hwloc_get_local_numanode_objs(hwloc_topology_t topology,
+                              struct hwloc_location *location,
+                              unsigned *nr,
+                              hwloc_obj_t *nodes,
+                              unsigned long flags);
+
+
+
+/** \brief Return an attribute value for a specific target NUMA node.
+ *
+ * If the attribute does not relate to a specific initiator
+ * (it does not have the flag ::HWLOC_MEMATTR_FLAG_NEED_INITIATOR),
+ * location \p initiator is ignored and may be \c NULL.
+ *
+ * \p flags must be \c 0 for now.
+ *
+ * \note The initiator \p initiator should be of type ::HWLOC_LOCATION_TYPE_CPUSET
+ * when referring to accesses performed by CPU cores.
+ * ::HWLOC_LOCATION_TYPE_OBJECT is currently unused internally by hwloc,
+ * but users may for instance use it to provide custom information about
+ * host memory accesses performed by GPUs.
+ */
+HWLOC_DECLSPEC int
+hwloc_memattr_get_value(hwloc_topology_t topology,
+                        hwloc_memattr_id_t attribute,
+                        hwloc_obj_t target_node,
+                        struct hwloc_location *initiator,
+                        unsigned long flags,
+                        hwloc_uint64_t *value);
+
+/** \brief Return the best target NUMA node for the given attribute and initiator.
+ *
+ * If the attribute does not relate to a specific initiator
+ * (it does not have the flag ::HWLOC_MEMATTR_FLAG_NEED_INITIATOR),
+ * location \p initiator is ignored and may be \c NULL.
+ *
+ * If \p value is non \c NULL, the corresponding value is returned there.
+ *
+ * If multiple targets have the same attribute values, only one is
+ * returned (and there is no way to clarify how that one is chosen).
+ * Applications that want to detect targets with identical/similar
+ * values, or that want to look at values for multiple attributes,
+ * should rather get all values using hwloc_memattr_get_value()
+ * and manually select the target they consider the best.
+ *
+ * \p flags must be \c 0 for now.
+ *
+ * If there are no matching targets, \c -1 is returned with \p errno set to \c ENOENT.
+ *
+ * \note The initiator \p initiator should be of type ::HWLOC_LOCATION_TYPE_CPUSET
+ * when referring to accesses performed by CPU cores.
+ * ::HWLOC_LOCATION_TYPE_OBJECT is currently unused internally by hwloc,
+ * but users may for instance use it to provide custom information about
+ * host memory accesses performed by GPUs.
+ */
+HWLOC_DECLSPEC int
+hwloc_memattr_get_best_target(hwloc_topology_t topology,
+                              hwloc_memattr_id_t attribute,
+                              struct hwloc_location *initiator,
+                              unsigned long flags,
+                              hwloc_obj_t *best_target, hwloc_uint64_t *value);
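+
+/* Editor's sketch, not part of upstream hwloc: pick the
+ * highest-bandwidth NUMA node for cores given as a cpuset, following
+ * the usage outlined in the group introduction. Guarded by #if 0. */
+#if 0
+static hwloc_obj_t best_bandwidth_node(hwloc_topology_t topology, hwloc_cpuset_t cores)
+{
+  struct hwloc_location initiator;
+  hwloc_obj_t best = NULL;
+  initiator.type = HWLOC_LOCATION_TYPE_CPUSET;
+  initiator.location.cpuset = cores;
+  if (hwloc_memattr_get_best_target(topology, HWLOC_MEMATTR_ID_BANDWIDTH,
+                                    &initiator, 0, &best, NULL) < 0)
+    return NULL; /* e.g. errno == ENOENT when no target has a value */
+  return best;
+}
+#endif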
+
+/** \brief Return the best initiator for the given attribute and target NUMA node.
+ *
+ * If the attribute does not relate to a specific initiator
+ * (it does not have the flag ::HWLOC_MEMATTR_FLAG_NEED_INITIATOR),
+ * \c -1 is returned and \p errno is set to \c EINVAL.
+ *
+ * If \p value is non \c NULL, the corresponding value is returned there.
+ *
+ * If multiple initiators have the same attribute values, only one is
+ * returned (and there is no way to clarify how that one is chosen).
+ * Applications that want to detect initiators with identical/similar
+ * values, or that want to look at values for multiple attributes,
+ * should rather get all values using hwloc_memattr_get_value()
+ * and manually select the initiator they consider the best.
+ *
+ * The returned initiator should not be modified or freed,
+ * it belongs to the topology.
+ *
+ * \p flags must be \c 0 for now.
+ *
+ * If there are no matching initiators, \c -1 is returned with \p errno set to \c ENOENT.
+ */
+HWLOC_DECLSPEC int
+hwloc_memattr_get_best_initiator(hwloc_topology_t topology,
+                                 hwloc_memattr_id_t attribute,
+                                 hwloc_obj_t target,
+                                 unsigned long flags,
+                                 struct hwloc_location *best_initiator, hwloc_uint64_t *value);
+
+/** @} */
+
+
+/** \defgroup hwlocality_memattrs_manage Managing memory attributes
+ * @{
+ */
+
+/** \brief Return the name of a memory attribute.
+ */
+HWLOC_DECLSPEC int
+hwloc_memattr_get_name(hwloc_topology_t topology,
+                       hwloc_memattr_id_t attribute,
+                       const char **name);
+
+/** \brief Return the flags of the given attribute.
+ *
+ * Flags are an OR'ed set of ::hwloc_memattr_flag_e.
+ */
+HWLOC_DECLSPEC int
+hwloc_memattr_get_flags(hwloc_topology_t topology,
+                        hwloc_memattr_id_t attribute,
+                        unsigned long *flags);
+
+/** \brief Memory attribute flags.
+ * Given to hwloc_memattr_register() and returned by hwloc_memattr_get_flags().
+ */
+enum hwloc_memattr_flag_e {
+  /** \brief The best nodes for this memory attribute are those with the higher values.
+   * For instance Bandwidth.
+   */
+  HWLOC_MEMATTR_FLAG_HIGHER_FIRST = (1UL<<0),
+  /** \brief The best nodes for this memory attribute are those with the lower values.
+   * For instance Latency.
+   */
+  HWLOC_MEMATTR_FLAG_LOWER_FIRST = (1UL<<1),
+  /** \brief The value returned for this memory attribute depends on the given initiator.
+   * For instance Bandwidth and Latency, but not Capacity.
+   */
+  HWLOC_MEMATTR_FLAG_NEED_INITIATOR = (1UL<<2)
+};
+
+/** \brief Register a new memory attribute.
+ *
+ * Add a specific memory attribute that is not defined in ::hwloc_memattr_id_e.
+ * Flags are an OR'ed set of ::hwloc_memattr_flag_e. They must contain at least
+ * one of ::HWLOC_MEMATTR_FLAG_HIGHER_FIRST or ::HWLOC_MEMATTR_FLAG_LOWER_FIRST.
+ */
+HWLOC_DECLSPEC int
+hwloc_memattr_register(hwloc_topology_t topology,
+                       const char *name,
+                       unsigned long flags,
+                       hwloc_memattr_id_t *id);
+
+/** \brief Set an attribute value for a specific target NUMA node.
+ *
+ * If the attribute does not relate to a specific initiator
+ * (it does not have the flag ::HWLOC_MEMATTR_FLAG_NEED_INITIATOR),
+ * location \p initiator is ignored and may be \c NULL.
+ *
+ * The initiator will be copied into the topology,
+ * the caller should free anything allocated to store the initiator,
+ * for instance the cpuset.
+ *
+ * \p flags must be \c 0 for now.
+ *
+ * \note The initiator \p initiator should be of type ::HWLOC_LOCATION_TYPE_CPUSET
+ * when referring to accesses performed by CPU cores.
+ * ::HWLOC_LOCATION_TYPE_OBJECT is currently unused internally by hwloc,
+ * but users may for instance use it to provide custom information about
+ * host memory accesses performed by GPUs.
+ */
+HWLOC_DECLSPEC int
+hwloc_memattr_set_value(hwloc_topology_t topology,
+                        hwloc_memattr_id_t attribute,
+                        hwloc_obj_t target_node,
+                        struct hwloc_location *initiator,
+                        unsigned long flags,
+                        hwloc_uint64_t value);
+
+/** \brief Return the target NUMA nodes that have some values for a given attribute.
+ *
+ * Return targets for the given attribute in the \p targets array
+ * (for the given initiator if any).
+ * If \p values is not \c NULL, the corresponding attribute values
+ * are stored in the array it points to.
+ *
+ * On input, \p nr points to the number of targets that may be stored
+ * in the array \p targets (and \p values).
+ * On output, \p nr points to the number of targets (and values) that
+ * were actually found, even if some of them couldn't be stored in the array.
+ * Targets that couldn't be stored are ignored, but the function still
+ * returns success (\c 0). The caller may find out by comparing the value pointed
+ * to by \p nr before and after the function call.
+ *
+ * The returned targets should not be modified or freed,
+ * they belong to the topology.
+ *
+ * Argument \p initiator is ignored if the attribute does not relate to a specific
+ * initiator (it does not have the flag ::HWLOC_MEMATTR_FLAG_NEED_INITIATOR).
+ * Otherwise \p initiator may be non \c NULL to report only targets
+ * that have a value for that initiator.
+ *
+ * \p flags must be \c 0 for now.
+ *
+ * \note This function is meant for tools and debugging (listing internal information)
+ * rather than for application queries. Applications should rather select useful
+ * NUMA nodes with hwloc_get_local_numanode_objs() and then look at their attribute
+ * values.
+ *
+ * \note The initiator \p initiator should be of type ::HWLOC_LOCATION_TYPE_CPUSET
+ * when referring to accesses performed by CPU cores.
+ * ::HWLOC_LOCATION_TYPE_OBJECT is currently unused internally by hwloc,
+ * but users may for instance use it to provide custom information about
+ * host memory accesses performed by GPUs.
+ */
+HWLOC_DECLSPEC int
+hwloc_memattr_get_targets(hwloc_topology_t topology,
+                          hwloc_memattr_id_t attribute,
+                          struct hwloc_location *initiator,
+                          unsigned long flags,
+                          unsigned *nr, hwloc_obj_t *targets, hwloc_uint64_t *values);
+
+/** \brief Return the initiators that have values for a given attribute for a specific target NUMA node.
+ *
+ * Return initiators for the given attribute and target node in the
+ * \p initiators array.
+ * If \p values is not \c NULL, the corresponding attribute values
+ * are stored in the array it points to.
+ *
+ * On input, \p nr points to the number of initiators that may be stored
+ * in the array \p initiators (and \p values).
+ * On output, \p nr points to the number of initiators (and values) that
+ * were actually found, even if some of them couldn't be stored in the array.
+ * Initiators that couldn't be stored are ignored, but the function still
+ * returns success (\c 0). The caller may find out by comparing the value pointed
+ * to by \p nr before and after the function call.
+ *
+ * The returned initiators should not be modified or freed,
+ * they belong to the topology.
+ *
+ * \p flags must be \c 0 for now.
+ *
+ * If the attribute does not relate to a specific initiator
+ * (it does not have the flag ::HWLOC_MEMATTR_FLAG_NEED_INITIATOR),
+ * no initiator is returned.
+ *
+ * \note This function is meant for tools and debugging (listing internal information)
+ * rather than for application queries. Applications should rather select useful
+ * NUMA nodes with hwloc_get_local_numanode_objs() and then look at their attribute
+ * values for some relevant initiators.
+ */
+HWLOC_DECLSPEC int
+hwloc_memattr_get_initiators(hwloc_topology_t topology,
+                             hwloc_memattr_id_t attribute,
+                             hwloc_obj_t target_node,
+                             unsigned long flags,
+                             unsigned *nr, struct hwloc_location *initiators, hwloc_uint64_t *values);
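+/* Sketch: list the recorded initiators for one target node, using the same
+ * two-call pattern as hwloc_memattr_get_targets() (illustration only; id and
+ * node are as in the earlier sketches, consume() is a hypothetical consumer):
+ *
+ *   unsigned i, nr = 0;
+ *   hwloc_memattr_get_initiators(topology, id, node, 0, &nr, NULL, NULL);
+ *   struct hwloc_location *inits = malloc(nr * sizeof(*inits));
+ *   hwloc_uint64_t *vals = malloc(nr * sizeof(*vals));
+ *   hwloc_memattr_get_initiators(topology, id, node, 0, &nr, inits, vals);
+ *   for (i = 0; i < nr; i++)
+ *     if (inits[i].type == HWLOC_LOCATION_TYPE_CPUSET)
+ *       consume(inits[i].location.cpuset, vals[i]);
+ */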
+/** @} */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_MEMATTR_H */
diff --git a/deps/hwloc/include/hwloc/rename.h b/deps/hwloc/include/hwloc/rename.h
new file mode 100644
index 000000000..9f3d5f60c
--- /dev/null
+++ b/deps/hwloc/include/hwloc/rename.h
@@ -0,0 +1,896 @@
+/*
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * Copyright © 2010-2021 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#ifndef HWLOC_RENAME_H
+#define HWLOC_RENAME_H
+
+#include "hwloc/autogen/config.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Only enact these defines if we're actually renaming the symbols
+   (i.e., avoid trying to have no-op defines if we're *not*
+   renaming). */
+
+#if HWLOC_SYM_TRANSFORM
+
+/* Use a preprocessor two-step in order to get the prefixing right.
+   Make 2 macros: HWLOC_NAME and HWLOC_NAME_CAPS for renaming
+   things. */
+
+#define HWLOC_MUNGE_NAME(a, b) HWLOC_MUNGE_NAME2(a, b)
+#define HWLOC_MUNGE_NAME2(a, b) a ## b
+#define HWLOC_NAME(name) HWLOC_MUNGE_NAME(HWLOC_SYM_PREFIX, hwloc_ ## name)
+/* FIXME: should be "HWLOC_ ## name" below, unchanged because it doesn't matter much and could break some embedders hacks */
+#define HWLOC_NAME_CAPS(name) HWLOC_MUNGE_NAME(HWLOC_SYM_PREFIX_CAPS, hwloc_ ## name)
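+
+/* For example, assuming an embedder configures HWLOC_SYM_PREFIX as "foo_":
+ *   HWLOC_NAME(topology_init)
+ *   -> HWLOC_MUNGE_NAME(HWLOC_SYM_PREFIX, hwloc_topology_init)
+ *   -> HWLOC_MUNGE_NAME2(foo_, hwloc_topology_init)
+ *   -> foo_hwloc_topology_init
+ * The HWLOC_MUNGE_NAME/HWLOC_MUNGE_NAME2 indirection forces HWLOC_SYM_PREFIX
+ * to be expanded before the ## token pasting takes place. */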
+
+/* Now define all the "real" names to be the prefixed names.  This
+   allows us to use the real names throughout the code base (i.e.,
+   "hwloc_<foo>"); the preprocessor will adjust to have the prefixed
+   name under the covers. */
+
+/* Names from hwloc.h */
+
+#define hwloc_get_api_version HWLOC_NAME(get_api_version)
+
+#define hwloc_topology HWLOC_NAME(topology)
+#define hwloc_topology_t HWLOC_NAME(topology_t)
+
+#define hwloc_cpuset_t HWLOC_NAME(cpuset_t)
+#define hwloc_const_cpuset_t HWLOC_NAME(const_cpuset_t)
+#define hwloc_nodeset_t HWLOC_NAME(nodeset_t)
+#define hwloc_const_nodeset_t HWLOC_NAME(const_nodeset_t)
+
+#define HWLOC_OBJ_MACHINE HWLOC_NAME_CAPS(OBJ_MACHINE)
+#define HWLOC_OBJ_NUMANODE HWLOC_NAME_CAPS(OBJ_NUMANODE)
+#define HWLOC_OBJ_MEMCACHE HWLOC_NAME_CAPS(OBJ_MEMCACHE)
+#define HWLOC_OBJ_PACKAGE HWLOC_NAME_CAPS(OBJ_PACKAGE)
+#define HWLOC_OBJ_DIE HWLOC_NAME_CAPS(OBJ_DIE)
+#define HWLOC_OBJ_CORE HWLOC_NAME_CAPS(OBJ_CORE)
+#define HWLOC_OBJ_PU HWLOC_NAME_CAPS(OBJ_PU)
+#define HWLOC_OBJ_L1CACHE HWLOC_NAME_CAPS(OBJ_L1CACHE)
+#define HWLOC_OBJ_L2CACHE HWLOC_NAME_CAPS(OBJ_L2CACHE)
+#define HWLOC_OBJ_L3CACHE HWLOC_NAME_CAPS(OBJ_L3CACHE)
+#define HWLOC_OBJ_L4CACHE HWLOC_NAME_CAPS(OBJ_L4CACHE)
+#define HWLOC_OBJ_L5CACHE HWLOC_NAME_CAPS(OBJ_L5CACHE)
+#define HWLOC_OBJ_L1ICACHE HWLOC_NAME_CAPS(OBJ_L1ICACHE)
+#define HWLOC_OBJ_L2ICACHE HWLOC_NAME_CAPS(OBJ_L2ICACHE)
+#define HWLOC_OBJ_L3ICACHE HWLOC_NAME_CAPS(OBJ_L3ICACHE)
+#define HWLOC_OBJ_MISC HWLOC_NAME_CAPS(OBJ_MISC)
+#define HWLOC_OBJ_GROUP HWLOC_NAME_CAPS(OBJ_GROUP)
+#define HWLOC_OBJ_BRIDGE HWLOC_NAME_CAPS(OBJ_BRIDGE)
+#define HWLOC_OBJ_PCI_DEVICE HWLOC_NAME_CAPS(OBJ_PCI_DEVICE)
+#define HWLOC_OBJ_OS_DEVICE HWLOC_NAME_CAPS(OBJ_OS_DEVICE)
+#define HWLOC_OBJ_TYPE_MAX HWLOC_NAME_CAPS(OBJ_TYPE_MAX)
+#define hwloc_obj_type_t HWLOC_NAME(obj_type_t)
+
+#define hwloc_obj_cache_type_e HWLOC_NAME(obj_cache_type_e)
+#define hwloc_obj_cache_type_t HWLOC_NAME(obj_cache_type_t)
+#define HWLOC_OBJ_CACHE_UNIFIED HWLOC_NAME_CAPS(OBJ_CACHE_UNIFIED)
+#define HWLOC_OBJ_CACHE_DATA HWLOC_NAME_CAPS(OBJ_CACHE_DATA)
+#define HWLOC_OBJ_CACHE_INSTRUCTION HWLOC_NAME_CAPS(OBJ_CACHE_INSTRUCTION)
+
+#define hwloc_obj_bridge_type_e HWLOC_NAME(obj_bridge_type_e)
+#define hwloc_obj_bridge_type_t HWLOC_NAME(obj_bridge_type_t)
+#define HWLOC_OBJ_BRIDGE_HOST HWLOC_NAME_CAPS(OBJ_BRIDGE_HOST)
+#define HWLOC_OBJ_BRIDGE_PCI HWLOC_NAME_CAPS(OBJ_BRIDGE_PCI)
+
+#define hwloc_obj_osdev_type_e HWLOC_NAME(obj_osdev_type_e)
+#define hwloc_obj_osdev_type_t HWLOC_NAME(obj_osdev_type_t)
+#define HWLOC_OBJ_OSDEV_BLOCK HWLOC_NAME_CAPS(OBJ_OSDEV_BLOCK)
+#define HWLOC_OBJ_OSDEV_GPU HWLOC_NAME_CAPS(OBJ_OSDEV_GPU)
+#define HWLOC_OBJ_OSDEV_NETWORK HWLOC_NAME_CAPS(OBJ_OSDEV_NETWORK)
+#define HWLOC_OBJ_OSDEV_OPENFABRICS HWLOC_NAME_CAPS(OBJ_OSDEV_OPENFABRICS)
+#define HWLOC_OBJ_OSDEV_DMA HWLOC_NAME_CAPS(OBJ_OSDEV_DMA)
+#define HWLOC_OBJ_OSDEV_COPROC HWLOC_NAME_CAPS(OBJ_OSDEV_COPROC)
+
+#define hwloc_compare_types HWLOC_NAME(compare_types)
+
+#define hwloc_obj HWLOC_NAME(obj)
+#define hwloc_obj_t HWLOC_NAME(obj_t)
+
+#define hwloc_info_s HWLOC_NAME(info_s)
+
+#define hwloc_obj_attr_u HWLOC_NAME(obj_attr_u)
+#define hwloc_numanode_attr_s HWLOC_NAME(numanode_attr_s)
+#define hwloc_memory_page_type_s HWLOC_NAME(memory_page_type_s)
+#define hwloc_cache_attr_s HWLOC_NAME(cache_attr_s)
+#define hwloc_group_attr_s HWLOC_NAME(group_attr_s)
+#define hwloc_pcidev_attr_s HWLOC_NAME(pcidev_attr_s)
+#define hwloc_bridge_attr_s HWLOC_NAME(bridge_attr_s)
+#define hwloc_osdev_attr_s HWLOC_NAME(osdev_attr_s)
+
+#define hwloc_topology_init HWLOC_NAME(topology_init)
+#define hwloc_topology_load HWLOC_NAME(topology_load)
+#define hwloc_topology_destroy HWLOC_NAME(topology_destroy)
+#define hwloc_topology_dup HWLOC_NAME(topology_dup)
+#define hwloc_topology_abi_check HWLOC_NAME(topology_abi_check)
+#define hwloc_topology_check HWLOC_NAME(topology_check)
+
+#define hwloc_topology_flags_e HWLOC_NAME(topology_flags_e)
+
+#define HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED HWLOC_NAME_CAPS(TOPOLOGY_FLAG_WITH_DISALLOWED)
+#define HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM HWLOC_NAME_CAPS(TOPOLOGY_FLAG_IS_THISSYSTEM)
+#define HWLOC_TOPOLOGY_FLAG_THISSYSTEM_ALLOWED_RESOURCES HWLOC_NAME_CAPS(TOPOLOGY_FLAG_THISSYSTEM_ALLOWED_RESOURCES)
+#define HWLOC_TOPOLOGY_FLAG_IMPORT_SUPPORT HWLOC_NAME_CAPS(TOPOLOGY_FLAG_IMPORT_SUPPORT)
+
+#define hwloc_topology_set_pid HWLOC_NAME(topology_set_pid)
+#define hwloc_topology_set_synthetic HWLOC_NAME(topology_set_synthetic)
+#define hwloc_topology_set_xml HWLOC_NAME(topology_set_xml)
+#define hwloc_topology_set_xmlbuffer HWLOC_NAME(topology_set_xmlbuffer)
+#define hwloc_topology_components_flag_e HWLOC_NAME(hwloc_topology_components_flag_e)
+#define HWLOC_TOPOLOGY_COMPONENTS_FLAG_BLACKLIST HWLOC_NAME_CAPS(TOPOLOGY_COMPONENTS_FLAG_BLACKLIST)
+#define hwloc_topology_set_components HWLOC_NAME(topology_set_components)
+
+#define hwloc_topology_set_flags HWLOC_NAME(topology_set_flags)
+#define hwloc_topology_is_thissystem HWLOC_NAME(topology_is_thissystem)
+#define hwloc_topology_get_flags HWLOC_NAME(topology_get_flags)
+#define hwloc_topology_discovery_support HWLOC_NAME(topology_discovery_support)
+#define hwloc_topology_cpubind_support HWLOC_NAME(topology_cpubind_support)
+#define hwloc_topology_membind_support HWLOC_NAME(topology_membind_support)
+#define hwloc_topology_misc_support HWLOC_NAME(topology_misc_support)
+#define hwloc_topology_support HWLOC_NAME(topology_support)
+#define hwloc_topology_get_support HWLOC_NAME(topology_get_support)
+
+#define hwloc_type_filter_e HWLOC_NAME(type_filter_e)
+#define HWLOC_TYPE_FILTER_KEEP_ALL HWLOC_NAME_CAPS(TYPE_FILTER_KEEP_ALL)
+#define HWLOC_TYPE_FILTER_KEEP_NONE HWLOC_NAME_CAPS(TYPE_FILTER_KEEP_NONE)
+#define HWLOC_TYPE_FILTER_KEEP_STRUCTURE HWLOC_NAME_CAPS(TYPE_FILTER_KEEP_STRUCTURE)
+#define HWLOC_TYPE_FILTER_KEEP_IMPORTANT HWLOC_NAME_CAPS(TYPE_FILTER_KEEP_IMPORTANT)
+#define hwloc_topology_set_type_filter HWLOC_NAME(topology_set_type_filter)
+#define hwloc_topology_get_type_filter HWLOC_NAME(topology_get_type_filter)
+#define hwloc_topology_set_all_types_filter HWLOC_NAME(topology_set_all_types_filter)
+#define hwloc_topology_set_cache_types_filter HWLOC_NAME(topology_set_cache_types_filter)
+#define hwloc_topology_set_icache_types_filter HWLOC_NAME(topology_set_icache_types_filter)
+#define hwloc_topology_set_io_types_filter HWLOC_NAME(topology_set_io_types_filter)
+
+#define hwloc_topology_set_userdata HWLOC_NAME(topology_set_userdata)
+#define hwloc_topology_get_userdata HWLOC_NAME(topology_get_userdata)
+
+#define hwloc_restrict_flags_e HWLOC_NAME(restrict_flags_e)
+#define HWLOC_RESTRICT_FLAG_REMOVE_CPULESS HWLOC_NAME_CAPS(RESTRICT_FLAG_REMOVE_CPULESS)
+#define HWLOC_RESTRICT_FLAG_BYNODESET HWLOC_NAME_CAPS(RESTRICT_FLAG_BYNODESET)
+#define HWLOC_RESTRICT_FLAG_REMOVE_MEMLESS HWLOC_NAME_CAPS(RESTRICT_FLAG_REMOVE_MEMLESS)
+#define HWLOC_RESTRICT_FLAG_ADAPT_MISC HWLOC_NAME_CAPS(RESTRICT_FLAG_ADAPT_MISC)
+#define HWLOC_RESTRICT_FLAG_ADAPT_IO HWLOC_NAME_CAPS(RESTRICT_FLAG_ADAPT_IO)
+#define hwloc_topology_restrict HWLOC_NAME(topology_restrict)
+
+#define hwloc_allow_flags_e HWLOC_NAME(allow_flags_e)
+#define HWLOC_ALLOW_FLAG_ALL HWLOC_NAME_CAPS(ALLOW_FLAG_ALL)
+#define HWLOC_ALLOW_FLAG_LOCAL_RESTRICTIONS HWLOC_NAME_CAPS(ALLOW_FLAG_LOCAL_RESTRICTIONS)
+#define HWLOC_ALLOW_FLAG_CUSTOM HWLOC_NAME_CAPS(ALLOW_FLAG_CUSTOM)
+#define hwloc_topology_allow HWLOC_NAME(topology_allow)
+
+#define hwloc_topology_insert_misc_object HWLOC_NAME(topology_insert_misc_object)
+#define hwloc_topology_alloc_group_object HWLOC_NAME(topology_alloc_group_object)
+#define hwloc_topology_insert_group_object HWLOC_NAME(topology_insert_group_object)
+#define hwloc_obj_add_other_obj_sets HWLOC_NAME(obj_add_other_obj_sets)
+#define hwloc_topology_refresh HWLOC_NAME(topology_refresh)
+
+#define hwloc_topology_get_depth HWLOC_NAME(topology_get_depth)
+#define hwloc_get_type_depth HWLOC_NAME(get_type_depth)
+#define hwloc_get_memory_parents_depth HWLOC_NAME(get_memory_parents_depth)
+
+#define hwloc_get_type_depth_e HWLOC_NAME(get_type_depth_e)
+#define HWLOC_TYPE_DEPTH_UNKNOWN HWLOC_NAME_CAPS(TYPE_DEPTH_UNKNOWN)
+#define HWLOC_TYPE_DEPTH_MULTIPLE HWLOC_NAME_CAPS(TYPE_DEPTH_MULTIPLE)
+#define HWLOC_TYPE_DEPTH_BRIDGE HWLOC_NAME_CAPS(TYPE_DEPTH_BRIDGE)
+#define HWLOC_TYPE_DEPTH_PCI_DEVICE HWLOC_NAME_CAPS(TYPE_DEPTH_PCI_DEVICE)
+#define HWLOC_TYPE_DEPTH_OS_DEVICE HWLOC_NAME_CAPS(TYPE_DEPTH_OS_DEVICE)
+#define HWLOC_TYPE_DEPTH_MISC HWLOC_NAME_CAPS(TYPE_DEPTH_MISC)
+#define HWLOC_TYPE_DEPTH_NUMANODE HWLOC_NAME_CAPS(TYPE_DEPTH_NUMANODE)
+#define HWLOC_TYPE_DEPTH_MEMCACHE HWLOC_NAME_CAPS(TYPE_DEPTH_MEMCACHE)
+
+#define hwloc_get_depth_type HWLOC_NAME(get_depth_type)
+#define hwloc_get_nbobjs_by_depth HWLOC_NAME(get_nbobjs_by_depth)
+#define hwloc_get_nbobjs_by_type HWLOC_NAME(get_nbobjs_by_type)
+
+#define hwloc_get_obj_by_depth HWLOC_NAME(get_obj_by_depth)
+#define hwloc_get_obj_by_type HWLOC_NAME(get_obj_by_type)
+
+#define hwloc_obj_type_string HWLOC_NAME(obj_type_string)
+#define hwloc_obj_type_snprintf HWLOC_NAME(obj_type_snprintf)
+#define hwloc_obj_attr_snprintf HWLOC_NAME(obj_attr_snprintf)
+#define hwloc_type_sscanf HWLOC_NAME(type_sscanf)
+#define hwloc_type_sscanf_as_depth HWLOC_NAME(type_sscanf_as_depth)
+
+#define hwloc_obj_get_info_by_name HWLOC_NAME(obj_get_info_by_name)
+#define hwloc_obj_add_info HWLOC_NAME(obj_add_info)
+
+#define HWLOC_CPUBIND_PROCESS HWLOC_NAME_CAPS(CPUBIND_PROCESS)
+#define HWLOC_CPUBIND_THREAD HWLOC_NAME_CAPS(CPUBIND_THREAD)
+#define HWLOC_CPUBIND_STRICT HWLOC_NAME_CAPS(CPUBIND_STRICT)
+#define HWLOC_CPUBIND_NOMEMBIND HWLOC_NAME_CAPS(CPUBIND_NOMEMBIND)
+
+#define hwloc_cpubind_flags_t HWLOC_NAME(cpubind_flags_t)
+
+#define hwloc_set_cpubind HWLOC_NAME(set_cpubind)
+#define hwloc_get_cpubind HWLOC_NAME(get_cpubind)
+#define hwloc_set_proc_cpubind HWLOC_NAME(set_proc_cpubind)
+#define hwloc_get_proc_cpubind HWLOC_NAME(get_proc_cpubind)
+#define hwloc_set_thread_cpubind HWLOC_NAME(set_thread_cpubind)
+#define hwloc_get_thread_cpubind HWLOC_NAME(get_thread_cpubind)
+
+#define hwloc_get_last_cpu_location HWLOC_NAME(get_last_cpu_location)
+#define hwloc_get_proc_last_cpu_location HWLOC_NAME(get_proc_last_cpu_location)
+
+#define HWLOC_MEMBIND_DEFAULT HWLOC_NAME_CAPS(MEMBIND_DEFAULT)
+#define HWLOC_MEMBIND_FIRSTTOUCH HWLOC_NAME_CAPS(MEMBIND_FIRSTTOUCH)
+#define HWLOC_MEMBIND_BIND HWLOC_NAME_CAPS(MEMBIND_BIND)
+#define HWLOC_MEMBIND_INTERLEAVE HWLOC_NAME_CAPS(MEMBIND_INTERLEAVE)
+#define HWLOC_MEMBIND_NEXTTOUCH HWLOC_NAME_CAPS(MEMBIND_NEXTTOUCH)
+#define HWLOC_MEMBIND_MIXED HWLOC_NAME_CAPS(MEMBIND_MIXED)
+
+#define hwloc_membind_policy_t HWLOC_NAME(membind_policy_t)
+
+#define HWLOC_MEMBIND_PROCESS HWLOC_NAME_CAPS(MEMBIND_PROCESS)
+#define HWLOC_MEMBIND_THREAD HWLOC_NAME_CAPS(MEMBIND_THREAD)
+#define HWLOC_MEMBIND_STRICT HWLOC_NAME_CAPS(MEMBIND_STRICT)
+#define HWLOC_MEMBIND_MIGRATE HWLOC_NAME_CAPS(MEMBIND_MIGRATE)
+#define HWLOC_MEMBIND_NOCPUBIND HWLOC_NAME_CAPS(MEMBIND_NOCPUBIND)
+#define HWLOC_MEMBIND_BYNODESET HWLOC_NAME_CAPS(MEMBIND_BYNODESET)
+
+#define hwloc_membind_flags_t HWLOC_NAME(membind_flags_t)
+
+#define hwloc_set_membind HWLOC_NAME(set_membind)
+#define hwloc_get_membind HWLOC_NAME(get_membind)
+#define hwloc_set_proc_membind HWLOC_NAME(set_proc_membind)
+#define hwloc_get_proc_membind HWLOC_NAME(get_proc_membind)
+#define hwloc_set_area_membind HWLOC_NAME(set_area_membind)
+#define hwloc_get_area_membind HWLOC_NAME(get_area_membind)
+#define hwloc_get_area_memlocation HWLOC_NAME(get_area_memlocation)
+#define hwloc_alloc_membind HWLOC_NAME(alloc_membind)
+#define hwloc_alloc HWLOC_NAME(alloc)
+#define hwloc_free HWLOC_NAME(free)
+
+#define hwloc_get_non_io_ancestor_obj HWLOC_NAME(get_non_io_ancestor_obj)
+#define hwloc_get_next_pcidev HWLOC_NAME(get_next_pcidev)
+#define hwloc_get_pcidev_by_busid HWLOC_NAME(get_pcidev_by_busid)
+#define hwloc_get_pcidev_by_busidstring HWLOC_NAME(get_pcidev_by_busidstring)
+#define hwloc_get_next_osdev HWLOC_NAME(get_next_osdev)
+#define hwloc_get_next_bridge HWLOC_NAME(get_next_bridge)
+#define hwloc_bridge_covers_pcibus HWLOC_NAME(bridge_covers_pcibus)
+
+/* hwloc/bitmap.h */
+
+#define hwloc_bitmap_s HWLOC_NAME(bitmap_s)
+#define hwloc_bitmap_t HWLOC_NAME(bitmap_t)
+#define hwloc_const_bitmap_t HWLOC_NAME(const_bitmap_t)
+
+#define hwloc_bitmap_alloc HWLOC_NAME(bitmap_alloc)
+#define hwloc_bitmap_alloc_full HWLOC_NAME(bitmap_alloc_full)
+#define hwloc_bitmap_free HWLOC_NAME(bitmap_free)
+#define hwloc_bitmap_dup HWLOC_NAME(bitmap_dup)
+#define hwloc_bitmap_copy HWLOC_NAME(bitmap_copy)
+#define hwloc_bitmap_snprintf HWLOC_NAME(bitmap_snprintf)
+#define hwloc_bitmap_asprintf HWLOC_NAME(bitmap_asprintf)
+#define hwloc_bitmap_sscanf HWLOC_NAME(bitmap_sscanf)
+#define hwloc_bitmap_list_snprintf HWLOC_NAME(bitmap_list_snprintf)
+#define hwloc_bitmap_list_asprintf HWLOC_NAME(bitmap_list_asprintf)
+#define hwloc_bitmap_list_sscanf HWLOC_NAME(bitmap_list_sscanf)
+#define hwloc_bitmap_taskset_snprintf HWLOC_NAME(bitmap_taskset_snprintf)
+#define hwloc_bitmap_taskset_asprintf HWLOC_NAME(bitmap_taskset_asprintf)
+#define hwloc_bitmap_taskset_sscanf HWLOC_NAME(bitmap_taskset_sscanf)
+#define hwloc_bitmap_zero HWLOC_NAME(bitmap_zero)
+#define hwloc_bitmap_fill HWLOC_NAME(bitmap_fill)
+#define hwloc_bitmap_from_ulong HWLOC_NAME(bitmap_from_ulong)
+#define hwloc_bitmap_from_ulongs HWLOC_NAME(bitmap_from_ulongs)
+#define hwloc_bitmap_from_ith_ulong HWLOC_NAME(bitmap_from_ith_ulong)
+#define hwloc_bitmap_to_ulong HWLOC_NAME(bitmap_to_ulong)
+#define hwloc_bitmap_to_ith_ulong HWLOC_NAME(bitmap_to_ith_ulong)
+#define hwloc_bitmap_to_ulongs HWLOC_NAME(bitmap_to_ulongs)
+#define hwloc_bitmap_nr_ulongs HWLOC_NAME(bitmap_nr_ulongs)
+#define hwloc_bitmap_only HWLOC_NAME(bitmap_only)
+#define hwloc_bitmap_allbut HWLOC_NAME(bitmap_allbut)
+#define hwloc_bitmap_set HWLOC_NAME(bitmap_set)
+#define hwloc_bitmap_set_range HWLOC_NAME(bitmap_set_range)
+#define hwloc_bitmap_set_ith_ulong HWLOC_NAME(bitmap_set_ith_ulong)
+#define hwloc_bitmap_clr HWLOC_NAME(bitmap_clr)
+#define hwloc_bitmap_clr_range HWLOC_NAME(bitmap_clr_range)
+#define hwloc_bitmap_isset HWLOC_NAME(bitmap_isset)
+#define hwloc_bitmap_iszero HWLOC_NAME(bitmap_iszero)
+#define hwloc_bitmap_isfull HWLOC_NAME(bitmap_isfull)
+#define hwloc_bitmap_isequal HWLOC_NAME(bitmap_isequal)
+#define hwloc_bitmap_intersects HWLOC_NAME(bitmap_intersects)
+#define hwloc_bitmap_isincluded HWLOC_NAME(bitmap_isincluded)
+#define hwloc_bitmap_or HWLOC_NAME(bitmap_or)
+#define hwloc_bitmap_and HWLOC_NAME(bitmap_and)
+#define hwloc_bitmap_andnot HWLOC_NAME(bitmap_andnot)
+#define hwloc_bitmap_xor HWLOC_NAME(bitmap_xor)
+#define hwloc_bitmap_not HWLOC_NAME(bitmap_not)
+#define hwloc_bitmap_first HWLOC_NAME(bitmap_first)
+#define hwloc_bitmap_last HWLOC_NAME(bitmap_last)
+#define hwloc_bitmap_next HWLOC_NAME(bitmap_next)
+#define hwloc_bitmap_first_unset HWLOC_NAME(bitmap_first_unset)
+#define hwloc_bitmap_last_unset HWLOC_NAME(bitmap_last_unset)
+#define hwloc_bitmap_next_unset HWLOC_NAME(bitmap_next_unset)
+#define hwloc_bitmap_singlify HWLOC_NAME(bitmap_singlify)
+#define hwloc_bitmap_compare_first HWLOC_NAME(bitmap_compare_first)
+#define hwloc_bitmap_compare HWLOC_NAME(bitmap_compare)
+#define hwloc_bitmap_weight HWLOC_NAME(bitmap_weight)
+
+/* hwloc/helper.h */
+
+#define hwloc_get_type_or_below_depth HWLOC_NAME(get_type_or_below_depth)
+#define hwloc_get_type_or_above_depth HWLOC_NAME(get_type_or_above_depth)
+#define hwloc_get_root_obj HWLOC_NAME(get_root_obj)
+#define hwloc_get_ancestor_obj_by_depth HWLOC_NAME(get_ancestor_obj_by_depth)
+#define hwloc_get_ancestor_obj_by_type HWLOC_NAME(get_ancestor_obj_by_type)
+#define hwloc_get_next_obj_by_depth HWLOC_NAME(get_next_obj_by_depth)
+#define hwloc_get_next_obj_by_type HWLOC_NAME(get_next_obj_by_type)
+#define hwloc_bitmap_singlify_per_core HWLOC_NAME(bitmap_singlify_by_core)
+#define hwloc_get_pu_obj_by_os_index HWLOC_NAME(get_pu_obj_by_os_index)
+#define hwloc_get_numanode_obj_by_os_index HWLOC_NAME(get_numanode_obj_by_os_index)
+#define hwloc_get_next_child HWLOC_NAME(get_next_child)
+#define hwloc_get_common_ancestor_obj HWLOC_NAME(get_common_ancestor_obj)
+#define hwloc_obj_is_in_subtree HWLOC_NAME(obj_is_in_subtree)
+#define hwloc_get_first_largest_obj_inside_cpuset HWLOC_NAME(get_first_largest_obj_inside_cpuset)
+#define hwloc_get_largest_objs_inside_cpuset HWLOC_NAME(get_largest_objs_inside_cpuset)
+#define hwloc_get_next_obj_inside_cpuset_by_depth HWLOC_NAME(get_next_obj_inside_cpuset_by_depth)
+#define hwloc_get_next_obj_inside_cpuset_by_type HWLOC_NAME(get_next_obj_inside_cpuset_by_type)
+#define hwloc_get_obj_inside_cpuset_by_depth HWLOC_NAME(get_obj_inside_cpuset_by_depth)
+#define hwloc_get_obj_inside_cpuset_by_type HWLOC_NAME(get_obj_inside_cpuset_by_type)
+#define hwloc_get_nbobjs_inside_cpuset_by_depth HWLOC_NAME(get_nbobjs_inside_cpuset_by_depth)
+#define hwloc_get_nbobjs_inside_cpuset_by_type HWLOC_NAME(get_nbobjs_inside_cpuset_by_type)
+#define hwloc_get_obj_index_inside_cpuset HWLOC_NAME(get_obj_index_inside_cpuset)
+#define hwloc_get_child_covering_cpuset HWLOC_NAME(get_child_covering_cpuset)
+#define hwloc_get_obj_covering_cpuset HWLOC_NAME(get_obj_covering_cpuset)
+#define hwloc_get_next_obj_covering_cpuset_by_depth HWLOC_NAME(get_next_obj_covering_cpuset_by_depth)
+#define hwloc_get_next_obj_covering_cpuset_by_type HWLOC_NAME(get_next_obj_covering_cpuset_by_type)
+#define hwloc_obj_type_is_normal HWLOC_NAME(obj_type_is_normal)
+#define hwloc_obj_type_is_memory HWLOC_NAME(obj_type_is_memory)
+#define hwloc_obj_type_is_io HWLOC_NAME(obj_type_is_io)
+#define hwloc_obj_type_is_cache HWLOC_NAME(obj_type_is_cache)
+#define hwloc_obj_type_is_dcache HWLOC_NAME(obj_type_is_dcache)
+#define hwloc_obj_type_is_icache HWLOC_NAME(obj_type_is_icache)
+#define hwloc_get_cache_type_depth HWLOC_NAME(get_cache_type_depth)
+#define hwloc_get_cache_covering_cpuset HWLOC_NAME(get_cache_covering_cpuset)
+#define hwloc_get_shared_cache_covering_obj HWLOC_NAME(get_shared_cache_covering_obj)
+#define hwloc_get_closest_objs HWLOC_NAME(get_closest_objs)
+#define hwloc_get_obj_below_by_type HWLOC_NAME(get_obj_below_by_type)
+#define hwloc_get_obj_below_array_by_type HWLOC_NAME(get_obj_below_array_by_type)
+#define hwloc_get_obj_with_same_locality HWLOC_NAME(get_obj_with_same_locality)
+#define hwloc_distrib_flags_e HWLOC_NAME(distrib_flags_e)
+#define HWLOC_DISTRIB_FLAG_REVERSE HWLOC_NAME_CAPS(DISTRIB_FLAG_REVERSE)
+#define hwloc_distrib HWLOC_NAME(distrib)
+#define hwloc_alloc_membind_policy HWLOC_NAME(alloc_membind_policy)
+#define hwloc_alloc_membind_policy_nodeset HWLOC_NAME(alloc_membind_policy_nodeset)
+#define hwloc_topology_get_complete_cpuset HWLOC_NAME(topology_get_complete_cpuset)
+#define hwloc_topology_get_topology_cpuset HWLOC_NAME(topology_get_topology_cpuset)
+#define hwloc_topology_get_allowed_cpuset HWLOC_NAME(topology_get_allowed_cpuset)
+#define hwloc_topology_get_complete_nodeset HWLOC_NAME(topology_get_complete_nodeset)
+#define hwloc_topology_get_topology_nodeset HWLOC_NAME(topology_get_topology_nodeset)
+#define hwloc_topology_get_allowed_nodeset HWLOC_NAME(topology_get_allowed_nodeset)
+#define hwloc_cpuset_to_nodeset HWLOC_NAME(cpuset_to_nodeset)
+#define hwloc_cpuset_from_nodeset HWLOC_NAME(cpuset_from_nodeset)
+
+/* memattrs.h */
+
+#define hwloc_memattr_id_e HWLOC_NAME(memattr_id_e)
+#define HWLOC_MEMATTR_ID_CAPACITY HWLOC_NAME_CAPS(MEMATTR_ID_CAPACITY)
+#define HWLOC_MEMATTR_ID_LOCALITY HWLOC_NAME_CAPS(MEMATTR_ID_LOCALITY)
+#define HWLOC_MEMATTR_ID_BANDWIDTH HWLOC_NAME_CAPS(MEMATTR_ID_BANDWIDTH)
+#define HWLOC_MEMATTR_ID_LATENCY HWLOC_NAME_CAPS(MEMATTR_ID_LATENCY)
+
+#define hwloc_memattr_id_t HWLOC_NAME(memattr_id_t)
+#define hwloc_memattr_get_by_name HWLOC_NAME(memattr_get_by_name)
+
+#define hwloc_location HWLOC_NAME(location)
+#define hwloc_location_type_e HWLOC_NAME(location_type_e)
+#define HWLOC_LOCATION_TYPE_OBJECT HWLOC_NAME_CAPS(LOCATION_TYPE_OBJECT)
+#define HWLOC_LOCATION_TYPE_CPUSET HWLOC_NAME_CAPS(LOCATION_TYPE_CPUSET)
+#define hwloc_location_u HWLOC_NAME(location_u)
+
+#define hwloc_memattr_get_value HWLOC_NAME(memattr_get_value)
+#define hwloc_memattr_get_best_target HWLOC_NAME(memattr_get_best_target)
+#define hwloc_memattr_get_best_initiator HWLOC_NAME(memattr_get_best_initiator)
+
+#define hwloc_local_numanode_flag_e HWLOC_NAME(local_numanode_flag_e)
+#define HWLOC_LOCAL_NUMANODE_FLAG_LARGER_LOCALITY HWLOC_NAME_CAPS(LOCAL_NUMANODE_FLAG_LARGER_LOCALITY)
+#define HWLOC_LOCAL_NUMANODE_FLAG_SMALLER_LOCALITY HWLOC_NAME_CAPS(LOCAL_NUMANODE_FLAG_SMALLER_LOCALITY)
+#define HWLOC_LOCAL_NUMANODE_FLAG_ALL HWLOC_NAME_CAPS(LOCAL_NUMANODE_FLAG_ALL)
+#define hwloc_get_local_numanode_objs HWLOC_NAME(get_local_numanode_objs)
+
+#define hwloc_memattr_get_name HWLOC_NAME(memattr_get_name)
+#define hwloc_memattr_get_flags HWLOC_NAME(memattr_get_flags)
+#define hwloc_memattr_flag_e HWLOC_NAME(memattr_flag_e)
+#define HWLOC_MEMATTR_FLAG_HIGHER_FIRST HWLOC_NAME_CAPS(MEMATTR_FLAG_HIGHER_FIRST)
+#define HWLOC_MEMATTR_FLAG_LOWER_FIRST HWLOC_NAME_CAPS(MEMATTR_FLAG_LOWER_FIRST)
+#define HWLOC_MEMATTR_FLAG_NEED_INITIATOR HWLOC_NAME_CAPS(MEMATTR_FLAG_NEED_INITIATOR)
+#define hwloc_memattr_register HWLOC_NAME(memattr_register)
+#define hwloc_memattr_set_value HWLOC_NAME(memattr_set_value)
+#define hwloc_memattr_get_targets HWLOC_NAME(memattr_get_targets)
+#define hwloc_memattr_get_initiators HWLOC_NAME(memattr_get_initiators)
+
+/* cpukinds.h */
+
+#define hwloc_cpukinds_get_nr HWLOC_NAME(cpukinds_get_nr)
+#define hwloc_cpukinds_get_by_cpuset HWLOC_NAME(cpukinds_get_by_cpuset)
+#define hwloc_cpukinds_get_info HWLOC_NAME(cpukinds_get_info)
+#define hwloc_cpukinds_register HWLOC_NAME(cpukinds_register)
+
+/* export.h */
+
+#define hwloc_topology_export_xml_flags_e HWLOC_NAME(topology_export_xml_flags_e)
+#define HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1 HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_XML_FLAG_V1)
+#define hwloc_topology_export_xml HWLOC_NAME(topology_export_xml)
+#define hwloc_topology_export_xmlbuffer HWLOC_NAME(topology_export_xmlbuffer)
+#define hwloc_free_xmlbuffer HWLOC_NAME(free_xmlbuffer)
+#define hwloc_topology_set_userdata_export_callback HWLOC_NAME(topology_set_userdata_export_callback)
+#define hwloc_export_obj_userdata HWLOC_NAME(export_obj_userdata)
+#define hwloc_export_obj_userdata_base64 HWLOC_NAME(export_obj_userdata_base64)
+#define hwloc_topology_set_userdata_import_callback HWLOC_NAME(topology_set_userdata_import_callback)
+
+#define hwloc_topology_export_synthetic_flags_e HWLOC_NAME(topology_export_synthetic_flags_e)
+#define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES)
+#define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)
+#define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_V1 HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_V1)
+#define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_IGNORE_MEMORY HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_IGNORE_MEMORY)
+#define hwloc_topology_export_synthetic HWLOC_NAME(topology_export_synthetic)
+
+/* distances.h */
+
+#define hwloc_distances_s HWLOC_NAME(distances_s)
+
+#define hwloc_distances_kind_e HWLOC_NAME(distances_kind_e)
+#define HWLOC_DISTANCES_KIND_FROM_OS HWLOC_NAME_CAPS(DISTANCES_KIND_FROM_OS)
+#define HWLOC_DISTANCES_KIND_FROM_USER HWLOC_NAME_CAPS(DISTANCES_KIND_FROM_USER)
+#define HWLOC_DISTANCES_KIND_MEANS_LATENCY HWLOC_NAME_CAPS(DISTANCES_KIND_MEANS_LATENCY)
+#define HWLOC_DISTANCES_KIND_MEANS_BANDWIDTH HWLOC_NAME_CAPS(DISTANCES_KIND_MEANS_BANDWIDTH)
+#define HWLOC_DISTANCES_KIND_HETEROGENEOUS_TYPES HWLOC_NAME_CAPS(DISTANCES_KIND_HETEROGENEOUS_TYPES)
+
+#define hwloc_distances_get HWLOC_NAME(distances_get)
+#define hwloc_distances_get_by_depth HWLOC_NAME(distances_get_by_depth)
+#define hwloc_distances_get_by_type HWLOC_NAME(distances_get_by_type)
+#define hwloc_distances_get_by_name HWLOC_NAME(distances_get_by_name)
+#define hwloc_distances_get_name HWLOC_NAME(distances_get_name)
+#define hwloc_distances_release HWLOC_NAME(distances_release)
+#define hwloc_distances_obj_index HWLOC_NAME(distances_obj_index)
+#define hwloc_distances_obj_pair_values HWLOC_NAME(distances_pair_values)
+
+#define hwloc_distances_transform_e HWLOC_NAME(distances_transform_e)
+#define HWLOC_DISTANCES_TRANSFORM_REMOVE_NULL HWLOC_NAME_CAPS(DISTANCES_TRANSFORM_REMOVE_NULL)
+#define HWLOC_DISTANCES_TRANSFORM_LINKS HWLOC_NAME_CAPS(DISTANCES_TRANSFORM_LINKS)
+#define hwloc_distances_transform HWLOC_NAME(distances_transform)
+
+#define hwloc_distances_add_flag_e HWLOC_NAME(distances_add_flag_e)
+#define HWLOC_DISTANCES_ADD_FLAG_GROUP HWLOC_NAME_CAPS(DISTANCES_ADD_FLAG_GROUP)
+#define HWLOC_DISTANCES_ADD_FLAG_GROUP_INACCURATE HWLOC_NAME_CAPS(DISTANCES_ADD_FLAG_GROUP_INACCURATE)
+
+#define hwloc_distances_add_handle_t HWLOC_NAME(distances_add_handle_t)
+#define hwloc_distances_add_create HWLOC_NAME(distances_add_create)
+#define hwloc_distances_add_values HWLOC_NAME(distances_add_values)
+#define hwloc_distances_add_commit HWLOC_NAME(distances_add_commit)
+
+#define hwloc_distances_remove HWLOC_NAME(distances_remove)
+#define hwloc_distances_remove_by_depth HWLOC_NAME(distances_remove_by_depth)
+#define hwloc_distances_remove_by_type HWLOC_NAME(distances_remove_by_type)
+#define hwloc_distances_release_remove HWLOC_NAME(distances_release_remove)
+
+/* diff.h */
+
+#define hwloc_topology_diff_obj_attr_type_e HWLOC_NAME(topology_diff_obj_attr_type_e)
+#define hwloc_topology_diff_obj_attr_type_t HWLOC_NAME(topology_diff_obj_attr_type_t)
+#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR_SIZE)
+#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR_NAME)
+#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR_INFO)
+#define hwloc_topology_diff_obj_attr_u HWLOC_NAME(topology_diff_obj_attr_u)
+#define hwloc_topology_diff_obj_attr_generic_s HWLOC_NAME(topology_diff_obj_attr_generic_s)
+#define hwloc_topology_diff_obj_attr_uint64_s HWLOC_NAME(topology_diff_obj_attr_uint64_s)
+#define hwloc_topology_diff_obj_attr_string_s HWLOC_NAME(topology_diff_obj_attr_string_s)
+#define hwloc_topology_diff_type_e HWLOC_NAME(topology_diff_type_e)
+#define hwloc_topology_diff_type_t HWLOC_NAME(topology_diff_type_t)
+#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR)
+#define HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX HWLOC_NAME_CAPS(TOPOLOGY_DIFF_TOO_COMPLEX)
+#define hwloc_topology_diff_u HWLOC_NAME(topology_diff_u)
+#define hwloc_topology_diff_t HWLOC_NAME(topology_diff_t)
+#define hwloc_topology_diff_generic_s HWLOC_NAME(topology_diff_generic_s)
+#define hwloc_topology_diff_obj_attr_s HWLOC_NAME(topology_diff_obj_attr_s)
+#define hwloc_topology_diff_too_complex_s HWLOC_NAME(topology_diff_too_complex_s)
+#define hwloc_topology_diff_build HWLOC_NAME(topology_diff_build)
+#define hwloc_topology_diff_apply_flags_e HWLOC_NAME(topology_diff_apply_flags_e)
+#define HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE HWLOC_NAME_CAPS(TOPOLOGY_DIFF_APPLY_REVERSE)
+#define hwloc_topology_diff_apply HWLOC_NAME(topology_diff_apply)
+#define hwloc_topology_diff_destroy HWLOC_NAME(topology_diff_destroy)
+#define hwloc_topology_diff_load_xml HWLOC_NAME(topology_diff_load_xml)
+#define hwloc_topology_diff_export_xml HWLOC_NAME(topology_diff_export_xml)
+#define hwloc_topology_diff_load_xmlbuffer HWLOC_NAME(topology_diff_load_xmlbuffer)
+#define hwloc_topology_diff_export_xmlbuffer HWLOC_NAME(topology_diff_export_xmlbuffer)
+
+/* shmem.h */
+
+#define hwloc_shmem_topology_get_length HWLOC_NAME(shmem_topology_get_length)
+#define hwloc_shmem_topology_write HWLOC_NAME(shmem_topology_write)
+#define hwloc_shmem_topology_adopt HWLOC_NAME(shmem_topology_adopt)
+
+/* glibc-sched.h */
+
+#define hwloc_cpuset_to_glibc_sched_affinity HWLOC_NAME(cpuset_to_glibc_sched_affinity)
+#define hwloc_cpuset_from_glibc_sched_affinity HWLOC_NAME(cpuset_from_glibc_sched_affinity)
+
+/* linux-libnuma.h */
+
+#define hwloc_cpuset_to_linux_libnuma_ulongs HWLOC_NAME(cpuset_to_linux_libnuma_ulongs)
+#define hwloc_nodeset_to_linux_libnuma_ulongs HWLOC_NAME(nodeset_to_linux_libnuma_ulongs)
+#define hwloc_cpuset_from_linux_libnuma_ulongs HWLOC_NAME(cpuset_from_linux_libnuma_ulongs)
+#define hwloc_nodeset_from_linux_libnuma_ulongs HWLOC_NAME(nodeset_from_linux_libnuma_ulongs)
+#define hwloc_cpuset_to_linux_libnuma_bitmask HWLOC_NAME(cpuset_to_linux_libnuma_bitmask)
+#define hwloc_nodeset_to_linux_libnuma_bitmask HWLOC_NAME(nodeset_to_linux_libnuma_bitmask)
+#define hwloc_cpuset_from_linux_libnuma_bitmask HWLOC_NAME(cpuset_from_linux_libnuma_bitmask)
+#define hwloc_nodeset_from_linux_libnuma_bitmask HWLOC_NAME(nodeset_from_linux_libnuma_bitmask)
+
+/* linux.h */
+
+#define hwloc_linux_set_tid_cpubind HWLOC_NAME(linux_set_tid_cpubind)
+#define hwloc_linux_get_tid_cpubind HWLOC_NAME(linux_get_tid_cpubind)
+#define hwloc_linux_get_tid_last_cpu_location HWLOC_NAME(linux_get_tid_last_cpu_location)
+#define hwloc_linux_read_path_as_cpumask HWLOC_NAME(linux_read_file_cpumask)
+
+/* windows.h */
+
+#define hwloc_windows_get_nr_processor_groups HWLOC_NAME(windows_get_nr_processor_groups)
+#define hwloc_windows_get_processor_group_cpuset HWLOC_NAME(windows_get_processor_group_cpuset)
+
+/* openfabrics-verbs.h */
+
+#define hwloc_ibv_get_device_cpuset HWLOC_NAME(ibv_get_device_cpuset)
+#define hwloc_ibv_get_device_osdev HWLOC_NAME(ibv_get_device_osdev)
+#define hwloc_ibv_get_device_osdev_by_name HWLOC_NAME(ibv_get_device_osdev_by_name)
+
+/* opencl.h */
+
+#define hwloc_cl_device_topology_amd HWLOC_NAME(cl_device_topology_amd)
+#define hwloc_opencl_get_device_pci_busid HWLOC_NAME(opencl_get_device_pci_ids)
+#define hwloc_opencl_get_device_cpuset HWLOC_NAME(opencl_get_device_cpuset)
+#define hwloc_opencl_get_device_osdev HWLOC_NAME(opencl_get_device_osdev)
+#define hwloc_opencl_get_device_osdev_by_index HWLOC_NAME(opencl_get_device_osdev_by_index)
+
+/* cuda.h */
+
+#define hwloc_cuda_get_device_pci_ids HWLOC_NAME(cuda_get_device_pci_ids)
+#define hwloc_cuda_get_device_cpuset HWLOC_NAME(cuda_get_device_cpuset)
+#define hwloc_cuda_get_device_pcidev HWLOC_NAME(cuda_get_device_pcidev)
+#define hwloc_cuda_get_device_osdev HWLOC_NAME(cuda_get_device_osdev)
+#define hwloc_cuda_get_device_osdev_by_index HWLOC_NAME(cuda_get_device_osdev_by_index)
+
+/* cudart.h */
+
+#define hwloc_cudart_get_device_pci_ids HWLOC_NAME(cudart_get_device_pci_ids)
+#define hwloc_cudart_get_device_cpuset HWLOC_NAME(cudart_get_device_cpuset)
+#define hwloc_cudart_get_device_pcidev HWLOC_NAME(cudart_get_device_pcidev)
+#define hwloc_cudart_get_device_osdev_by_index HWLOC_NAME(cudart_get_device_osdev_by_index)
+
+/* nvml.h */
+
+#define hwloc_nvml_get_device_cpuset HWLOC_NAME(nvml_get_device_cpuset)
+#define hwloc_nvml_get_device_osdev HWLOC_NAME(nvml_get_device_osdev)
+#define hwloc_nvml_get_device_osdev_by_index HWLOC_NAME(nvml_get_device_osdev_by_index)
+
+/* rsmi.h */
+
+#define hwloc_rsmi_get_device_cpuset HWLOC_NAME(rsmi_get_device_cpuset)
+#define hwloc_rsmi_get_device_osdev HWLOC_NAME(rsmi_get_device_osdev)
+#define hwloc_rsmi_get_device_osdev_by_index HWLOC_NAME(rsmi_get_device_osdev_by_index)
+
+/* levelzero.h */
+
+#define hwloc_levelzero_get_device_cpuset HWLOC_NAME(levelzero_get_device_cpuset)
+#define hwloc_levelzero_get_device_osdev HWLOC_NAME(levelzero_get_device_osdev)
+
+/* gl.h */
+
+#define hwloc_gl_get_display_osdev_by_port_device HWLOC_NAME(gl_get_display_osdev_by_port_device)
+#define hwloc_gl_get_display_osdev_by_name HWLOC_NAME(gl_get_display_osdev_by_name)
+#define hwloc_gl_get_display_by_osdev HWLOC_NAME(gl_get_display_by_osdev)
+
+/* hwloc/plugins.h */
+
+#define hwloc_disc_phase_e HWLOC_NAME(disc_phase_e)
+#define HWLOC_DISC_PHASE_GLOBAL HWLOC_NAME_CAPS(DISC_PHASE_GLOBAL)
+#define HWLOC_DISC_PHASE_CPU HWLOC_NAME_CAPS(DISC_PHASE_CPU)
+#define HWLOC_DISC_PHASE_MEMORY HWLOC_NAME_CAPS(DISC_PHASE_MEMORY)
+#define HWLOC_DISC_PHASE_PCI HWLOC_NAME_CAPS(DISC_PHASE_PCI)
+#define HWLOC_DISC_PHASE_IO HWLOC_NAME_CAPS(DISC_PHASE_IO)
+#define HWLOC_DISC_PHASE_MISC HWLOC_NAME_CAPS(DISC_PHASE_MISC)
+#define HWLOC_DISC_PHASE_ANNOTATE HWLOC_NAME_CAPS(DISC_PHASE_ANNOTATE)
+#define HWLOC_DISC_PHASE_TWEAK HWLOC_NAME_CAPS(DISC_PHASE_TWEAK)
+#define hwloc_disc_phase_t HWLOC_NAME(disc_phase_t)
+#define hwloc_disc_component HWLOC_NAME(disc_component)
+
+#define hwloc_disc_status_flag_e HWLOC_NAME(disc_status_flag_e)
+#define HWLOC_DISC_STATUS_FLAG_GOT_ALLOWED_RESOURCES HWLOC_NAME_CAPS(DISC_STATUS_FLAG_GOT_ALLOWED_RESOURCES)
+#define hwloc_disc_status HWLOC_NAME(disc_status)
+
+#define hwloc_backend HWLOC_NAME(backend)
+
+#define hwloc_backend_alloc HWLOC_NAME(backend_alloc)
+#define hwloc_backend_enable HWLOC_NAME(backend_enable)
+
+#define hwloc_component_type_e HWLOC_NAME(component_type_e)
+#define HWLOC_COMPONENT_TYPE_DISC HWLOC_NAME_CAPS(COMPONENT_TYPE_DISC)
+#define HWLOC_COMPONENT_TYPE_XML HWLOC_NAME_CAPS(COMPONENT_TYPE_XML)
+#define hwloc_component_type_t HWLOC_NAME(component_type_t)
+#define hwloc_component HWLOC_NAME(component)
+
+#define hwloc_plugin_check_namespace HWLOC_NAME(plugin_check_namespace)
+
+#define hwloc_hide_errors HWLOC_NAME(hide_errors)
+#define hwloc__insert_object_by_cpuset HWLOC_NAME(_insert_object_by_cpuset)
+#define hwloc_insert_object_by_parent HWLOC_NAME(insert_object_by_parent)
+#define hwloc_alloc_setup_object HWLOC_NAME(alloc_setup_object)
+#define hwloc_obj_add_children_sets HWLOC_NAME(add_children_sets)
+#define hwloc_topology_reconnect HWLOC_NAME(topology_reconnect)
+
+#define hwloc_filter_check_pcidev_subtype_important HWLOC_NAME(filter_check_pcidev_subtype_important)
+#define hwloc_filter_check_osdev_subtype_important HWLOC_NAME(filter_check_osdev_subtype_important)
+#define hwloc_filter_check_keep_object_type HWLOC_NAME(filter_check_keep_object_type)
+#define hwloc_filter_check_keep_object HWLOC_NAME(filter_check_keep_object)
+
+#define hwloc_pcidisc_find_cap HWLOC_NAME(pcidisc_find_cap)
+#define hwloc_pcidisc_find_linkspeed HWLOC_NAME(pcidisc_find_linkspeed)
+#define hwloc_pcidisc_check_bridge_type HWLOC_NAME(pcidisc_check_bridge_type)
+#define hwloc_pcidisc_find_bridge_buses HWLOC_NAME(pcidisc_find_bridge_buses)
+#define hwloc_pcidisc_tree_insert_by_busid HWLOC_NAME(pcidisc_tree_insert_by_busid)
+#define hwloc_pcidisc_tree_attach HWLOC_NAME(pcidisc_tree_attach)
+
+#define hwloc_pci_find_by_busid HWLOC_NAME(pcidisc_find_by_busid)
+#define hwloc_pci_find_parent_by_busid HWLOC_NAME(pcidisc_find_busid_parent)
+
+#define hwloc_backend_distances_add_handle_t HWLOC_NAME(backend_distances_add_handle_t)
+#define hwloc_backend_distances_add_create HWLOC_NAME(backend_distances_add_create)
+#define hwloc_backend_distances_add_values HWLOC_NAME(backend_distances_add_values)
+#define hwloc_backend_distances_add_commit HWLOC_NAME(backend_distances_add_commit)
+
+/* hwloc/deprecated.h */
+
+#define hwloc_distances_add HWLOC_NAME(distances_add)
+
+#define hwloc_topology_insert_misc_object_by_parent HWLOC_NAME(topology_insert_misc_object_by_parent)
+#define hwloc_obj_cpuset_snprintf HWLOC_NAME(obj_cpuset_snprintf)
+#define hwloc_obj_type_sscanf HWLOC_NAME(obj_type_sscanf)
+
+#define hwloc_set_membind_nodeset HWLOC_NAME(set_membind_nodeset)
+#define hwloc_get_membind_nodeset HWLOC_NAME(get_membind_nodeset)
+#define hwloc_set_proc_membind_nodeset HWLOC_NAME(set_proc_membind_nodeset)
+#define hwloc_get_proc_membind_nodeset HWLOC_NAME(get_proc_membind_nodeset)
+#define hwloc_set_area_membind_nodeset HWLOC_NAME(set_area_membind_nodeset)
+#define hwloc_get_area_membind_nodeset HWLOC_NAME(get_area_membind_nodeset)
+#define hwloc_alloc_membind_nodeset HWLOC_NAME(alloc_membind_nodeset)
+
+#define hwloc_cpuset_to_nodeset_strict HWLOC_NAME(cpuset_to_nodeset_strict)
+#define hwloc_cpuset_from_nodeset_strict HWLOC_NAME(cpuset_from_nodeset_strict)
+
+/* private/debug.h */
+
+#define hwloc_debug_enabled HWLOC_NAME(debug_enabled)
+#define hwloc_debug HWLOC_NAME(debug)
+
+/* private/misc.h */
+
+#ifndef HWLOC_HAVE_CORRECT_SNPRINTF
+#define hwloc_snprintf HWLOC_NAME(snprintf)
+#endif
+#define hwloc_ffsl_manual HWLOC_NAME(ffsl_manual)
+#define hwloc_ffs32 HWLOC_NAME(ffs32)
+#define hwloc_ffsl_from_ffs32 HWLOC_NAME(ffsl_from_ffs32)
+#define hwloc_flsl_manual HWLOC_NAME(flsl_manual)
+#define hwloc_fls32 HWLOC_NAME(fls32)
+#define hwloc_flsl_from_fls32 HWLOC_NAME(flsl_from_fls32)
+#define hwloc_weight_long HWLOC_NAME(weight_long)
+#define hwloc_strncasecmp HWLOC_NAME(strncasecmp)
+
+#define hwloc_bitmap_compare_inclusion HWLOC_NAME(bitmap_compare_inclusion)
+
+#define hwloc_pci_class_string HWLOC_NAME(pci_class_string)
+#define hwloc_linux_pci_link_speed_from_string HWLOC_NAME(linux_pci_link_speed_from_string)
+
+#define hwloc_cache_type_by_depth_type HWLOC_NAME(cache_type_by_depth_type)
+#define hwloc__obj_type_is_normal HWLOC_NAME(_obj_type_is_normal)
+#define hwloc__obj_type_is_memory HWLOC_NAME(_obj_type_is_memory)
+#define hwloc__obj_type_is_io HWLOC_NAME(_obj_type_is_io)
+#define hwloc__obj_type_is_special HWLOC_NAME(_obj_type_is_special)
+
+#define hwloc__obj_type_is_cache HWLOC_NAME(_obj_type_is_cache)
+#define hwloc__obj_type_is_dcache HWLOC_NAME(_obj_type_is_dcache)
+#define hwloc__obj_type_is_icache HWLOC_NAME(_obj_type_is_icache)
+
+/* private/cpuid-x86.h */
+
+#define hwloc_have_x86_cpuid HWLOC_NAME(have_x86_cpuid)
+#define hwloc_x86_cpuid HWLOC_NAME(x86_cpuid)
+
+/* private/xml.h */
+
+#define hwloc__xml_verbose HWLOC_NAME(_xml_verbose)
+
+#define hwloc__xml_import_state_s HWLOC_NAME(_xml_import_state_s)
+#define hwloc__xml_import_state_t HWLOC_NAME(_xml_import_state_t)
+#define hwloc__xml_import_diff HWLOC_NAME(_xml_import_diff)
+#define hwloc_xml_backend_data_s HWLOC_NAME(xml_backend_data_s)
+#define hwloc__xml_export_state_s HWLOC_NAME(_xml_export_state_s)
+#define hwloc__xml_export_state_t HWLOC_NAME(_xml_export_state_t)
+#define hwloc__xml_export_data_s HWLOC_NAME(_xml_export_data_s)
+#define hwloc__xml_export_topology HWLOC_NAME(_xml_export_topology)
+#define hwloc__xml_export_diff HWLOC_NAME(_xml_export_diff)
+
+#define hwloc_xml_callbacks HWLOC_NAME(xml_callbacks)
+#define hwloc_xml_component HWLOC_NAME(xml_component)
+#define hwloc_xml_callbacks_register HWLOC_NAME(xml_callbacks_register)
+#define hwloc_xml_callbacks_reset HWLOC_NAME(xml_callbacks_reset)
+
+#define hwloc__xml_imported_v1distances_s HWLOC_NAME(_xml_imported_v1distances_s)
+
+/* private/components.h */
+
+#define hwloc_disc_component_force_enable HWLOC_NAME(disc_component_force_enable)
+#define hwloc_disc_components_enable_others HWLOC_NAME(disc_components_instantiate_others)
+
+#define hwloc_backends_is_thissystem HWLOC_NAME(backends_is_thissystem)
+#define hwloc_backends_find_callbacks HWLOC_NAME(backends_find_callbacks)
+
+#define hwloc_topology_components_init HWLOC_NAME(topology_components_init)
+#define hwloc_backends_disable_all HWLOC_NAME(backends_disable_all)
+#define hwloc_topology_components_fini HWLOC_NAME(topology_components_fini)
+
+#define hwloc_components_init HWLOC_NAME(components_init)
+#define hwloc_components_fini HWLOC_NAME(components_fini)
+
+/* private/internal-private.h */
+
+#define hwloc_xml_component HWLOC_NAME(xml_component)
+#define hwloc_synthetic_component HWLOC_NAME(synthetic_component)
+
+#define hwloc_aix_component HWLOC_NAME(aix_component)
+#define hwloc_bgq_component HWLOC_NAME(bgq_component)
+#define hwloc_darwin_component HWLOC_NAME(darwin_component)
+#define hwloc_freebsd_component HWLOC_NAME(freebsd_component)
+#define hwloc_hpux_component HWLOC_NAME(hpux_component)
+#define hwloc_linux_component HWLOC_NAME(linux_component)
+#define hwloc_netbsd_component HWLOC_NAME(netbsd_component)
+#define hwloc_noos_component HWLOC_NAME(noos_component)
+#define hwloc_solaris_component HWLOC_NAME(solaris_component)
+#define hwloc_windows_component HWLOC_NAME(windows_component)
+#define hwloc_x86_component HWLOC_NAME(x86_component)
+
+#define hwloc_cuda_component HWLOC_NAME(cuda_component)
+#define hwloc_gl_component HWLOC_NAME(gl_component)
+#define hwloc_levelzero_component HWLOC_NAME(levelzero_component)
+#define hwloc_nvml_component HWLOC_NAME(nvml_component)
+#define hwloc_rsmi_component HWLOC_NAME(rsmi_component)
+#define hwloc_opencl_component HWLOC_NAME(opencl_component)
+#define hwloc_pci_component HWLOC_NAME(pci_component)
+
+#define hwloc_xml_libxml_component HWLOC_NAME(xml_libxml_component)
+#define hwloc_xml_nolibxml_component HWLOC_NAME(xml_nolibxml_component)
+
+/* private/private.h */
+
+#define hwloc_internal_location_s HWLOC_NAME(internal_location_s)
+
+#define hwloc_special_level_s HWLOC_NAME(special_level_s)
+
+#define hwloc_pci_forced_locality_s HWLOC_NAME(pci_forced_locality_s)
+#define hwloc_pci_locality_s HWLOC_NAME(pci_locality_s)
+
+#define hwloc_topology_forced_component_s HWLOC_NAME(topology_forced_component)
+
+#define hwloc_alloc_root_sets HWLOC_NAME(alloc_root_sets)
+#define hwloc_setup_pu_level HWLOC_NAME(setup_pu_level)
+#define hwloc_get_sysctlbyname HWLOC_NAME(get_sysctlbyname)
+#define hwloc_get_sysctl HWLOC_NAME(get_sysctl)
+#define hwloc_fallback_nbprocessors HWLOC_NAME(fallback_nbprocessors)
+#define hwloc_fallback_memsize HWLOC_NAME(fallback_memsize)
+
+#define hwloc__object_cpusets_compare_first HWLOC_NAME(_object_cpusets_compare_first)
+#define hwloc__reorder_children HWLOC_NAME(_reorder_children)
+
+#define hwloc_topology_setup_defaults HWLOC_NAME(topology_setup_defaults)
+#define hwloc_topology_clear HWLOC_NAME(topology_clear)
+
+#define hwloc__attach_memory_object HWLOC_NAME(insert_memory_object)
+
+#define hwloc_get_obj_by_type_and_gp_index HWLOC_NAME(get_obj_by_type_and_gp_index)
+
+#define hwloc_pci_discovery_init HWLOC_NAME(pci_discovery_init)
+#define hwloc_pci_discovery_prepare HWLOC_NAME(pci_discovery_prepare)
+#define hwloc_pci_discovery_exit HWLOC_NAME(pci_discovery_exit)
+#define hwloc_find_insert_io_parent_by_complete_cpuset HWLOC_NAME(hwloc_find_insert_io_parent_by_complete_cpuset)
+
+#define hwloc__add_info HWLOC_NAME(_add_info)
+#define hwloc__add_info_nodup HWLOC_NAME(_add_info_nodup)
+#define hwloc__move_infos HWLOC_NAME(_move_infos)
+#define hwloc__free_infos HWLOC_NAME(_free_infos)
+#define hwloc__tma_dup_infos HWLOC_NAME(_tma_dup_infos)
+
+#define hwloc_binding_hooks HWLOC_NAME(binding_hooks)
+#define hwloc_set_native_binding_hooks HWLOC_NAME(set_native_binding_hooks)
+#define hwloc_set_binding_hooks HWLOC_NAME(set_binding_hooks)
+
+#define hwloc_set_linuxfs_hooks HWLOC_NAME(set_linuxfs_hooks)
+#define hwloc_set_bgq_hooks HWLOC_NAME(set_bgq_hooks)
+#define hwloc_set_solaris_hooks HWLOC_NAME(set_solaris_hooks)
+#define hwloc_set_aix_hooks HWLOC_NAME(set_aix_hooks)
+#define hwloc_set_windows_hooks HWLOC_NAME(set_windows_hooks)
+#define hwloc_set_darwin_hooks HWLOC_NAME(set_darwin_hooks)
+#define hwloc_set_freebsd_hooks HWLOC_NAME(set_freebsd_hooks)
+#define hwloc_set_netbsd_hooks HWLOC_NAME(set_netbsd_hooks)
+#define hwloc_set_hpux_hooks HWLOC_NAME(set_hpux_hooks)
+
+#define hwloc_look_hardwired_fujitsu_k HWLOC_NAME(look_hardwired_fujitsu_k)
+#define hwloc_look_hardwired_fujitsu_fx10 HWLOC_NAME(look_hardwired_fujitsu_fx10)
+#define hwloc_look_hardwired_fujitsu_fx100 HWLOC_NAME(look_hardwired_fujitsu_fx100)
+
+#define hwloc_add_uname_info HWLOC_NAME(add_uname_info)
+#define hwloc_free_unlinked_object HWLOC_NAME(free_unlinked_object)
+#define hwloc_free_object_and_children HWLOC_NAME(free_object_and_children)
+#define hwloc_free_object_siblings_and_children HWLOC_NAME(free_object_siblings_and_children)
+
+#define hwloc_alloc_heap HWLOC_NAME(alloc_heap)
+#define hwloc_alloc_mmap HWLOC_NAME(alloc_mmap)
+#define hwloc_free_heap HWLOC_NAME(free_heap)
+#define hwloc_free_mmap HWLOC_NAME(free_mmap)
+#define hwloc_alloc_or_fail HWLOC_NAME(alloc_or_fail)
+
+#define hwloc_internal_distances_s HWLOC_NAME(internal_distances_s)
+#define hwloc_internal_distances_init HWLOC_NAME(internal_distances_init)
+#define hwloc_internal_distances_prepare HWLOC_NAME(internal_distances_prepare)
+#define hwloc_internal_distances_dup HWLOC_NAME(internal_distances_dup)
+#define hwloc_internal_distances_refresh HWLOC_NAME(internal_distances_refresh)
+#define hwloc_internal_distances_destroy HWLOC_NAME(internal_distances_destroy)
+#define hwloc_internal_distances_add HWLOC_NAME(internal_distances_add)
+#define hwloc_internal_distances_add_by_index HWLOC_NAME(internal_distances_add_by_index)
+#define hwloc_internal_distances_invalidate_cached_objs HWLOC_NAME(hwloc_internal_distances_invalidate_cached_objs)
+
+#define hwloc_internal_memattr_s HWLOC_NAME(internal_memattr_s)
+#define hwloc_internal_memattr_target_s HWLOC_NAME(internal_memattr_target_s)
+#define hwloc_internal_memattr_initiator_s HWLOC_NAME(internal_memattr_initiator_s)
+#define hwloc_internal_memattrs_init HWLOC_NAME(internal_memattrs_init)
+#define hwloc_internal_memattrs_prepare HWLOC_NAME(internal_memattrs_prepare)
+#define hwloc_internal_memattrs_dup HWLOC_NAME(internal_memattrs_dup)
+#define hwloc_internal_memattrs_destroy HWLOC_NAME(internal_memattrs_destroy)
+#define hwloc_internal_memattrs_need_refresh HWLOC_NAME(internal_memattrs_need_refresh)
+#define hwloc_internal_memattrs_refresh HWLOC_NAME(internal_memattrs_refresh)
+
+#define hwloc_internal_cpukind_s HWLOC_NAME(internal_cpukind_s)
+#define hwloc_internal_cpukinds_init HWLOC_NAME(internal_cpukinds_init)
+#define hwloc_internal_cpukinds_destroy HWLOC_NAME(internal_cpukinds_destroy)
+#define hwloc_internal_cpukinds_dup HWLOC_NAME(internal_cpukinds_dup)
+#define hwloc_internal_cpukinds_register HWLOC_NAME(internal_cpukinds_register)
+#define hwloc_internal_cpukinds_rank HWLOC_NAME(internal_cpukinds_rank)
+#define hwloc_internal_cpukinds_restrict HWLOC_NAME(internal_cpukinds_restrict)
+
+#define hwloc_encode_to_base64 HWLOC_NAME(encode_to_base64)
+#define hwloc_decode_from_base64 HWLOC_NAME(decode_from_base64)
+
+#define hwloc_progname HWLOC_NAME(progname)
+
+#define hwloc__topology_disadopt HWLOC_NAME(_topology_disadopt)
+#define hwloc__topology_dup HWLOC_NAME(_topology_dup)
+
+#define hwloc_tma HWLOC_NAME(tma)
+#define hwloc_tma_malloc HWLOC_NAME(tma_malloc)
+#define hwloc_tma_calloc HWLOC_NAME(tma_calloc)
+#define hwloc_tma_strdup HWLOC_NAME(tma_strdup)
+#define hwloc_bitmap_tma_dup HWLOC_NAME(bitmap_tma_dup)
+
+/* private/solaris-chiptype.h */
+
+#define hwloc_solaris_chip_info_s HWLOC_NAME(solaris_chip_info_s)
+#define hwloc_solaris_get_chip_info HWLOC_NAME(solaris_get_chip_info)
+
+#endif /* HWLOC_SYM_TRANSFORM */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_RENAME_H */
diff --git a/deps/hwloc/lib/libhwloc.a b/deps/hwloc/lib/libhwloc.a
new file mode 100644
index 000000000..540a7d065
Binary files /dev/null and b/deps/hwloc/lib/libhwloc.a differ
diff --git a/deps/mpi/bin/hydra_bstrap_proxy b/deps/mpi/bin/hydra_bstrap_proxy
new file mode 100755
index 000000000..218ff80c3
Binary files /dev/null and b/deps/mpi/bin/hydra_bstrap_proxy differ
diff --git a/mpi/bin/hydra_nameserver b/deps/mpi/bin/hydra_nameserver
similarity index 66%
rename from mpi/bin/hydra_nameserver
rename to deps/mpi/bin/hydra_nameserver
index 6bff3150f..028fa08df 100755
Binary files a/mpi/bin/hydra_nameserver and b/deps/mpi/bin/hydra_nameserver differ
diff --git a/deps/mpi/bin/hydra_pmi_proxy b/deps/mpi/bin/hydra_pmi_proxy
new file mode 100755
index 000000000..14efe5656
Binary files /dev/null and b/deps/mpi/bin/hydra_pmi_proxy differ
diff --git a/mpi/bin/mpicc b/deps/mpi/bin/mpicc
similarity index 100%
rename from mpi/bin/mpicc
rename to deps/mpi/bin/mpicc
diff --git a/mpi/bin/mpicxx b/deps/mpi/bin/mpicxx
similarity index 100%
rename from mpi/bin/mpicxx
rename to deps/mpi/bin/mpicxx
diff --git a/deps/mpi/bin/mpiexec b/deps/mpi/bin/mpiexec
new file mode 100755
index 000000000..8826a76d3
Binary files /dev/null and b/deps/mpi/bin/mpiexec differ
diff --git a/deps/mpi/bin/mpiexec.hydra b/deps/mpi/bin/mpiexec.hydra
new file mode 100755
index 000000000..8826a76d3
Binary files /dev/null and b/deps/mpi/bin/mpiexec.hydra differ
diff --git a/mpi/bin/mpigcc b/deps/mpi/bin/mpigcc
similarity index 99%
rename from mpi/bin/mpigcc
rename to deps/mpi/bin/mpigcc
index 4e0233819..9a306a10a 100755
--- a/mpi/bin/mpigcc
+++ b/deps/mpi/bin/mpigcc
@@ -104,7 +104,7 @@ CFLAGS=""
 CPPFLAGS=""
 LDFLAGS=" -Wl,-z,now -Wl,-z,relro -Wl,-z,noexecstack -Xlinker --enable-new-dtags -ldl "
 LIBS="-lm   -lpthread  -lfabric -lrt "
-MPIVERSION="2021.2"
+MPIVERSION="2021.3"
 MPILIBNAME="mpi"                           
 
 
diff --git a/mpi/bin/mpigxx b/deps/mpi/bin/mpigxx
similarity index 99%
rename from mpi/bin/mpigxx
rename to deps/mpi/bin/mpigxx
index 3841ece4f..ca11d0e20 100755
--- a/mpi/bin/mpigxx
+++ b/deps/mpi/bin/mpigxx
@@ -101,7 +101,7 @@ MPICH_VERSION="3.3"
 CXXFLAGS=""
 LDFLAGS=" -Wl,-z,now -Wl,-z,relro -Wl,-z,noexecstack -Xlinker --enable-new-dtags -ldl "
 LIBS="-lm   -lpthread  -lfabric -lrt "
-MPIVERSION="2021.2"
+MPIVERSION="2021.3"
 MPILIBNAME="mpi"
 MPICXXLIBNAME="mpicxx"
 
diff --git a/mpi/bin/mpiicc b/deps/mpi/bin/mpiicc
similarity index 99%
rename from mpi/bin/mpiicc
rename to deps/mpi/bin/mpiicc
index 3922dad5b..c623722ee 100755
--- a/mpi/bin/mpiicc
+++ b/deps/mpi/bin/mpiicc
@@ -122,7 +122,7 @@ MPILIBNAME="mpi"
 PMPILIBNAME="pmpi"
 
 # MPIVERSION is the version of the MPICH2 library that mpicc is intended for
-MPIVERSION="2021.2"
+MPIVERSION="2021.3"
 #
 # Internal variables
 # Show is set to echo to cause the compilation command to be echoed instead
diff --git a/mpi/bin/mpiicpc b/deps/mpi/bin/mpiicpc
similarity index 99%
rename from mpi/bin/mpiicpc
rename to deps/mpi/bin/mpiicpc
index 62667fb05..13695ab64 100755
--- a/mpi/bin/mpiicpc
+++ b/deps/mpi/bin/mpiicpc
@@ -121,7 +121,7 @@ PMPILIBNAME="pmpi"
 MPICXXLIBNAME="mpicxx"
 
 # MPIVERSION is the version of the Intel(R) MPI Library that mpiicpc is intended for
-MPIVERSION="2021.2"
+MPIVERSION="2021.3"
 #
 # Internal variables
 # Show is set to echo to cause the compilation command to be echoed instead
diff --git a/mpi/bin/mpirun b/deps/mpi/bin/mpirun
similarity index 100%
rename from mpi/bin/mpirun
rename to deps/mpi/bin/mpirun
diff --git a/mpi/etc/tuning_clx-ap_ofi.dat b/deps/mpi/etc/tuning_clx-ap_ofi.dat
similarity index 100%
rename from mpi/etc/tuning_clx-ap_ofi.dat
rename to deps/mpi/etc/tuning_clx-ap_ofi.dat
diff --git a/mpi/etc/tuning_clx-ap_shm-ofi.dat b/deps/mpi/etc/tuning_clx-ap_shm-ofi.dat
similarity index 56%
rename from mpi/etc/tuning_clx-ap_shm-ofi.dat
rename to deps/mpi/etc/tuning_clx-ap_shm-ofi.dat
index a3154cb4a..a6988c57a 100755
Binary files a/mpi/etc/tuning_clx-ap_shm-ofi.dat and b/deps/mpi/etc/tuning_clx-ap_shm-ofi.dat differ
diff --git a/deps/mpi/etc/tuning_clx-ap_shm.dat b/deps/mpi/etc/tuning_clx-ap_shm.dat
new file mode 100755
index 000000000..95cac35d0
Binary files /dev/null and b/deps/mpi/etc/tuning_clx-ap_shm.dat differ
diff --git a/mpi/etc/tuning_generic_ofi.dat b/deps/mpi/etc/tuning_generic_ofi.dat
similarity index 100%
rename from mpi/etc/tuning_generic_ofi.dat
rename to deps/mpi/etc/tuning_generic_ofi.dat
diff --git a/mpi/etc/tuning_generic_shm-ofi.dat b/deps/mpi/etc/tuning_generic_shm-ofi.dat
similarity index 100%
rename from mpi/etc/tuning_generic_shm-ofi.dat
rename to deps/mpi/etc/tuning_generic_shm-ofi.dat
diff --git a/mpi/etc/tuning_generic_shm.dat b/deps/mpi/etc/tuning_generic_shm.dat
similarity index 100%
rename from mpi/etc/tuning_generic_shm.dat
rename to deps/mpi/etc/tuning_generic_shm.dat
diff --git a/mpi/etc/tuning_knl_ofi.dat b/deps/mpi/etc/tuning_knl_ofi.dat
similarity index 100%
rename from mpi/etc/tuning_knl_ofi.dat
rename to deps/mpi/etc/tuning_knl_ofi.dat
diff --git a/mpi/etc/tuning_knl_shm-ofi.dat b/deps/mpi/etc/tuning_knl_shm-ofi.dat
similarity index 100%
rename from mpi/etc/tuning_knl_shm-ofi.dat
rename to deps/mpi/etc/tuning_knl_shm-ofi.dat
diff --git a/mpi/etc/tuning_knl_shm.dat b/deps/mpi/etc/tuning_knl_shm.dat
similarity index 100%
rename from mpi/etc/tuning_knl_shm.dat
rename to deps/mpi/etc/tuning_knl_shm.dat
diff --git a/mpi/etc/tuning_skx_ofi.dat b/deps/mpi/etc/tuning_skx_ofi.dat
similarity index 100%
rename from mpi/etc/tuning_skx_ofi.dat
rename to deps/mpi/etc/tuning_skx_ofi.dat
diff --git a/mpi/etc/tuning_skx_shm-ofi.dat b/deps/mpi/etc/tuning_skx_shm-ofi.dat
similarity index 79%
rename from mpi/etc/tuning_skx_shm-ofi.dat
rename to deps/mpi/etc/tuning_skx_shm-ofi.dat
index 74bfe7ba1..f9e770897 100755
Binary files a/mpi/etc/tuning_skx_shm-ofi.dat and b/deps/mpi/etc/tuning_skx_shm-ofi.dat differ
diff --git a/mpi/etc/tuning_skx_shm.dat b/deps/mpi/etc/tuning_skx_shm.dat
similarity index 79%
rename from mpi/etc/tuning_skx_shm.dat
rename to deps/mpi/etc/tuning_skx_shm.dat
index 74bfe7ba1..f9e770897 100755
Binary files a/mpi/etc/tuning_skx_shm.dat and b/deps/mpi/etc/tuning_skx_shm.dat differ
diff --git a/mpi/include/mpi.h b/deps/mpi/include/mpi.h
old mode 100755
new mode 100644
similarity index 99%
rename from mpi/include/mpi.h
rename to deps/mpi/include/mpi.h
index 39095b742..658e5a3a5
--- a/mpi/include/mpi.h
+++ b/deps/mpi/include/mpi.h
@@ -1,5 +1,5 @@
 /*
-    Copyright 2003-2021 Intel Corporation.
+    Copyright Intel Corporation.
     
     This software and the related documents are Intel copyrighted materials, and
     your use of them is governed by the express license under which they were
@@ -580,8 +580,8 @@ typedef int (MPI_Delete_function) ( MPI_Comm, int, void *, void * );
  * digits for REV, 1 digit for EXT and 2 digits for EXT_NUMBER. So,
  * 2019.0.0b0 will have the numeric version 20190000100.
  */
-#define I_MPI_VERSION "2021.2.0"
-#define I_MPI_NUMVERSION 20210200300
+#define I_MPI_VERSION "2021.3.0"
+#define I_MPI_NUMVERSION 20210300300
 
 /* for the datatype decoders */
 enum MPIR_Combiner_enum {
diff --git a/mpi/include/mpicxx.h b/deps/mpi/include/mpicxx.h
old mode 100755
new mode 100644
similarity index 99%
rename from mpi/include/mpicxx.h
rename to deps/mpi/include/mpicxx.h
index 3d27a661b..07c4ebce3
--- a/mpi/include/mpicxx.h
+++ b/deps/mpi/include/mpicxx.h
@@ -1,5 +1,5 @@
 /*
-    Copyright 2003-2021 Intel Corporation.
+    Copyright Intel Corporation.
     
     This software and the related documents are Intel copyrighted materials, and
     your use of them is governed by the express license under which they were
diff --git a/mpi/include/mpio.h b/deps/mpi/include/mpio.h
old mode 100755
new mode 100644
similarity index 99%
rename from mpi/include/mpio.h
rename to deps/mpi/include/mpio.h
index 74ce84e70..2e35d8913
--- a/mpi/include/mpio.h
+++ b/deps/mpi/include/mpio.h
@@ -1,5 +1,5 @@
 /*
-    Copyright 2003-2021 Intel Corporation.
+    Copyright Intel Corporation.
     
     This software and the related documents are Intel copyrighted materials, and
     your use of them is governed by the express license under which they were
diff --git a/deps/mpi/lib/libmpi.so b/deps/mpi/lib/libmpi.so
new file mode 100755
index 000000000..d7243ada7
Binary files /dev/null and b/deps/mpi/lib/libmpi.so differ
diff --git a/deps/mpi/lib/libmpi.so.12 b/deps/mpi/lib/libmpi.so.12
new file mode 100755
index 000000000..d7243ada7
Binary files /dev/null and b/deps/mpi/lib/libmpi.so.12 differ
diff --git a/deps/mpi/lib/libmpi.so.12.0 b/deps/mpi/lib/libmpi.so.12.0
new file mode 100755
index 000000000..d7243ada7
Binary files /dev/null and b/deps/mpi/lib/libmpi.so.12.0 differ
diff --git a/mpi/lib/libmpi.so.12.0.0 b/deps/mpi/lib/libmpi.so.12.0.0
similarity index 62%
rename from mpi/lib/libmpi.so.12.0.0
rename to deps/mpi/lib/libmpi.so.12.0.0
index d391200c7..d7243ada7 100755
Binary files a/mpi/lib/libmpi.so.12.0.0 and b/deps/mpi/lib/libmpi.so.12.0.0 differ
diff --git a/deps/mpi/lib/libmpicxx.so b/deps/mpi/lib/libmpicxx.so
new file mode 100755
index 000000000..aeeba2cb7
Binary files /dev/null and b/deps/mpi/lib/libmpicxx.so differ
diff --git a/deps/mpi/lib/libmpicxx.so.12 b/deps/mpi/lib/libmpicxx.so.12
new file mode 100755
index 000000000..aeeba2cb7
Binary files /dev/null and b/deps/mpi/lib/libmpicxx.so.12 differ
diff --git a/deps/mpi/lib/libmpicxx.so.12.0 b/deps/mpi/lib/libmpicxx.so.12.0
new file mode 100755
index 000000000..aeeba2cb7
Binary files /dev/null and b/deps/mpi/lib/libmpicxx.so.12.0 differ
diff --git a/mpi/lib/libmpicxx.so.12.0.0 b/deps/mpi/lib/libmpicxx.so.12.0.0
similarity index 99%
rename from mpi/lib/libmpicxx.so.12.0.0
rename to deps/mpi/lib/libmpicxx.so.12.0.0
index ee69659ef..aeeba2cb7 100755
Binary files a/mpi/lib/libmpicxx.so.12.0.0 and b/deps/mpi/lib/libmpicxx.so.12.0.0 differ
diff --git a/deps/mpi/lib/libmpifort.so b/deps/mpi/lib/libmpifort.so
new file mode 100755
index 000000000..f67aaad45
Binary files /dev/null and b/deps/mpi/lib/libmpifort.so differ
diff --git a/deps/mpi/lib/libmpifort.so.12 b/deps/mpi/lib/libmpifort.so.12
new file mode 100755
index 000000000..f67aaad45
Binary files /dev/null and b/deps/mpi/lib/libmpifort.so.12 differ
diff --git a/deps/mpi/lib/libmpifort.so.12.0 b/deps/mpi/lib/libmpifort.so.12.0
new file mode 100755
index 000000000..f67aaad45
Binary files /dev/null and b/deps/mpi/lib/libmpifort.so.12.0 differ
diff --git a/mpi/lib/libmpifort.so.12.0.0 b/deps/mpi/lib/libmpifort.so.12.0.0
similarity index 61%
rename from mpi/lib/libmpifort.so.12.0.0
rename to deps/mpi/lib/libmpifort.so.12.0.0
index 6cc0e68cb..f67aaad45 100755
Binary files a/mpi/lib/libmpifort.so.12.0.0 and b/deps/mpi/lib/libmpifort.so.12.0.0 differ
diff --git a/mpi/licensing/license.txt b/deps/mpi/licensing/license.txt
old mode 100755
new mode 100644
similarity index 100%
rename from mpi/licensing/license.txt
rename to deps/mpi/licensing/license.txt
diff --git a/mpi/licensing/third-party-programs.txt b/deps/mpi/licensing/third-party-programs.txt
old mode 100755
new mode 100644
similarity index 99%
rename from mpi/licensing/third-party-programs.txt
rename to deps/mpi/licensing/third-party-programs.txt
index bd2a5e95b..307780de4
--- a/mpi/licensing/third-party-programs.txt
+++ b/deps/mpi/licensing/third-party-programs.txt
@@ -1,4 +1,4 @@
-Intel(R) MPI Library 2021.2 Third Party Programs File
+Intel(R) MPI Library 2021.3 Third Party Programs File
 
 This file is the "third-party-programs.txt" file specified in the associated 
 Intel end user license agreement for the Intel software you are licensing.
diff --git a/deps/ofi/bin/fi_info b/deps/ofi/bin/fi_info
new file mode 100755
index 000000000..347648e8a
Binary files /dev/null and b/deps/ofi/bin/fi_info differ
diff --git a/ofi/include/rdma/fabric.h b/deps/ofi/include/rdma/fabric.h
similarity index 95%
rename from ofi/include/rdma/fabric.h
rename to deps/ofi/include/rdma/fabric.h
index 79e80e164..71628035e 100644
--- a/ofi/include/rdma/fabric.h
+++ b/deps/ofi/include/rdma/fabric.h
@@ -79,8 +79,8 @@ extern "C" {
 #endif
 
 #define FI_MAJOR_VERSION 1
-#define FI_MINOR_VERSION 11
-#define FI_REVISION_VERSION 0
+#define FI_MINOR_VERSION 12
+#define FI_REVISION_VERSION 1
 
 enum {
 	FI_PATH_MAX		= 256,
@@ -208,6 +208,7 @@ enum {
 	FI_ADDR_PSMX2,		/* uint64_t[2] */
 	FI_ADDR_IB_UD,		/* uint64_t[4] */
 	FI_ADDR_EFA,
+	FI_ADDR_PSMX3,		/* uint64_t[2] */
 };
 
 #define FI_ADDR_UNSPEC		((uint64_t) -1)
@@ -319,7 +320,8 @@ enum {
 	FI_PROTO_MRAIL,
 	FI_PROTO_RSTREAM,
 	FI_PROTO_RDMA_CM_IB_XRC,
-	FI_PROTO_EFA
+	FI_PROTO_EFA,
+	FI_PROTO_PSMX3
 };
 
 enum {
@@ -598,6 +600,11 @@ struct fi_alias {
 	uint64_t		flags;
 };
 
+struct fi_fid_var {
+	int		name;
+	void		*val;
+};
+
 struct fi_mr_raw_attr {
 	uint64_t	flags;
 	uint64_t	*base_addr;
@@ -632,6 +639,8 @@ enum {
 	FI_REFRESH,		/* mr: fi_mr_modify */
 	FI_DUP,			/* struct fid ** */
 	FI_GETWAITOBJ,		/*enum fi_wait_obj * */
+	FI_GET_VAL,		/* struct fi_fid_var */
+	FI_SET_VAL,		/* struct fi_fid_var */
 };
 
 static inline int fi_control(struct fid *fid, int command, void *arg)
@@ -647,6 +656,28 @@ static inline int fi_alias(struct fid *fid, struct fid **alias_fid, uint64_t fla
 	return fi_control(fid, FI_ALIAS, &alias);
 }
 
+/* fid value names */
+/*
+ * Currently no common name is defined. Provider specific names should
+ * have the FI_PROV_SPECIFIC bit set.
+ */
+
+static inline int fi_get_val(struct fid *fid, int name, void *val)
+{
+	struct fi_fid_var var;
+	var.name = name;
+	var.val = val;
+	return fi_control(fid, FI_GET_VAL, &var);
+}
+
+static inline int fi_set_val(struct fid *fid, int name, void *val)
+{
+	struct fi_fid_var var;
+	var.name = name;
+	var.val = val;
+	return fi_control(fid, FI_SET_VAL, &var);
+}
+
 static inline int
 fi_open_ops(struct fid *fid, const char *name, uint64_t flags,
 	    void **ops, void *context)
@@ -692,6 +723,8 @@ enum fi_type {
 };
 
 char *fi_tostr(const void *data, enum fi_type datatype);
+char *fi_tostr_r(char *buf, size_t len, const void *data,
+		 enum fi_type datatype);
 
 enum fi_param_type {
 	FI_PARAM_STRING,
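The new fi_get_val/fi_set_val helpers are thin wrappers that pack the name/value pair into the new struct fi_fid_var and route it through fi_control() with FI_GET_VAL/FI_SET_VAL. A hedged usage sketch; FI_PROV_SPECIFIC | 1 is a placeholder name, since real names are provider-defined and, per the comment above, must carry the FI_PROV_SPECIFIC bit:

    #include <rdma/fabric.h>

    static int toggle_provider_var(struct fid *fid) {
        int enable = 1;
        // Providers that do not recognize the name fail the fi_control()
        // call, so callers can probe and fall back gracefully.
        int ret = fi_set_val(fid, FI_PROV_SPECIFIC | 1, &enable);
        if (ret)
            return ret;
        int current = 0;
        return fi_get_val(fid, FI_PROV_SPECIFIC | 1, &current);
    }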
diff --git a/ofi/include/rdma/fi_cm.h b/deps/ofi/include/rdma/fi_cm.h
similarity index 100%
rename from ofi/include/rdma/fi_cm.h
rename to deps/ofi/include/rdma/fi_cm.h
diff --git a/ofi/include/rdma/fi_domain.h b/deps/ofi/include/rdma/fi_domain.h
similarity index 100%
rename from ofi/include/rdma/fi_domain.h
rename to deps/ofi/include/rdma/fi_domain.h
diff --git a/ofi/include/rdma/fi_endpoint.h b/deps/ofi/include/rdma/fi_endpoint.h
similarity index 100%
rename from ofi/include/rdma/fi_endpoint.h
rename to deps/ofi/include/rdma/fi_endpoint.h
diff --git a/ofi/include/rdma/fi_eq.h b/deps/ofi/include/rdma/fi_eq.h
similarity index 100%
rename from ofi/include/rdma/fi_eq.h
rename to deps/ofi/include/rdma/fi_eq.h
diff --git a/ofi/include/rdma/fi_errno.h b/deps/ofi/include/rdma/fi_errno.h
similarity index 100%
rename from ofi/include/rdma/fi_errno.h
rename to deps/ofi/include/rdma/fi_errno.h
diff --git a/ofi/include/rdma/fi_rma.h b/deps/ofi/include/rdma/fi_rma.h
similarity index 100%
rename from ofi/include/rdma/fi_rma.h
rename to deps/ofi/include/rdma/fi_rma.h
diff --git a/ofi/include/rdma/fi_tagged.h b/deps/ofi/include/rdma/fi_tagged.h
similarity index 100%
rename from ofi/include/rdma/fi_tagged.h
rename to deps/ofi/include/rdma/fi_tagged.h
diff --git a/deps/ofi/lib/libfabric.so b/deps/ofi/lib/libfabric.so
new file mode 100755
index 000000000..35c21dfc3
Binary files /dev/null and b/deps/ofi/lib/libfabric.so differ
diff --git a/deps/ofi/lib/libfabric.so.1 b/deps/ofi/lib/libfabric.so.1
new file mode 100755
index 000000000..35c21dfc3
Binary files /dev/null and b/deps/ofi/lib/libfabric.so.1 differ
diff --git a/deps/ofi/lib/prov/libpsm3-fi.so b/deps/ofi/lib/prov/libpsm3-fi.so
new file mode 100755
index 000000000..47830166d
Binary files /dev/null and b/deps/ofi/lib/prov/libpsm3-fi.so differ
diff --git a/deps/ofi/lib/prov/libpsmx2-fi.so b/deps/ofi/lib/prov/libpsmx2-fi.so
new file mode 100755
index 000000000..375463c58
Binary files /dev/null and b/deps/ofi/lib/prov/libpsmx2-fi.so differ
diff --git a/deps/ofi/lib/prov/librxm-fi.so b/deps/ofi/lib/prov/librxm-fi.so
new file mode 100755
index 000000000..83af28e2e
Binary files /dev/null and b/deps/ofi/lib/prov/librxm-fi.so differ
diff --git a/deps/ofi/lib/prov/libshm-fi.so b/deps/ofi/lib/prov/libshm-fi.so
new file mode 100755
index 000000000..dfce33131
Binary files /dev/null and b/deps/ofi/lib/prov/libshm-fi.so differ
diff --git a/deps/ofi/lib/prov/libsockets-fi.so b/deps/ofi/lib/prov/libsockets-fi.so
new file mode 100755
index 000000000..b164233e5
Binary files /dev/null and b/deps/ofi/lib/prov/libsockets-fi.so differ
diff --git a/deps/ofi/lib/prov/libtcp-fi.so b/deps/ofi/lib/prov/libtcp-fi.so
new file mode 100755
index 000000000..10f430bc1
Binary files /dev/null and b/deps/ofi/lib/prov/libtcp-fi.so differ
diff --git a/deps/ofi/lib/prov/libverbs-fi.so b/deps/ofi/lib/prov/libverbs-fi.so
new file mode 100755
index 000000000..2a895fbd5
Binary files /dev/null and b/deps/ofi/lib/prov/libverbs-fi.so differ
diff --git a/doc/requirements.txt b/doc/requirements.txt
index e82b1e325..5c84e63d8 100644
--- a/doc/requirements.txt
+++ b/doc/requirements.txt
@@ -41,7 +41,8 @@ requests==2.22.0
 six==1.12.0
 snowballstemmer==1.9.0
 soupsieve==1.9.2
-Sphinx==2.1.2
+Sphinx==3.5.4
+sphinx-book-theme==0.1.0
 sphinx-rtd-theme==0.4.3
 sphinxcontrib-applehelp==1.0.1
 sphinxcontrib-devhelp==1.0.1
diff --git a/doc/rst/Readme.txt b/doc/rst/Readme.txt
old mode 100755
new mode 100644
diff --git a/doc/rst/source/_static/favicons.png b/doc/rst/source/_static/favicons.png
new file mode 100644
index 000000000..f450376b1
Binary files /dev/null and b/doc/rst/source/_static/favicons.png differ
diff --git a/doc/rst/source/_static/oneAPI-rgb-rev-100.png b/doc/rst/source/_static/oneAPI-rgb-rev-100.png
new file mode 100644
index 000000000..58d2d5c54
Binary files /dev/null and b/doc/rst/source/_static/oneAPI-rgb-rev-100.png differ
diff --git a/doc/rst/source/_static/style.css b/doc/rst/source/_static/style.css
old mode 100755
new mode 100644
diff --git a/doc/rst/source/_templates/layout.html b/doc/rst/source/_templates/layout.html
old mode 100755
new mode 100644
diff --git a/doc/rst/source/api/operations/collective-operations/alltoallv.rst b/doc/rst/source/api/operations/collective-operations/alltoallv.rst
index 9161a6cd9..0aae683d5 100644
--- a/doc/rst/source/api/operations/collective-operations/alltoallv.rst
+++ b/doc/rst/source/api/operations/collective-operations/alltoallv.rst
@@ -1,7 +1,7 @@
 Alltoallv 
 *********
 
-.. doxygengroup:: alltoall
+.. doxygengroup:: alltoallv
    :project: oneccl
    :content-only:
-   :no-link:
\ No newline at end of file
+   :no-link:
diff --git a/doc/rst/source/conf.py b/doc/rst/source/conf.py
index dae20b4ad..4c0fbd89f 100755
--- a/doc/rst/source/conf.py
+++ b/doc/rst/source/conf.py
@@ -17,12 +17,12 @@
 
 # -- Project information -----------------------------------------------------
 
-project = 'oneCCL Documentation'
-copyright = '2019–2020'
+project = 'oneCCL'
+copyright = '2019–2021'
 author = 'Intel'
 
 # The full version, including alpha/beta/rc tags
-release = '2021'
+# release = '2021'
 
 rst_prolog = """
 .. |product_full| replace:: Intel\ |reg|\  oneAPI Collective Communications Library
@@ -105,19 +105,15 @@
 # Tell sphinx what the pygments highlight language should be.
 highlight_language = 'cpp'
 
-import sphinx_rtd_theme
-html_theme = 'sphinx_rtd_theme'
-html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
-if on_rtd:
-    using_rtd_theme = True
+html_theme = 'sphinx_book_theme'
+html_logo = '_static/oneAPI-rgb-rev-100.png'
+html_favicon = '_static/favicons.png'
 
 # Theme options
 html_theme_options = {
-    # 'typekit_id': 'hiw1hhg',
-    # 'analytics_id': '',
-    # 'sticky_navigation': True  # Set to False to disable the sticky nav while scrolling.
-    'logo_only': True,  # if we have a html_logo below, this shows /only/ the logo with no title text
-    'collapse_navigation': False,  # Collapse navigation (False makes it tree-like)
-    # 'display_version': True,  # Display the docs version
-    # 'navigation_depth': 4,  # Depth of the headers shown in the navigation bar
+    'repository_url': 'https://github.com/oneapi-src/oneCCL',
+    'path_to_docs': 'doc/source',
+    'use_issues_button': True,
+    'use_edit_page_button': True,
+    'repository_branch': 'master'
 }
diff --git a/doc/rst/source/env-variables.rst b/doc/rst/source/env-variables.rst
index 8f83d9f44..988554c9f 100644
--- a/doc/rst/source/env-variables.rst
+++ b/doc/rst/source/env-variables.rst
@@ -556,3 +556,60 @@ CCL_MAX_SHORT_SIZE
 **Description**
 
 Set this environment variable to specify the threshold of the number of bytes for a collective operation to be split.
+
+
+CCL_MNIC
+########
+**Syntax**
+
+::
+
+  CCL_MNIC=<value>
+
+**Arguments**
+
+.. list-table::
+   :widths: 25 50
+   :header-rows: 1
+   :align: left
+
+   * - <value>
+     - Description
+   * - ``global``
+     - Select all NICs available on the node.
+   * - ``local``
+     - Select all NICs local to the NUMA node that corresponds to the process pinning.
+   * - ``none``
+     - Disable special NIC selection and use a single default NIC (**default**).
+
+**Description**
+
+Set this environment variable to control the multi-NIC selection policy.
+|product_short| workers are pinned to the selected NICs in a round-robin manner.
+
+
+CCL_MNIC_COUNT
+##############
+**Syntax**
+
+::
+
+  CCL_MNIC_COUNT=<value>
+
+**Arguments**
+
+.. list-table::
+   :widths: 25 50
+   :header-rows: 1
+   :align: left
+
+   * - <value>
+     - Description
+   * - ``N``
+     - The maximum number of NICs that should be selected for |product_short| workers.
+       If not specified, it defaults to the number of |product_short| workers.
+
+**Description**
+
+Set this environment variable to specify the maximum number of NICs to be selected.
+The actual number of NICs selected may be smaller due to transport-level limitations or the system configuration.
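Both knobs are ordinary environment variables read by the library, so a launcher or test driver can set them before initialization. A minimal sketch, assuming the variables are picked up when the library initializes:

    #include <cstdlib>

    int main() {
        // Select NICs local to the process's NUMA node, at most two of them.
        setenv("CCL_MNIC", "local", /*overwrite=*/1);
        setenv("CCL_MNIC_COUNT", "2", 1);
        // ... initialize the library and run the workload here ...
        return 0;
    }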
diff --git a/doc/rst/source/index.rst b/doc/rst/source/index.rst
old mode 100755
new mode 100644
diff --git a/doc/rst/source/legal.rst b/doc/rst/source/legal.rst
old mode 100755
new mode 100644
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
old mode 100755
new mode 100644
index f5922c224..499d34283
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -17,6 +17,9 @@ cmake_minimum_required (VERSION 2.8)
 
 if (DEFINED ENV{CCL_CONFIGURATION})
     set(CCL_CONFIGURATION "$ENV{CCL_CONFIGURATION}")
+    if(${CCL_CONFIGURATION} STREQUAL "cpu_gpu_dpcpp")
+        set(COMPUTE_BACKEND_TARGET_NAME "sycl")
+    endif()
 endif()
 
 if (DEFINED ENV{CCL_ROOT})
@@ -25,6 +28,11 @@ else()
     message(FATAL_ERROR "Please define CCL_ROOT environment variable")
 endif()
 
+if (DEFINED ENV{I_MPI_ROOT})
+    set(I_MPI_ROOT "$ENV{I_MPI_ROOT}")
+    set(CMAKE_INSTALL_RPATH "${I_MPI_ROOT}/lib/release_mt/")
+endif()
+
 message(STATUS "CCL_ROOT: ${CCL_ROOT}")
 message(STATUS "CCL_CONFIGURATION: ${CCL_CONFIGURATION}")
 
@@ -58,11 +66,13 @@ set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} ${CXX_COMP
 set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
-if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
+if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" OR (${CMAKE_CXX_COMPILER_ID} STREQUAL "IntelLLVM"))
     set(CMAKE_CLANG_FLAGS "-fsycl")
     set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -lsycl")
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_CLANG_FLAGS}")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CLANG_FLAGS}")
+    # Use C++17 to stay aligned with the compiler
+    set(CMAKE_CXX_STANDARD 17)
 endif()
 
 set(GCC_BF16_MIN_SUPPORTED "4.9.0")
@@ -85,10 +95,6 @@ if (CCL_BF16_COMPILER)
     endif()
 endif()
 
-
-include_directories(${CCL_ROOT}/include/${CCL_CONFIGURATION})
-link_directories(${CCL_ROOT}/lib/${CCL_CONFIGURATION})
-
 include_directories(include)
 
 add_subdirectory(cpu)
diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt
index 24ce78b4d..41879eb9e 100644
--- a/examples/benchmark/CMakeLists.txt
+++ b/examples/benchmark/CMakeLists.txt
@@ -23,16 +23,24 @@ endif()
 include_directories(include)
 include_directories(src)
 
+list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
+find_package(NUMA)
+
 foreach(src ${sources})
     get_filename_component(executable ${src} NAME_WE)
     add_executable(${executable} ${src})
+    if (NUMA_FOUND)
+        target_include_directories(${executable} PRIVATE ${NUMA_INCLUDE_DIR})
+        target_link_libraries(${executable} PRIVATE numa)
+        target_compile_definitions(${executable} PRIVATE CCL_ENABLE_NUMA)
+    endif()
     target_include_directories(${executable} PRIVATE ${EXAMPLES_INC_DIRS})
     target_link_libraries(${executable} PRIVATE ccl)
     target_link_libraries(${executable} PUBLIC pthread)
     target_link_libraries(${executable} PUBLIC rt)
     target_link_libraries(${executable} PUBLIC m)
     target_link_libraries(${executable} PUBLIC dl)
-    target_link_libraries(${executable} PRIVATE m)
+    target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release_mt/)
     target_link_libraries(${executable} PUBLIC mpi)
     install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/benchmark OPTIONAL)
 endforeach()
diff --git a/examples/benchmark/include/benchmark.hpp b/examples/benchmark/include/benchmark.hpp
index 2ab6646ce..d7c624d5c 100644
--- a/examples/benchmark/include/benchmark.hpp
+++ b/examples/benchmark/include/benchmark.hpp
@@ -19,18 +19,19 @@
 #include <chrono>
 #include <cstring>
 #include <getopt.h>
+#include <fstream>
 #include <functional>
 #include <iostream>
 #include <iterator>
+#include <iomanip>
 #include <numeric>
 #include <map>
-#include <math.h>
+#include <cmath>
 #include <numeric>
 #include <stdexcept>
-#include <stdio.h>
+#include <cstdio>
 #include <sys/time.h>
 #include <vector>
-#include <fstream>
 
 #ifdef CCL_ENABLE_SYCL
 #include <CL/sycl.hpp>
@@ -44,6 +45,7 @@ using namespace cl::sycl::access;
 #include "coll.hpp"
 #include "sparse_allreduce/sparse_detail.hpp"
 
+/* free option letters: g v z */
 void print_help_usage(const char* app) {
     PRINT("\nUSAGE:\n"
           "\t%s [OPTIONS]\n\n"
@@ -52,30 +54,36 @@ void print_help_usage(const char* app) {
           "\t[-e,--loop <execution loop>]: %s\n"
           "\t[-i,--iters <iteration count>]: %d\n"
           "\t[-w,--warmup_iters <warm up iteration count>]: %d\n"
+          "\t[-j,--iter_policy <iteration policy>]: %s\n"
           "\t[-n,--buf_count <number of parallel operations within single collective>]: %d\n"
           "\t[-f,--min_elem_count <minimum number of elements for single collective>]: %d\n"
           "\t[-t,--max_elem_count <maximum number of elements for single collective>]: %d\n"
           "\t[-y,--elem_counts <list of element counts for single collective>]: [%d-%d]\n"
           "\t[-c,--check <check result correctness>]: %d\n"
           "\t[-p,--cache <use persistent operations>]: %d\n"
+          "\t[-q,--inplace <use same buffer as send and recv buffer>]: %d\n"
+          "\t[-k,--ranks_per_proc <number of ranks per process>]: %d\n"
+#ifdef CCL_ENABLE_NUMA
+          "\t[-s,--numa_node <numa node for allocation of send and recv buffers>]: %s\n"
+#endif /* CCL_ENABLE_NUMA */
 #ifdef CCL_ENABLE_SYCL
           "\t[-a,--sycl_dev_type <sycl device type>]: %s\n"
           "\t[-m,--sycl_mem_type <sycl memory type>]: %s\n"
           "\t[-u,--sycl_usm_type <sycl usm type>]: %s\n"
-#endif
-          "\t[-k,--ranks_per_proc <number of ranks per process>]: %d\n"
+#endif /* CCL_ENABLE_SYCL */
           "\t[-l,--coll <collectives list/all>]: %s\n"
           "\t[-d,--dtype <datatypes list/all>]: %s\n"
           "\t[-r,--reduction <reductions list/all>]: %s\n"
           "\t[-o,--csv_filepath <file to store CSV-formatted data into>]: %s\n"
+          "\t[-x,--ext <show additional information>]\n"
           "\t[-h,--help]\n\n"
-          "example:\n\t--coll allgatherv,allreduce --backend host --loop regular\n"
-          "example:\n\t--coll bcast,reduce --backend sycl --loop unordered \n",
+          "example:\n\t--coll allgatherv,allreduce --backend host --elem_counts 64,1024\n",
           app,
           backend_names[DEFAULT_BACKEND].c_str(),
           loop_names[DEFAULT_LOOP].c_str(),
           DEFAULT_ITERS,
           DEFAULT_WARMUP_ITERS,
+          iter_policy_names[DEFAULT_ITER_POLICY].c_str(),
           DEFAULT_BUF_COUNT,
           DEFAULT_MIN_ELEM_COUNT,
           DEFAULT_MAX_ELEM_COUNT,
@@ -83,12 +91,16 @@ void print_help_usage(const char* app) {
           DEFAULT_MAX_ELEM_COUNT,
           DEFAULT_CHECK_VALUES,
           DEFAULT_CACHE_OPS,
+          DEFAULT_INPLACE,
+          DEFAULT_RANKS_PER_PROC,
+#ifdef CCL_ENABLE_NUMA
+          DEFAULT_NUMA_NODE_STR,
+#endif /* CCL_ENABLE_NUMA */
 #ifdef CCL_ENABLE_SYCL
           sycl_dev_names[DEFAULT_SYCL_DEV_TYPE].c_str(),
           sycl_mem_names[DEFAULT_SYCL_MEM_TYPE].c_str(),
           sycl_usm_names[DEFAULT_SYCL_USM_TYPE].c_str(),
-#endif
-          DEFAULT_RANKS_PER_PROC,
+#endif /* CCL_ENABLE_SYCL */
           DEFAULT_COLL_LIST,
           DEFAULT_DTYPES_LIST,
           DEFAULT_REDUCTIONS_LIST,
@@ -166,6 +178,20 @@ int set_loop(const std::string& option_value, loop_type_t& loop) {
     return 0;
 }
 
+int set_iter_policy(const std::string& option_value, iter_policy_t& policy) {
+    std::string option_name = "iter_policy";
+    std::set<std::string> supported_option_values{ iter_policy_names[ITER_POLICY_OFF],
+                                                   iter_policy_names[ITER_POLICY_AUTO] };
+
+    if (check_supported_options(option_name, option_value, supported_option_values))
+        return -1;
+
+    policy =
+        (option_value == iter_policy_names[ITER_POLICY_OFF]) ? ITER_POLICY_OFF : ITER_POLICY_AUTO;
+
+    return 0;
+}
+
 #ifdef CCL_ENABLE_SYCL
 int set_sycl_dev_type(const std::string& option_value, sycl_dev_type_t& dev) {
     std::string option_name = "sycl_dev_type";
@@ -241,11 +267,8 @@ int set_datatypes(std::string option_value, int check_values, std::list<std::str
                 if ((dt == dtype_names[ccl::datatype::float16] ||
                      dt == dtype_names[ccl::datatype::bfloat16]) &&
                     check_values) {
-                    PRINT(
-                        "correctness checking is not implemented for '%s', try to disable checking with '-c 0' option",
-                        dt.c_str());
+                    PRINT("WARN: correctness checking is not implemented for '%s'", dt.c_str());
                 }
-                return -1;
             }
         }
     }
@@ -277,23 +300,27 @@ int set_reductions(std::string option_value, int check_values, std::list<std::st
         for (auto r : reductions) {
             if (check_supported_options(option_name, r, supported_option_values)) {
                 if ((r != reduction_names[ccl::reduction::sum]) && check_values) {
-                    PRINT(
-                        "correctness checking is not implemented for '%s', try to disable checking with '-c 0' option",
-                        r.c_str());
+                    PRINT("WARN: correctness checking is not implemented for '%s'", r.c_str());
                 }
-                return -1;
             }
         }
     }
     return 0;
 }
 
-size_t get_iter_count(size_t bytes, size_t max_iter_count) {
+size_t get_iter_count(size_t bytes, size_t max_iter_count, iter_policy_t policy) {
     size_t n, res = max_iter_count;
-    n = bytes >> 18;
-    while (n) {
-        res >>= 1;
-        n >>= 1;
+
+    switch (policy) {
+        case ITER_POLICY_OFF: break;
+        case ITER_POLICY_AUTO:
+            n = bytes >> 18;
+            while (n) {
+                res >>= 1;
+                n >>= 1;
+            }
+            break;
+        default: ASSERT(0, "unknown iter_policy %d", policy); break;
     }
 
     if (!res && max_iter_count)
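Under ITER_POLICY_AUTO the iteration count is halved once per loop pass while bytes >> 18 shifts down to zero, so counts shrink roughly logarithmically past 256 KiB; ITER_POLICY_OFF keeps max_iter_count as-is. A standalone sketch of the same arithmetic (assuming the final clamp to one iteration):

    #include <cassert>
    #include <cstddef>

    static size_t auto_iters(size_t bytes, size_t max_iter_count) {
        size_t res = max_iter_count;
        for (size_t n = bytes >> 18; n; n >>= 1)
            res >>= 1; // one halving per pass
        return (!res && max_iter_count) ? 1 : res;
    }

    int main() {
        assert(auto_iters(1024, 16) == 16);   // below 256 KiB: full count
        assert(auto_iters(1 << 20, 16) == 2); // 1 MiB: halved three times
        assert(auto_iters(1 << 30, 16) == 1); // huge: clamped to one
        return 0;
    }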
@@ -302,92 +329,136 @@ size_t get_iter_count(size_t bytes, size_t max_iter_count) {
     return res;
 }
 
+void store_to_csv(const user_options_t& options,
+                  size_t nranks,
+                  size_t elem_count,
+                  size_t iter_count,
+                  ccl::datatype dtype,
+                  ccl::reduction op,
+                  double min_time,
+                  double max_time,
+                  double avg_time,
+                  double stddev,
+                  double wait_avg_time) {
+    std::ofstream csvf;
+    csvf.open(options.csv_filepath, std::ofstream::out | std::ofstream::app);
+
+    if (csvf.is_open()) {
+        const size_t buf_count = options.buf_count;
+
+        for (const auto& cop : options.coll_names) {
+            auto get_op_name = [&]() {
+                if (cop == "allreduce" || cop == "reduce_scatter" || cop == "reduce") {
+                    return reduction_names.at(op);
+                }
+                return std::string{};
+            };
+
+            csvf << nranks << "," << cop << "," << get_op_name() << "," << dtype_names.at(dtype)
+                 << "," << ccl::get_datatype_size(dtype) << "," << elem_count << "," << buf_count
+                 << "," << iter_count << "," << min_time << "," << max_time << "," << avg_time
+                 << "," << stddev << "," << wait_avg_time << std::endl;
+        }
+        csvf.close();
+    }
+}
+
 /* timer array contains one number per collective, one collective corresponds to ranks_per_proc */
-void print_timings(ccl::communicator& comm,
-                   const std::vector<double>& local_timers,
+void print_timings(const ccl::communicator& comm,
+                   const std::vector<double>& local_total_timers,
+                   const std::vector<double>& local_wait_timers,
                    const user_options_t& options,
-                   const size_t elem_count,
-                   const size_t iter_count,
+                   size_t elem_count,
+                   size_t iter_count,
                    ccl::datatype dtype,
                    ccl::reduction op) {
     const size_t buf_count = options.buf_count;
     const size_t ncolls = options.coll_names.size();
-    std::vector<double> all_timers(ncolls * comm.size());
-    std::vector<size_t> recv_counts(comm.size());
+    const size_t nranks = comm.size();
+
+    // get timers from other ranks
+    std::vector<double> all_ranks_total_timers(ncolls * nranks);
+    std::vector<double> all_ranks_wait_timers(ncolls * nranks);
+    std::vector<size_t> recv_counts(nranks, ncolls);
 
-    int idx;
-    for (idx = 0; idx < comm.size(); idx++)
-        recv_counts[idx] = ncolls;
+    std::vector<ccl::event> events;
+    events.push_back(ccl::allgatherv(
+        local_total_timers.data(), ncolls, all_ranks_total_timers.data(), recv_counts, comm));
+    events.push_back(ccl::allgatherv(
+        local_wait_timers.data(), ncolls, all_ranks_wait_timers.data(), recv_counts, comm));
 
-    ccl::allgatherv(local_timers.data(), ncolls, all_timers.data(), recv_counts, comm).wait();
+    for (ccl::event& ev : events) {
+        ev.wait();
+    }
 
     if (comm.rank() == 0) {
-        std::vector<double> timers(comm.size(), 0);
-        for (int r = 0; r < comm.size(); ++r) {
-            for (size_t c = 0; c < ncolls; ++c) {
-                timers[r] += all_timers[r * ncolls + c];
+        std::vector<double> total_timers(nranks, 0);
+        std::vector<double> wait_timers(nranks, 0);
+        std::vector<double> min_timers(ncolls, 0);
+        std::vector<double> max_timers(ncolls, 0);
+
+        // parse timers from all ranks
+        for (size_t rank_idx = 0; rank_idx < nranks; ++rank_idx) {
+            for (size_t coll_idx = 0; coll_idx < ncolls; ++coll_idx) {
+                double total_time = all_ranks_total_timers.at(rank_idx * ncolls + coll_idx);
+                double wait_time = all_ranks_wait_timers.at(rank_idx * ncolls + coll_idx);
+                total_timers.at(rank_idx) += total_time;
+                wait_timers.at(rank_idx) += wait_time;
+
+                double& min = min_timers.at(coll_idx);
+                min = (min != 0) ? std::min(min, total_time) : total_time;
+
+                double& max = max_timers.at(coll_idx);
+                max = std::max(max, total_time);
             }
         }
 
-        double avg_timer(0);
-        double avg_timer_per_buf(0);
-        for (idx = 0; idx < comm.size(); idx++) {
-            avg_timer += timers[idx];
-        }
-        avg_timer /= (iter_count * comm.size());
-        avg_timer_per_buf = avg_timer / buf_count;
+        double total_avg_time = std::accumulate(total_timers.begin(), total_timers.end(), 0.0);
+        total_avg_time /= iter_count * nranks;
 
-        double stddev_timer = 0;
-        double sum = 0;
-        for (idx = 0; idx < comm.size(); idx++) {
-            double val = timers[idx] / iter_count;
-            sum += (val - avg_timer) * (val - avg_timer);
-        }
+        double wait_avg_time = std::accumulate(wait_timers.begin(), wait_timers.end(), 0.0);
+        wait_avg_time /= iter_count * nranks;
 
-        stddev_timer = sqrt(sum / comm.size()) / avg_timer * 100;
-        if (buf_count == 1) {
-            printf("%10zu %12.1lf %11.1lf\n",
-                   elem_count * ccl::get_datatype_size(dtype) * buf_count,
-                   avg_timer,
-                   stddev_timer);
-        }
-        else {
-            printf("%10zu %13.1lf %18.1lf %11.1lf\n",
-                   elem_count * ccl::get_datatype_size(dtype) * buf_count,
-                   avg_timer,
-                   avg_timer_per_buf,
-                   stddev_timer);
+        double sum = 0;
+        for (const double& timer : total_timers) {
+            double latency = (double)timer / iter_count;
+            sum += (latency - total_avg_time) * (latency - total_avg_time);
         }
+        double stddev = std::sqrt((double)sum / nranks) / total_avg_time * 100;
 
-        // in case csv export is requested
-        // we write one line per collop, dtype and reduction
-        // hence average is per collop, not the aggregate over all
-        if (!options.csv_filepath.empty()) {
-            std::ofstream csvf;
-            csvf.open(options.csv_filepath, std::ios::app);
+        double min_time = std::accumulate(min_timers.begin(), min_timers.end(), 0.0);
+        min_time /= iter_count;
 
-            if (csvf.is_open()) {
-                std::vector<double> avg_timer(ncolls, 0);
+        double max_time = std::accumulate(max_timers.begin(), max_timers.end(), 0.0);
+        max_time /= iter_count;
 
-                for (int r = 0; r < comm.size(); ++r) {
-                    for (size_t c = 0; c < ncolls; ++c) {
-                        avg_timer[c] += all_timers[r * ncolls + c];
-                    }
-                }
+        size_t bytes = elem_count * ccl::get_datatype_size(dtype) * buf_count;
+        std::stringstream ss;
+        ss << std::right << std::fixed << std::setw(COL_WIDTH) << bytes << std::setw(COL_WIDTH)
+           << iter_count << std::setw(COL_WIDTH) << std::setprecision(COL_PRECISION) << min_time
+           << std::setw(COL_WIDTH) << std::setprecision(COL_PRECISION) << max_time
+           << std::setw(COL_WIDTH) << std::setprecision(COL_PRECISION) << total_avg_time
+           << std::setw(COL_WIDTH - 3) << std::setprecision(COL_PRECISION) << stddev
+           << std::setw(COL_WIDTH + 3);
 
-                for (size_t c = 0; c < ncolls; ++c) {
-                    avg_timer[c] /= (iter_count * comm.size());
-                }
+        if (options.show_additional_info) {
+            ss << std::right << std::fixed << std::setprecision(COL_PRECISION) << wait_avg_time;
+        }
+        ss << std::endl;
+        printf("%s", ss.str().c_str());
 
-                int i = 0;
-                for (auto cop = options.coll_names.begin(); cop != options.coll_names.end();
-                     ++cop, ++i) {
-                    csvf << comm.size() << "," << (*cop) << "," << reduction_names[op] << ","
-                         << dtype_names[dtype] << "," << ccl::get_datatype_size(dtype) << ","
-                         << elem_count << "," << buf_count << "," << avg_timer[i] << std::endl;
-                }
-                csvf.close();
-            }
+        if (!options.csv_filepath.empty()) {
+            store_to_csv(options,
+                         nranks,
+                         elem_count,
+                         iter_count,
+                         dtype,
+                         op,
+                         min_time,
+                         max_time,
+                         total_avg_time,
+                         stddev,
+                         wait_avg_time);
         }
     }
 
@@ -454,39 +525,53 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) {
     bool should_parse_datatypes = false;
     bool should_parse_reductions = false;
 
+    char short_options[1024] = { 0 };
+
+    const char* base_options = "b:e:i:w:j:n:f:t:c:p:q:o:k:l:d:r:y:xh";
+    memcpy(short_options, base_options, strlen(base_options));
+
+#ifdef CCL_ENABLE_NUMA
+    const char* numa_options = "s:";
+    memcpy(short_options + strlen(short_options), numa_options, strlen(numa_options));
+#endif /* CCL_ENABLE_NUMA */
+
 #ifdef CCL_ENABLE_SYCL
-    const char* const short_options = "b:e:i:w:n:f:t:c:p:o:a:m:u:k:l:d:r:y:h";
-#else
-    const char* const short_options = "b:e:i:w:n:f:t:c:p:o:k:l:d:r:y:h";
-#endif
+    const char* sycl_options = "a:m:u:";
+    memcpy(short_options + strlen(short_options), sycl_options, strlen(sycl_options));
+#endif /* CCL_ENABLE_SYCL */
 
     struct option getopt_options[] = {
-        { "backend", required_argument, 0, 'b' },
-        { "loop", required_argument, 0, 'e' },
-        { "iters", required_argument, 0, 'i' },
-        { "warmup_iters", required_argument, 0, 'w' },
-        { "buf_count", required_argument, 0, 'n' },
-        { "min_elem_count", required_argument, 0, 'f' },
-        { "max_elem_count", required_argument, 0, 't' },
-        { "elem_counts", required_argument, 0, 'y' },
-        { "check", required_argument, 0, 'c' },
-        { "cache", required_argument, 0, 'p' },
-    /*{ "v2i_ratio", required_argument, 0, 'v' },*/
+        { "backend", required_argument, nullptr, 'b' },
+        { "loop", required_argument, nullptr, 'e' },
+        { "iters", required_argument, nullptr, 'i' },
+        { "warmup_iters", required_argument, nullptr, 'w' },
+        { "iter_policy", required_argument, nullptr, 'j' },
+        { "buf_count", required_argument, nullptr, 'n' },
+        { "min_elem_count", required_argument, nullptr, 'f' },
+        { "max_elem_count", required_argument, nullptr, 't' },
+        { "elem_counts", required_argument, nullptr, 'y' },
+        { "check", required_argument, nullptr, 'c' },
+        { "cache", required_argument, nullptr, 'p' },
+        { "inplace", required_argument, nullptr, 'q' },
+        { "ranks_per_proc", required_argument, nullptr, 'k' },
+#ifdef CCL_ENABLE_NUMA
+        { "numa_node", required_argument, nullptr, 's' },
+#endif /* CCL_ENABLE_NUMA */
 #ifdef CCL_ENABLE_SYCL
-        { "sycl_dev_type", required_argument, 0, 'a' },
-        { "sycl_mem_type", required_argument, 0, 'm' },
-        { "sycl_usm_type", required_argument, 0, 'u' },
-#endif
-        { "ranks", required_argument, 0, 'k' },
-        { "coll", required_argument, 0, 'l' },
-        { "dtype", required_argument, 0, 'd' },
-        { "reduction", required_argument, 0, 'r' },
-        { "csv_filepath", required_argument, 0, 'o' },
-        { "help", no_argument, 0, 'h' },
-        { 0, 0, 0, 0 } // required at end of array.
+        { "sycl_dev_type", required_argument, nullptr, 'a' },
+        { "sycl_mem_type", required_argument, nullptr, 'm' },
+        { "sycl_usm_type", required_argument, nullptr, 'u' },
+#endif /* CCL_ENABLE_SYCL */
+        { "coll", required_argument, nullptr, 'l' },
+        { "dtype", required_argument, nullptr, 'd' },
+        { "reduction", required_argument, nullptr, 'r' },
+        { "csv_filepath", required_argument, nullptr, 'o' },
+        { "ext", no_argument, nullptr, 'x' },
+        { "help", no_argument, nullptr, 'h' },
+        { nullptr, 0, nullptr, 0 } // required at end of array.
     };
 
-    while ((ch = getopt_long(argc, argv, short_options, getopt_options, NULL)) != -1) {
+    while ((ch = getopt_long(argc, argv, short_options, getopt_options, nullptr)) != -1) {
         switch (ch) {
             case 'b':
                 if (set_backend(optarg, options.backend)) {
@@ -514,6 +599,12 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) {
                 else
                     errors++;
                 break;
+            case 'j':
+                if (set_iter_policy(optarg, options.iter_policy)) {
+                    PRINT("failed to parse 'iter_policy' option");
+                    errors++;
+                }
+                break;
             case 'n':
                 if (is_valid_integer_option(optarg)) {
                     options.buf_count = atoll(optarg);
@@ -549,10 +640,22 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) {
                     errors++;
                 break;
             case 'c': options.check_values = atoi(optarg); break;
-            case 'p':
-                options.cache_ops = atoi(optarg);
+            case 'p': options.cache_ops = atoi(optarg); break;
+            case 'q': options.inplace = atoi(optarg); break;
+            case 'k':
+                if (is_valid_integer_option(optarg)) {
+                    options.ranks_per_proc = atoll(optarg);
+                }
+                else
+                    errors++;
+                break;
+            case 's':
+                if (is_valid_integer_option(optarg)) {
+                    options.numa_node = atoll(optarg);
+                }
+                else
+                    errors++;
                 break;
-                /*case 'v': options.v2i_ratio = atoll(optarg); break;*/
 #ifdef CCL_ENABLE_SYCL
             case 'a':
                 if (set_sycl_dev_type(optarg, options.sycl_dev_type)) {
@@ -572,14 +675,7 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) {
                     errors++;
                 }
                 break;
-#endif
-            case 'k':
-                if (is_valid_integer_option(optarg)) {
-                    options.ranks_per_proc = atoll(optarg);
-                }
-                else
-                    errors++;
-                break;
+#endif /* CCL_ENABLE_SYCL */
             case 'l':
                 if (strcmp("all", optarg) == 0) {
                     options.coll_names = tokenize<std::string>(ALL_COLLS_LIST, ',');
@@ -598,6 +694,7 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) {
                 should_parse_reductions = true;
                 break;
             case 'o': options.csv_filepath = std::string(optarg); break;
+            case 'x': options.show_additional_info = true; break;
             case 'h': return -1;
             default:
                 PRINT("failed to parse unknown option");
@@ -623,6 +720,21 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) {
         errors++;
     }
 
+    if (options.inplace) {
+        for (auto name : options.coll_names) {
+            if (name != "allreduce") {
+                PRINT("inplace is not supported for %s yet", name.c_str());
+                errors++;
+                break;
+            }
+        }
+    }
+
+    if (options.coll_names.empty()) {
+        PRINT("empty coll list");
+        errors++;
+    }
+
     if (errors > 0) {
         PRINT("found %d errors while parsing user options", errors);
         for (int idx = 0; idx < argc; idx++) {
@@ -673,6 +785,7 @@ void print_user_options(const user_options_t& options, const ccl::communicator&
 
     std::string backend_str = find_str_val(backend_names, options.backend);
     std::string loop_str = find_str_val(loop_names, options.loop);
+    std::string iter_policy_str = find_str_val(iter_policy_names, options.iter_policy);
 
 #ifdef CCL_ENABLE_SYCL
     std::string sycl_dev_type_str = find_str_val(sycl_dev_names, options.sycl_dev_type);
@@ -687,19 +800,23 @@ void print_user_options(const user_options_t& options, const ccl::communicator&
                   "\n  loop:           %s"
                   "\n  iters:          %zu"
                   "\n  warmup_iters:   %zu"
+                  "\n  iter_policy:    %s"
                   "\n  buf_count:      %zu"
                   "\n  min_elem_count: %zu"
                   "\n  max_elem_count: %zu"
                   "\n  elem_counts:    %s"
                   "\n  check:          %d"
                   "\n  cache:          %d"
-    /*"\n  v2i_ratio:      %zu"*/
+                  "\n  inplace:        %d"
+                  "\n  ranks_per_proc: %zu"
+#ifdef CCL_ENABLE_NUMA
+                  "\n  numa_node:      %s"
+#endif /* CCL_ENABLE_NUMA */
 #ifdef CCL_ENABLE_SYCL
                   "\n  sycl_dev_type:  %s"
                   "\n  sycl_mem_type:  %s"
                   "\n  sycl_usm_type:  %s"
-#endif
-                  "\n  ranks_per_proc: %zu"
+#endif /* CCL_ENABLE_SYCL */
                   "\n  collectives:    %s"
                   "\n  datatypes:      %s"
                   "\n  reductions:     %s"
@@ -709,19 +826,25 @@ void print_user_options(const user_options_t& options, const ccl::communicator&
                   loop_str.c_str(),
                   options.iters,
                   options.warmup_iters,
+                  iter_policy_str.c_str(),
                   options.buf_count,
                   options.min_elem_count,
                   options.max_elem_count,
                   elem_counts_str.c_str(),
                   options.check_values,
                   options.cache_ops,
-    /*options.v2i_ratio,*/
+                  options.inplace,
+                  options.ranks_per_proc,
+#ifdef CCL_ENABLE_NUMA
+                  (options.numa_node == DEFAULT_NUMA_NODE)
+                      ? DEFAULT_NUMA_NODE_STR
+                      : std::to_string(options.numa_node).c_str(),
+#endif /* CCL_ENABLE_NUMA */
 #ifdef CCL_ENABLE_SYCL
                   sycl_dev_type_str.c_str(),
                   sycl_mem_type_str.c_str(),
                   sycl_usm_type_str.c_str(),
-#endif
-                  options.ranks_per_proc,
+#endif /* CCL_ENABLE_SYCL */
                   collectives_str.c_str(),
                   datatypes_str.c_str(),
                   reductions_str.c_str(),
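The option table above builds its optstring at run time so that the NUMA- and SYCL-only switches exist only in builds compiled with the matching macro. A reduced sketch of the same pattern with placeholder options (not the benchmark's real ones):

    #include <cstring>
    #include <getopt.h>

    int main(int argc, char** argv) {
        char short_options[64] = { 0 };
        std::strcat(short_options, "i:o:h"); // always-available options
    #ifdef ENABLE_EXTRA // stands in for CCL_ENABLE_NUMA / CCL_ENABLE_SYCL
        std::strcat(short_options, "x:");
    #endif
        static const struct option long_options[] = {
            { "input", required_argument, nullptr, 'i' },
            { "output", required_argument, nullptr, 'o' },
            { "help", no_argument, nullptr, 'h' },
            { nullptr, 0, nullptr, 0 } // terminator required by getopt_long
        };
        int ch;
        while ((ch = getopt_long(argc, argv, short_options, long_options, nullptr)) != -1) {
            // dispatch on ch, as parse_user_options does above
        }
        return 0;
    }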
diff --git a/examples/benchmark/include/coll.hpp b/examples/benchmark/include/coll.hpp
index 14da31617..9a8c5d4c8 100644
--- a/examples/benchmark/include/coll.hpp
+++ b/examples/benchmark/include/coll.hpp
@@ -88,12 +88,13 @@ typedef struct bench_exec_attr {
 typedef struct bench_init_attr {
     size_t buf_count;
     size_t max_elem_count;
+    int inplace;
     size_t ranks_per_proc;
+    int numa_node;
 #ifdef CCL_ENABLE_SYCL
     sycl_mem_type_t sycl_mem_type;
     sycl_usm_type_t sycl_usm_type;
 #endif
-    size_t v2i_ratio;
 } bench_init_attr;
 
 /* base polymorph collective wrapper class */
@@ -127,6 +128,10 @@ struct base_coll {
     }
 
     virtual void finalize(size_t elem_count) {
+        auto dtype = get_dtype();
+        if (dtype == ccl::datatype::float16 || dtype == ccl::datatype::bfloat16)
+            return;
+
         auto& transport = transport_data::instance();
         auto& comms = transport.get_comms();
         auto streams = transport.get_bench_streams();
@@ -181,6 +186,14 @@ struct base_coll {
         return init_attr.ranks_per_proc;
     }
 
+    int get_inplace() const noexcept {
+        return init_attr.inplace;
+    }
+
+    int get_numa_node() const noexcept {
+        return init_attr.numa_node;
+    }
+
     // first dim - per buf_count, second dim - per local rank
     std::vector<std::vector<void*>> send_bufs;
     std::vector<std::vector<void*>> recv_bufs;
diff --git a/examples/benchmark/include/config.hpp b/examples/benchmark/include/config.hpp
index 5c350b7fc..78794fa8a 100644
--- a/examples/benchmark/include/config.hpp
+++ b/examples/benchmark/include/config.hpp
@@ -15,8 +15,9 @@
 */
 #pragma once
 
-#define ALIGNMENT (4096)
-#define DTYPE     float
+#define REG_MSG_ALIGNMENT   (4096)
+#define LARGE_MSG_ALIGNMENT (2 * 1024 * 1024)
+#define LARGE_MSG_THRESHOLD (1 * 1024 * 1024)
 
 #define ALL_COLLS_LIST "allgatherv,allreduce,alltoall,alltoallv,bcast,reduce,reduce_scatter"
 
@@ -26,6 +27,9 @@
 #define ALL_REDUCTIONS_LIST            "sum,prod,min,max"
 #define ALL_REDUCTIONS_LIST_WITH_CHECK "sum"
 
+#define COL_WIDTH     (14)
+#define COL_PRECISION (2)
+
 #ifdef CCL_ENABLE_SYCL
 #define DEFAULT_BACKEND BACKEND_SYCL
 #else /* CCL_ENABLE_SYCL */
@@ -34,16 +38,19 @@
 #define DEFAULT_LOOP           LOOP_REGULAR
 #define DEFAULT_ITERS          (16)
 #define DEFAULT_WARMUP_ITERS   (16)
-#define DEFAULT_BUF_COUNT      (16)
+#define DEFAULT_ITER_POLICY    ITER_POLICY_AUTO
+#define DEFAULT_BUF_COUNT      (1)
 #define DEFAULT_MIN_ELEM_COUNT (1)
 #define DEFAULT_MAX_ELEM_COUNT (128)
-#define DEFAULT_CHECK_VALUES   (1)
+#define DEFAULT_CHECK_VALUES   (0)
 #define DEFAULT_CACHE_OPS      (1)
-#define DEFAULT_V2I_RATIO      (128)
+#define DEFAULT_INPLACE        (0)
+#define DEFAULT_RANKS_PER_PROC (1)
+#define DEFAULT_NUMA_NODE      (-1)
+#define DEFAULT_NUMA_NODE_STR  "<default>"
 #define DEFAULT_SYCL_DEV_TYPE  SYCL_DEV_GPU
 #define DEFAULT_SYCL_MEM_TYPE  SYCL_MEM_USM
 #define DEFAULT_SYCL_USM_TYPE  SYCL_USM_DEVICE
-#define DEFAULT_RANKS_PER_PROC (1)
 
 #define DEFAULT_COLL_LIST       "allreduce"
 #define DEFAULT_DTYPES_LIST     "float32"
diff --git a/examples/benchmark/include/cpu_coll.hpp b/examples/benchmark/include/cpu_coll.hpp
index 361898101..176ee958e 100644
--- a/examples/benchmark/include/cpu_coll.hpp
+++ b/examples/benchmark/include/cpu_coll.hpp
@@ -15,6 +15,10 @@
 */
 #pragma once
 
+#ifdef CCL_ENABLE_NUMA
+#include <numa.h>
+#endif /* CCL_ENABLE_NUMA */
+
 #include "coll.hpp"
 
 /* cpu-specific base implementation */
@@ -33,14 +37,15 @@ struct cpu_base_coll : base_coll, protected strategy {
 
         for (size_t rank_idx = 0; rank_idx < base_coll::get_ranks_per_proc(); rank_idx++) {
             for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
-                result = posix_memalign(
-                    (void**)&(send_bufs[idx][rank_idx]),
-                    ALIGNMENT,
-                    base_coll::get_max_elem_count() * sizeof(Dtype) * send_multiplier);
-                result = posix_memalign(
-                    (void**)&(recv_bufs[idx][rank_idx]),
-                    ALIGNMENT,
-                    base_coll::get_max_elem_count() * sizeof(Dtype) * recv_multiplier);
+                send_bufs[idx][rank_idx] =
+                    alloc_buffer(base_coll::get_max_elem_count() * sizeof(Dtype) * send_multiplier);
+                if (base_coll::get_inplace()) {
+                    recv_bufs[idx][rank_idx] = send_bufs[idx][rank_idx];
+                }
+                else {
+                    recv_bufs[idx][rank_idx] = alloc_buffer(base_coll::get_max_elem_count() *
+                                                            sizeof(Dtype) * recv_multiplier);
+                }
             }
         }
 
@@ -50,10 +55,16 @@ struct cpu_base_coll : base_coll, protected strategy {
     cpu_base_coll(bench_init_attr init_attr) : cpu_base_coll(init_attr, 1, 1) {}
 
     virtual ~cpu_base_coll() {
+        size_t send_multiplier = coll_strategy::get_send_multiplier();
+        size_t recv_multiplier = coll_strategy::get_recv_multiplier();
         for (size_t rank_idx = 0; rank_idx < base_coll::get_ranks_per_proc(); rank_idx++) {
             for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
-                free(send_bufs[idx][rank_idx]);
-                free(recv_bufs[idx][rank_idx]);
+                free_buffer(send_bufs[idx][rank_idx],
+                            base_coll::get_max_elem_count() * sizeof(Dtype) * send_multiplier);
+                if (!base_coll::get_inplace()) {
+                    free_buffer(recv_bufs[idx][rank_idx],
+                                base_coll::get_max_elem_count() * sizeof(Dtype) * recv_multiplier);
+                }
             }
         }
     }
@@ -98,8 +109,60 @@ struct cpu_base_coll : base_coll, protected strategy {
 
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
             memcpy(send_bufs[b_idx][rank_idx], fill_vector.data(), send_bytes);
+            if (!base_coll::get_inplace()) {
+                memset(recv_bufs[b_idx][rank_idx], 0, recv_bytes);
+            }
+        }
+    }
 
-            memset(recv_bufs[b_idx][rank_idx], 0, recv_bytes);
+    void* alloc_buffer(size_t bytes) {
+        void* ptr = nullptr;
+#ifdef CCL_ENABLE_NUMA
+        int numa_node = base_coll::get_numa_node();
+        if (numa_node != DEFAULT_NUMA_NODE) {
+            ASSERT(numa_available() >= 0, "libnuma is not available");
+            ASSERT(numa_node <= numa_max_node(),
+                   "requsted NUMA node %d is larger than max NUMA node %d",
+                   numa_node,
+                   numa_max_node());
+
+            long long free_bytes = 0;
+            numa_node_size64(numa_node, &free_bytes);
+            ASSERT(bytes <= (size_t)free_bytes,
+                   "no enough free memory on NUMA node %d, requested %zu, free %lld",
+                   numa_node,
+                   bytes,
+                   free_bytes);
+
+            ptr = numa_alloc_onnode(bytes, numa_node);
+            ASSERT(
+                ptr, "failed to allocate buffer with size %zu on NUMA node %d", bytes, numa_node);
+        }
+        else
+#endif /* CCL_ENABLE_NUMA */
+        {
+            size_t alignment = REG_MSG_ALIGNMENT;
+            if (bytes >= LARGE_MSG_THRESHOLD)
+                alignment = LARGE_MSG_ALIGNMENT;
+
+            int result = posix_memalign(&ptr, alignment, bytes);
+            ASSERT((result == 0) && ptr, "failed to allocate buffer with size %zu", bytes);
+        }
+
+        return ptr;
+    }
+
+    void free_buffer(void* ptr, size_t bytes) {
+#ifdef CCL_ENABLE_NUMA
+        int numa_node = base_coll::get_numa_node();
+        if (numa_node != DEFAULT_NUMA_NODE) {
+            ASSERT(numa_available() >= 0, "libnuma is not available");
+            numa_free(ptr, bytes);
+        }
+        else
+#endif /* CCL_ENABLE_NUMA */
+        {
+            free(ptr);
         }
     }
 
diff --git a/examples/benchmark/include/sycl_coll.hpp b/examples/benchmark/include/sycl_coll.hpp
index 8cb330edb..064a333eb 100644
--- a/examples/benchmark/include/sycl_coll.hpp
+++ b/examples/benchmark/include/sycl_coll.hpp
@@ -63,8 +63,14 @@ struct sycl_base_coll : base_coll, private strategy {
                 for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
                     send_bufs[idx][rank_idx] = allocator.allocate(
                         base_coll::get_max_elem_count() * send_multiplier, usm_alloc_type);
-                    recv_bufs[idx][rank_idx] = allocator.allocate(
-                        base_coll::get_max_elem_count() * recv_multiplier, usm_alloc_type);
+
+                    if (base_coll::get_inplace()) {
+                        recv_bufs[idx][rank_idx] = send_bufs[idx][rank_idx];
+                    }
+                    else {
+                        recv_bufs[idx][rank_idx] = allocator.allocate(
+                            base_coll::get_max_elem_count() * recv_multiplier, usm_alloc_type);
+                    }
                 }
             }
             else {
@@ -88,7 +94,9 @@ struct sycl_base_coll : base_coll, private strategy {
             if (base_coll::get_sycl_mem_type() == SYCL_MEM_BUF) {
                 for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
                     delete static_cast<sycl_buffer_t<Dtype>*>(send_bufs[idx][rank_idx]);
-                    delete static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[idx][rank_idx]);
+                    if (!base_coll::get_inplace()) {
+                        delete static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[idx][rank_idx]);
+                    }
                 }
             }
         }
@@ -159,7 +167,9 @@ struct sycl_base_coll : base_coll, private strategy {
                     .memcpy(send_bufs[b_idx][rank_idx], host_send_buf.data(), send_bytes)
                     .wait();
 
-                stream.get_native().memset(recv_bufs[b_idx][rank_idx], 0, recv_bytes).wait();
+                if (!base_coll::get_inplace()) {
+                    stream.get_native().memset(recv_bufs[b_idx][rank_idx], 0, recv_bytes).wait();
+                }
             }
             else {
                 stream.get_native()
diff --git a/examples/benchmark/include/types.hpp b/examples/benchmark/include/types.hpp
index b8723e3c1..9466d84af 100644
--- a/examples/benchmark/include/types.hpp
+++ b/examples/benchmark/include/types.hpp
@@ -24,16 +24,7 @@
     if (comm.rank() == 0) { \
         printf(fmt "\n", ##__VA_ARGS__); \
     }
-#endif //PRINT_BY_ROOT
-
-#define ASSERT(cond, fmt, ...) \
-    do { \
-        if (!(cond)) { \
-            printf("FAILED\n"); \
-            fprintf(stderr, "ASSERT '%s' FAILED " fmt "\n", #cond, ##__VA_ARGS__); \
-            throw std::runtime_error("ASSERT FAILED"); \
-        } \
-    } while (0)
+#endif /* PRINT_BY_ROOT */
 
 constexpr std::initializer_list<ccl::datatype> all_dtypes = {
     ccl::datatype::int8,    ccl::datatype::int32,   ccl::datatype::int64,   ccl::datatype::uint64,
@@ -42,6 +33,7 @@ constexpr std::initializer_list<ccl::datatype> all_dtypes = {
 
 typedef enum { BACKEND_HOST, BACKEND_SYCL } backend_type_t;
 typedef enum { LOOP_REGULAR, LOOP_UNORDERED } loop_type_t;
+typedef enum { ITER_POLICY_OFF, ITER_POLICY_AUTO } iter_policy_t;
 
 typedef enum { SYCL_DEV_HOST, SYCL_DEV_CPU, SYCL_DEV_GPU } sycl_dev_type_t;
 typedef enum { SYCL_MEM_USM, SYCL_MEM_BUF } sycl_mem_type_t;
@@ -53,6 +45,10 @@ std::map<backend_type_t, std::string> backend_names = { std::make_pair(BACKEND_H
 std::map<loop_type_t, std::string> loop_names = { std::make_pair(LOOP_REGULAR, "regular"),
                                                   std::make_pair(LOOP_UNORDERED, "unordered") };
 
+std::map<iter_policy_t, std::string> iter_policy_names = { std::make_pair(ITER_POLICY_OFF, "off"),
+                                                           std::make_pair(ITER_POLICY_AUTO,
+                                                                          "auto") };
+
 #ifdef CCL_ENABLE_SYCL
 std::map<sycl_dev_type_t, std::string> sycl_dev_names = { std::make_pair(SYCL_DEV_HOST, "host"),
                                                           std::make_pair(SYCL_DEV_CPU, "cpu"),
@@ -114,19 +110,21 @@ typedef struct user_options_t {
     loop_type_t loop;
     size_t iters;
     size_t warmup_iters;
+    iter_policy_t iter_policy;
     size_t buf_count;
     size_t min_elem_count;
     size_t max_elem_count;
     std::list<size_t> elem_counts;
     int check_values;
     int cache_ops;
-    size_t v2i_ratio;
+    int inplace;
+    size_t ranks_per_proc;
+    int numa_node;
 #ifdef CCL_ENABLE_SYCL
     sycl_dev_type_t sycl_dev_type;
     sycl_mem_type_t sycl_mem_type;
     sycl_usm_type_t sycl_usm_type;
 #endif
-    size_t ranks_per_proc;
     std::list<std::string> coll_names;
     std::list<std::string> dtypes;
     std::list<std::string> reductions;
@@ -135,25 +133,28 @@ typedef struct user_options_t {
     bool min_elem_count_set;
     bool max_elem_count_set;
     bool elem_counts_set;
+    bool show_additional_info;
 
     user_options_t() {
         backend = DEFAULT_BACKEND;
         loop = DEFAULT_LOOP;
         iters = DEFAULT_ITERS;
         warmup_iters = DEFAULT_WARMUP_ITERS;
+        iter_policy = DEFAULT_ITER_POLICY;
         buf_count = DEFAULT_BUF_COUNT;
         min_elem_count = DEFAULT_MIN_ELEM_COUNT;
         max_elem_count = DEFAULT_MAX_ELEM_COUNT;
         generate_counts(elem_counts, min_elem_count, max_elem_count);
         check_values = DEFAULT_CHECK_VALUES;
         cache_ops = DEFAULT_CACHE_OPS;
-        v2i_ratio = DEFAULT_V2I_RATIO;
+        inplace = DEFAULT_INPLACE;
+        ranks_per_proc = DEFAULT_RANKS_PER_PROC;
+        numa_node = DEFAULT_NUMA_NODE;
 #ifdef CCL_ENABLE_SYCL
         sycl_dev_type = DEFAULT_SYCL_DEV_TYPE;
         sycl_mem_type = DEFAULT_SYCL_MEM_TYPE;
         sycl_usm_type = DEFAULT_SYCL_USM_TYPE;
 #endif
-        ranks_per_proc = DEFAULT_RANKS_PER_PROC;
         coll_names = tokenize<std::string>(DEFAULT_COLL_LIST, ',');
         dtypes = tokenize<std::string>(DEFAULT_DTYPES_LIST, ',');
         reductions = tokenize<std::string>(DEFAULT_REDUCTIONS_LIST, ',');
@@ -162,6 +163,7 @@ typedef struct user_options_t {
         min_elem_count_set = false;
         max_elem_count_set = false;
         elem_counts_set = false;
+        show_additional_info = false;
     }
 } user_options_t;
 
diff --git a/examples/benchmark/src/allreduce/cpu_allreduce_coll.hpp b/examples/benchmark/src/allreduce/cpu_allreduce_coll.hpp
index 4c89a1c9c..a0d289aef 100644
--- a/examples/benchmark/src/allreduce/cpu_allreduce_coll.hpp
+++ b/examples/benchmark/src/allreduce/cpu_allreduce_coll.hpp
@@ -37,7 +37,7 @@ struct cpu_allreduce_coll : cpu_base_coll<Dtype, allreduce_strategy_impl> {
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
                 value = ((Dtype*)send_bufs[b_idx][rank_idx])[e_idx];
-                if (value != sbuf_expected) {
+                if (!base_coll::get_inplace() && (value != sbuf_expected)) {
                     std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx "
                               << rank_idx << ", elem_idx " << e_idx << ", expected "
                               << sbuf_expected << ", got " << value << std::endl;
diff --git a/examples/benchmark/src/allreduce/sycl_allreduce_coll.hpp b/examples/benchmark/src/allreduce/sycl_allreduce_coll.hpp
index 52b5aaf98..400b3e53c 100644
--- a/examples/benchmark/src/allreduce/sycl_allreduce_coll.hpp
+++ b/examples/benchmark/src/allreduce/sycl_allreduce_coll.hpp
@@ -69,7 +69,7 @@ struct sycl_allreduce_coll : sycl_base_coll<Dtype, allreduce_strategy_impl> {
 
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
                 value = host_send_buf[e_idx];
-                if (value != sbuf_expected) {
+                if (!base_coll::get_inplace() && (value != sbuf_expected)) {
                     std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx "
                               << rank_idx << ", elem_idx " << e_idx << ", expected "
                               << sbuf_expected << ", got " << value << std::endl;
diff --git a/examples/benchmark/src/benchmark.cpp b/examples/benchmark/src/benchmark.cpp
index 611e65598..558c87092 100644
--- a/examples/benchmark/src/benchmark.cpp
+++ b/examples/benchmark/src/benchmark.cpp
@@ -78,79 +78,92 @@ void do_regular(ccl::communicator& service_comm,
             PRINT_BY_ROOT(service_comm,
                           "#------------------------------------------------------------\n"
                           "# Benchmarking: %s\n"
-                          "# processes: %d\n"
+                          "# #processes: %d\n"
                           "#------------------------------------------------------------\n",
                           scolls.str().c_str(),
                           service_comm.size());
 
-            if (options.buf_count == 1) {
-                PRINT_BY_ROOT(service_comm, "%10s %12s %11s", "#bytes", "avg[usec]", "stddev[%]");
-            }
-            else {
-                PRINT_BY_ROOT(service_comm,
-                              "%10s %13s %18s %11s",
-                              "#bytes",
-                              "avg[usec]",
-                              "avg_per_buf[usec]",
-                              "stddev[%]");
+            if (service_comm.rank() == 0) {
+                std::stringstream ss;
+                ss << std::right << std::setw(COL_WIDTH) << "#bytes" << std::setw(COL_WIDTH)
+                   << "#repetitions" << std::setw(COL_WIDTH) << "t_min[usec]"
+                   << std::setw(COL_WIDTH) << "t_max[usec]" << std::setw(COL_WIDTH) << "t_avg[usec]"
+                   << std::setw(COL_WIDTH - 3) << "stddev[%]";
+
+                if (options.show_additional_info) {
+                    ss << std::right << std::setw(COL_WIDTH + 3) << "wait_t_avg[usec]";
+                }
+                ss << std::endl;
+                printf("%s", ss.str().c_str());
             }
 
             for (auto& count : options.elem_counts) {
-                size_t iter_count =
-                    get_iter_count(count * ccl::get_datatype_size(dtype), options.iters);
+                size_t iter_count = get_iter_count(
+                    count * ccl::get_datatype_size(dtype), options.iters, options.iter_policy);
 
-                size_t warmup_iter_count =
-                    get_iter_count(count * ccl::get_datatype_size(dtype), options.warmup_iters);
+                size_t warmup_iter_count = get_iter_count(count * ccl::get_datatype_size(dtype),
+                                                          options.warmup_iters,
+                                                          options.iter_policy);
 
                 try {
                     // we store times for each collective separately,
                     // but aggregate over buffers and iterations
-                    std::vector<double> coll_timers(colls.size(), 0);
+                    std::vector<double> total_timers(colls.size(), 0);
+                    std::vector<double> wait_timers(colls.size(), 0);
                     for (size_t coll_idx = 0; coll_idx < colls.size(); coll_idx++) {
                         auto& coll = colls[coll_idx];
-
-                        double t1 = 0, t2 = 0, t = 0;
-
-                        if (options.check_values) {
-                            coll->prepare(count);
-                        }
+                        double coll_time = 0, wait_time = 0;
 
                         ccl::barrier(service_comm);
 
                         for (size_t iter_idx = 0; iter_idx < (iter_count + warmup_iter_count);
                              iter_idx++) {
-                            t1 = when();
+                            if (options.check_values) {
+                                coll->prepare(count);
+                                ccl::barrier(service_comm);
+                            }
 
+                            double coll_start_time = when();
                             for (size_t buf_idx = 0; buf_idx < options.buf_count; buf_idx++) {
                                 match_id_stream << "coll_" << coll->name() << "_" << coll_idx
-                                                << "_count_" << count << "_buf_" << buf_idx;
+                                                << "_count_" << count << "_buf_" << buf_idx
+                                                << "_dt_" << dtype_name << "_rt_" << reduction;
                                 bench_attr.set<ccl::operation_attr_id::match_id>(
                                     ccl::string_class(match_id_stream.str()));
                                 match_id_stream.str("");
                                 coll->start(count, buf_idx, bench_attr, reqs);
                             }
+                            double coll_end_time = when();
 
+                            double wait_start_time = when();
                             for (auto& req : reqs) {
                                 req.wait();
                             }
+                            double wait_end_time = when();
                             reqs.clear();
 
-                            t2 = when();
-
                             if (iter_idx >= warmup_iter_count) {
-                                t += (t2 - t1);
+                                coll_time += coll_end_time - coll_start_time;
+                                wait_time += wait_end_time - wait_start_time;
                             }
-                        }
 
-                        if (options.check_values) {
-                            coll->finalize(count);
+                            if (options.check_values) {
+                                coll->finalize(count);
+                            }
                         }
 
-                        coll_timers[coll_idx] += t;
+                        total_timers[coll_idx] += coll_time + wait_time;
+                        wait_timers[coll_idx] += wait_time;
                     }
 
-                    print_timings(
-                        service_comm, coll_timers, options, count, iter_count, dtype, reduction_op);
+                    print_timings(service_comm,
+                                  total_timers,
+                                  wait_timers,
+                                  options,
+                                  count,
+                                  iter_count,
+                                  dtype,
+                                  reduction_op);
                 }
                 catch (const std::exception& ex) {
                     ASSERT(0, "error on count %zu, reason: %s", count, ex.what());
@@ -158,8 +171,11 @@ void do_regular(ccl::communicator& service_comm,
             }
         }
     }
+
+    PRINT_BY_ROOT(service_comm, "\n# All done\n");
 }
 
+/* TODO: merge with do_regular */
 void do_unordered(ccl::communicator& service_comm,
                   bench_exec_attr& bench_attr,
                   coll_list_t& all_colls,
@@ -487,14 +503,17 @@ int main(int argc, char* argv[]) {
 
     ccl::communicator& service_comm = transport.get_service_comm();
 
+    print_user_options(options, service_comm);
+
     init_attr.buf_count = options.buf_count;
     init_attr.max_elem_count = options.max_elem_count;
     init_attr.ranks_per_proc = options.ranks_per_proc;
+    init_attr.inplace = options.inplace;
+    init_attr.numa_node = options.numa_node;
 #ifdef CCL_ENABLE_SYCL
     init_attr.sycl_mem_type = options.sycl_mem_type;
     init_attr.sycl_usm_type = options.sycl_usm_type;
-#endif
-    init_attr.v2i_ratio = options.v2i_ratio;
+#endif /* CCL_ENABLE_SYCL */
 
     try {
         create_all_colls(init_attr, options, colls);
@@ -510,14 +529,6 @@ int main(int argc, char* argv[]) {
     bench_exec_attr bench_attr{};
     bench_attr.init_all();
 
-    print_user_options(options, service_comm);
-
-    if (options.coll_names.empty()) {
-        PRINT_BY_ROOT(service_comm, "empty coll list");
-        print_help_usage(argv[0]);
-        return -1;
-    }
-
     ccl::barrier(service_comm);
 
     switch (options.loop) {
@@ -525,15 +536,26 @@ int main(int argc, char* argv[]) {
             // open and truncate CSV file if csv-output is requested
             if (service_comm.rank() == 0 && !options.csv_filepath.empty()) {
                 std::ofstream csvf;
-                csvf.open(options.csv_filepath, std::ios::trunc);
+                csvf.open(options.csv_filepath, std::ofstream::out | std::ofstream::trunc);
                 if (!csvf.is_open()) {
                     std::cerr << "Cannot open CSV file for writing: " << options.csv_filepath
                               << std::endl;
-                    return -1;
+                    abort();
                 }
                 // write header (column names)
-                csvf << "#ranks,collective,reduction,type,typesize,#elements/buffer,#buffers,time"
-                     << std::endl;
+                csvf << "#ranks,"
+                     << "collective,"
+                     << "reduction,"
+                     << "dtype,"
+                     << "dtype_size,"
+                     << "#elements/buffer,"
+                     << "#buffers,"
+                     << "#repetitions,"
+                     << "t_min[usec],"
+                     << "t_max[usec],"
+                     << "t_avg[usec],"
+                     << "stddev[%],"
+                     << "wait_t_avg[usec]" << std::endl;
                 csvf.close();
             }
             ccl::barrier(service_comm);
@@ -549,6 +571,7 @@ int main(int argc, char* argv[]) {
         default: ASSERT(0, "unknown loop %d", options.loop); break;
     }
 
+    colls.clear();
     transport.reset_comms();
 
     return 0;
diff --git a/examples/benchmark/src/declarations.hpp b/examples/benchmark/src/declarations.hpp
old mode 100755
new mode 100644
diff --git a/examples/common/CMakeLists.txt b/examples/common/CMakeLists.txt
index ee14b6fc3..c95d44bc6 100644
--- a/examples/common/CMakeLists.txt
+++ b/examples/common/CMakeLists.txt
@@ -25,6 +25,7 @@ foreach(src ${sources})
     target_link_libraries(${executable} PUBLIC rt)
     target_link_libraries(${executable} PUBLIC m)
     target_link_libraries(${executable} PUBLIC dl)
+    target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release_mt/)
     target_link_libraries(${executable} PUBLIC mpi)
     install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/common OPTIONAL)
 endforeach()
diff --git a/examples/cpu/CMakeLists.txt b/examples/cpu/CMakeLists.txt
index 58099643c..403a409d4 100644
--- a/examples/cpu/CMakeLists.txt
+++ b/examples/cpu/CMakeLists.txt
@@ -25,7 +25,7 @@ foreach(src ${sources})
     target_link_libraries(${executable} PUBLIC dl)
     target_link_libraries(${executable} PUBLIC pthread)
     target_link_libraries(${executable} PUBLIC stdc++)
-    target_link_libraries(${executable} PRIVATE m)
+    target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release_mt/)
     target_link_libraries(${executable} PUBLIC mpi)
     install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/cpu OPTIONAL)
 endforeach()
diff --git a/examples/cpu/allgatherv.cpp b/examples/cpu/allgatherv.cpp
index 4c1d0fa44..d34604e26 100644
--- a/examples/cpu/allgatherv.cpp
+++ b/examples/cpu/allgatherv.cpp
@@ -132,8 +132,12 @@ int main() {
              run_collective_vector(
                  "warmup_allgatherv_vector", send_buf, recv_bufs, recv_counts, comm, attr);
 
+             ccl::string_class regular_match_id = std::to_string(msg_count);
+             ccl::string_class vector_match_id = regular_match_id + std::string("_vector");
+             attr.set<ccl::operation_attr_id::match_id>(regular_match_id);
              attr.set<ccl::operation_attr_id::to_cache>(true);
              run_collective("persistent_allgatherv", send_buf, recv_buf, recv_counts, comm, attr);
+             attr.set<ccl::operation_attr_id::match_id>(vector_match_id);
              run_collective_vector(
                  "persistent_allgatherv_vector", send_buf, recv_bufs, recv_counts, comm, attr);
 
diff --git a/examples/cpu/custom_allreduce.cpp b/examples/cpu/custom_allreduce.cpp
index 970730dc6..54a24ed4a 100644
--- a/examples/cpu/custom_allreduce.cpp
+++ b/examples/cpu/custom_allreduce.cpp
@@ -31,8 +31,8 @@ typedef void (*fill_fn_t)(void*, size_t, size_t);
 typedef int (*check_fn_t)(void*, size_t, expected_fn_t);
 
 #define RUN_COLLECTIVE(start_cmd, fill_fn, check_fn, expected_fn, name) \
-    t = 0; \
     do { \
+        double t1 = 0, t2 = 0, t = 0; \
         for (int iter_idx = 0; iter_idx < ITERS; iter_idx++) { \
             global_match_id = match_id; \
             fill_fn(send_buf, MSG_SIZE_COUNT, rank + 1); \
diff --git a/examples/external_launcher/run.sh b/examples/external_launcher/run.sh
index aae636f69..797cb4ecd 100755
--- a/examples/external_launcher/run.sh
+++ b/examples/external_launcher/run.sh
@@ -16,7 +16,8 @@
 #
 
 BASENAME=`basename $0 .sh`
-TIMEOUT=600
+
+cmd_timeout=600
 
 echo_log()
 {
@@ -142,6 +143,20 @@ parse_arguments()
     echo_log "-----------------------------------------------------------"
 }
 
+run_cmd()
+{
+    host="$1"
+    cmd="$2"
+    timeout_prefix="$3"
+
+    if [[ "${host}" == "localhost" ]]
+    then
+        eval ${timeout_prefix} $cmd&
+    else
+        ${timeout_prefix} ssh ${host} $cmd&
+    fi
+}
+
 cleanup_hosts()
 {
     hostlist=$1
@@ -151,7 +166,7 @@ cleanup_hosts()
     do
         echo "host ${host}"
         cmd="killall -9 external_launcher run_binary.sh"
-        ssh ${host} $cmd
+        run_cmd ${host} "${cmd}"
     done
 }
 
@@ -180,7 +195,8 @@ run_binary()
         fi
     elif [ "$kvs_mode" == "ip_port" ]
     then
-        kvs_param=`ssh ${hostlist[0]} hostname -I | awk '{print $1}'`
+        cmd="hostname -I | sed -e 's/\s.*$//'"
+        kvs_param=`run_cmd ${hostlist[0]} "${cmd}"`
     fi
 
     host_idx=0
@@ -203,7 +219,8 @@ run_binary()
                 cmd="${cmd} -mv ${I_MPI_ROOT}/env/vars.sh"
             fi
 
-            timeout -k $((TIMEOUT))s $((TIMEOUT))s ssh ${host} $cmd&
+            timeout_prefix="timeout -k $((cmd_timeout))s $((cmd_timeout))s"
+            run_cmd ${host} "${cmd}" "${timeout_prefix}"
         done
         host_idx=$((host_idx + 1))
     done
@@ -253,9 +270,9 @@ run()
         run_binary $mode
 
         exec_time="$((`date +%s`-$exec_time))"
-        if [ "$exec_time" -ge "$TIMEOUT" ];
+        if [ "$exec_time" -ge "$cmd_timeout" ];
         then
-             echo -e "${RED}FAILED: Timeout ($exec_time > $TIMEOUT)${NC}"
+             echo -e "${RED}FAILED: Timeout ($exec_time > $cmd_timeout)${NC}"
              exit 1
         fi
     done
diff --git a/examples/include/base.hpp b/examples/include/base.hpp
index c507fcbc2..2421fb309 100644
--- a/examples/include/base.hpp
+++ b/examples/include/base.hpp
@@ -26,8 +26,11 @@
 #include <mpi.h>
 #include <stdexcept>
 #include <stdio.h>
+#include <chrono>
+#include <sys/syscall.h>
 #include <sys/time.h>
 #include <vector>
+#include <unistd.h>
 
 #ifdef CCL_ENABLE_SYCL
 #include <CL/sycl.hpp>
@@ -35,6 +37,8 @@ using namespace cl::sycl;
 using namespace cl::sycl::access;
 #endif /* CCL_ENABLE_SYCL */
 
+#define GETTID() syscall(SYS_gettid)
+
 #define ITERS                (16)
 #define COLL_ROOT            (0)
 #define MSG_SIZE_COUNT       (6)
@@ -51,7 +55,15 @@ using namespace cl::sycl::access;
     do { \
         if (!(cond)) { \
             printf("FAILED\n"); \
-            fprintf(stderr, "ASSERT '%s' FAILED " fmt "\n", #cond, ##__VA_ARGS__); \
+            fprintf(stderr, \
+                    "(%ld): %s:%s:%d: ASSERT '%s' FAILED: " fmt "\n", \
+                    GETTID(), \
+                    __FILE__, \
+                    __FUNCTION__, \
+                    __LINE__, \
+                    #cond, \
+                    ##__VA_ARGS__); \
+            fflush(stderr); \
             throw std::runtime_error("ASSERT FAILED"); \
         } \
     } while (0)
@@ -93,27 +105,13 @@ using namespace cl::sycl::access;
         PRINT_BY_ROOT(comm, "PASSED"); \
     } while (0)
 
-double t1, t2, t;
-
-double when(void) {
-    struct timeval tv;
-    static struct timeval tv_base;
-    static int is_first = 1;
-
-    if (gettimeofday(&tv, NULL)) {
-        perror("gettimeofday");
-        return 0;
-    }
-
-    if (is_first) {
-        tv_base = tv;
-        is_first = 0;
-    }
-
-    return (double)(tv.tv_sec - tv_base.tv_sec) * 1.0e6 + (double)(tv.tv_usec - tv_base.tv_usec);
+inline double when(void) {
+    auto time = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration<double, std::micro>(time.time_since_epoch());
+    return duration.count();
 }
 
-void mpi_finalize() {
+inline void mpi_finalize() {
     int is_finalized = 0;
     MPI_Finalized(&is_finalized);
 
diff --git a/examples/include/base_utils.hpp b/examples/include/base_utils.hpp
index b62566f53..5dd68a1dc 100644
--- a/examples/include/base_utils.hpp
+++ b/examples/include/base_utils.hpp
@@ -112,7 +112,7 @@ void ccl_tuple_for_each_indexed(functor f, const FunctionArgs&... args) {
 namespace utils {
 
 template <typename T>
-void str_to_array(const char* input, std::vector<T>& output, char delimiter) {
+inline void str_to_array(const char* input, std::vector<T>& output, char delimiter) {
     if (!input) {
         return;
     }
@@ -126,7 +126,7 @@ void str_to_array(const char* input, std::vector<T>& output, char delimiter) {
     }
 }
 template <>
-void str_to_array(const char* input, std::vector<std::string>& output, char delimiter) {
+inline void str_to_array(const char* input, std::vector<std::string>& output, char delimiter) {
     std::string processes_input(input);
 
     processes_input.erase(std::remove_if(processes_input.begin(),
diff --git a/examples/include/bf16.hpp b/examples/include/bf16.hpp
index 778b491b1..72ab78955 100644
--- a/examples/include/bf16.hpp
+++ b/examples/include/bf16.hpp
@@ -87,7 +87,7 @@ void convert_fp32_to_bf16(const void* src, void* dst) __attribute__((target("avx
 void convert_fp32_to_bf16(const void* src, void* dst) {
 #ifdef CCL_BF16_AVX512BF_COMPILER
     if (is_avx512bf_enabled()) {
-        _mm256_storeu_si256((__m256i*)(dst), _mm512_cvtneps_pbh(_mm512_loadu_ps(src)));
+        _mm256_storeu_si256((__m256i*)(dst), (__m256i)_mm512_cvtneps_pbh(_mm512_loadu_ps(src)));
     }
     else
 #endif
diff --git a/examples/include/sycl_base.hpp b/examples/include/sycl_base.hpp
index 5fac895d9..c7861d2a7 100644
--- a/examples/include/sycl_base.hpp
+++ b/examples/include/sycl_base.hpp
@@ -73,7 +73,7 @@ inline bool check_sycl_usm(queue& q, usm::alloc alloc_type) {
     return ret;
 }
 
-std::string get_preferred_gpu_platform_name() {
+inline std::string get_preferred_gpu_platform_name() {
     std::string filter;
     std::string result;
 
@@ -130,7 +130,7 @@ std::string get_preferred_gpu_platform_name() {
     return result;
 }
 
-std::vector<sycl::device> create_sycl_gpu_devices() {
+inline std::vector<sycl::device> create_sycl_gpu_devices() {
     constexpr char dev_prefix[] = "-- ";
     constexpr char sub_dev_prefix[] = "---- ";
 
@@ -138,7 +138,8 @@ std::vector<sycl::device> create_sycl_gpu_devices() {
     auto plaform_list = sycl::platform::get_platforms();
     auto preferred_platform_name = get_preferred_gpu_platform_name();
 
-    cout << "preferred platform: [" << preferred_platform_name << "]\n";
+    std::stringstream ss;
+    ss << "preferred platform: [" << preferred_platform_name << "]\n";
 
     for (const auto& platform : plaform_list) {
         auto platform_name = platform.get_info<sycl::info::platform::name>();
@@ -146,7 +147,7 @@ std::vector<sycl::device> create_sycl_gpu_devices() {
         if (platform_name.compare(preferred_platform_name) != 0)
             continue;
 
-        cout << "platform: [" << platform_name << "]\n";
+        ss << "platform: [" << platform_name << "]\n";
 
         auto device_list = platform.get_devices();
 
@@ -154,7 +155,7 @@ std::vector<sycl::device> create_sycl_gpu_devices() {
             auto device_name = device.get_info<cl::sycl::info::device::name>();
 
             if (!device.is_gpu()) {
-                cout << dev_prefix << "device [" << device_name << "] is not GPU, skipping\n";
+                ss << dev_prefix << "device [" << device_name << "] is not GPU, skipping\n";
                 continue;
             }
 
@@ -164,9 +165,9 @@ std::vector<sycl::device> create_sycl_gpu_devices() {
                           part_props.end(),
                           info::partition_property::partition_by_affinity_domain) ==
                 part_props.end()) {
-                cout << dev_prefix << "device [" << device_name
-                     << "] does not support partition by affinity domain"
-                     << ", use root device\n";
+                ss << dev_prefix << "device [" << device_name
+                   << "] does not support partition by affinity domain"
+                   << ", use root device\n";
                 result.push_back(device);
                 continue;
             }
@@ -178,16 +179,16 @@ std::vector<sycl::device> create_sycl_gpu_devices() {
                           part_affinity_domains.end(),
                           info::partition_affinity_domain::next_partitionable) ==
                 part_affinity_domains.end()) {
-                cout << dev_prefix << "device [" << device_name
-                     << "] does not support next_partitionable affinity domain"
-                     << ", use root device\n";
+                ss << dev_prefix << "device [" << device_name
+                   << "] does not support next_partitionable affinity domain"
+                   << ", use root device\n";
                 result.push_back(device);
                 continue;
             }
 
-            cout << dev_prefix << "device [" << device_name << "] should provide "
-                 << device.template get_info<info::device::partition_max_sub_devices>()
-                 << " sub-devices\n";
+            ss << dev_prefix << "device [" << device_name << "] should provide "
+               << device.template get_info<info::device::partition_max_sub_devices>()
+               << " sub-devices\n";
 
             auto sub_devices =
                 device.create_sub_devices<info::partition_property::partition_by_affinity_domain>(
@@ -195,19 +196,19 @@ std::vector<sycl::device> create_sycl_gpu_devices() {
 
             if (sub_devices.empty()) {
                 /* TODO: remove when SYCL/L0 sub-devices will be supported */
-                cout << dev_prefix << "device [" << device_name << "] does not provide sub-devices"
-                     << ", use root device\n";
+                ss << dev_prefix << "device [" << device_name << "] does not provide sub-devices"
+                   << ", use root device\n";
                 result.push_back(device);
                 continue;
             }
 
-            cout << dev_prefix << "device [" << device_name << "] provides " << sub_devices.size()
-                 << " sub-devices\n";
+            ss << dev_prefix << "device [" << device_name << "] provides " << sub_devices.size()
+               << " sub-devices\n";
             result.insert(result.end(), sub_devices.begin(), sub_devices.end());
 
             for (auto idx = 0; idx < sub_devices.size(); idx++) {
-                cout << sub_dev_prefix << "sub-device " << idx << ": ["
-                     << sub_devices[idx].get_info<cl::sycl::info::device::name>() << "]\n";
+                ss << sub_dev_prefix << "sub-device " << idx << ": ["
+                   << sub_devices[idx].get_info<cl::sycl::info::device::name>() << "]\n";
             }
         }
     }
@@ -216,13 +217,14 @@ std::vector<sycl::device> create_sycl_gpu_devices() {
         throw std::runtime_error("no GPU devices found");
     }
 
-    cout << "found: " << result.size() << " GPU device(s)\n";
+    ss << "found: " << result.size() << " GPU device(s)\n";
+    printf("%s", ss.str().c_str());
 
     return result;
 }
 
-std::vector<sycl::queue> create_sycl_queues(const std::string& device_type,
-                                            const std::vector<int>& ranks) {
+inline std::vector<sycl::queue> create_sycl_queues(const std::string& device_type,
+                                                   const std::vector<int>& ranks) {
     std::vector<sycl::device> devices;
 
     try {
@@ -338,7 +340,7 @@ inline bool create_sycl_queue(int argc, char* argv[], int rank, queue& q) {
     }
 }
 
-bool handle_exception(queue& q) {
+inline bool handle_exception(queue& q) {
     try {
         q.wait_and_throw();
     }
@@ -349,7 +351,7 @@ bool handle_exception(queue& q) {
     return true;
 }
 
-usm::alloc usm_alloc_type_from_string(const string& str) {
+inline usm::alloc usm_alloc_type_from_string(const string& str) {
     const map<string, usm::alloc> names{ {
         { "host", usm::alloc::host },
         { "device", usm::alloc::device },
@@ -368,7 +370,7 @@ usm::alloc usm_alloc_type_from_string(const string& str) {
     return it->second;
 }
 
-std::pair<usm::alloc, std::string> take_usm_type(const int argc, char* str_type) {
+inline std::pair<usm::alloc, std::string> take_usm_type(const int argc, char* str_type) {
     std::map<usm::alloc, std::string> map_usm_type;
     auto usm_alloc_type = usm::alloc::shared;
     auto str_usm_alloc_type = "shared";
@@ -404,7 +406,11 @@ struct buf_allocator {
         else if (alloc_type == usm::alloc::shared)
             ptr = aligned_alloc_shared<T>(alignment, count, q);
         else
-            throw std::runtime_error(string(__PRETTY_FUNCTION__) + "unexpected alloc_type");
+            throw std::runtime_error(string(__PRETTY_FUNCTION__) + " - unexpected alloc_type");
+
+        if (!ptr) {
+            throw std::runtime_error(string(__PRETTY_FUNCTION__) + " - failed to allocate buffer");
+        }
 
         auto it = memory_storage.find(ptr);
         if (it != memory_storage.end()) {
@@ -415,9 +421,10 @@ struct buf_allocator {
 
         auto pointer_type = sycl::get_pointer_type(ptr, q.get_context());
         if (pointer_type != alloc_type)
-            throw std::runtime_error(
-                string(__PRETTY_FUNCTION__) + "pointer_type " + std::to_string((int)pointer_type) +
-                " doesn't match with requested " + std::to_string((int)alloc_type));
+            throw std::runtime_error(string(__PRETTY_FUNCTION__) + " - pointer_type " +
+                                     std::to_string((int)pointer_type) +
+                                     " doesn't match with requested " +
+                                     std::to_string((int)alloc_type));
 
         return ptr;
     }
diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt
index e336d8be1..4bff71065 100644
--- a/examples/sycl/CMakeLists.txt
+++ b/examples/sycl/CMakeLists.txt
@@ -25,7 +25,7 @@ foreach(src ${sources})
     target_link_libraries(${executable} PUBLIC rt)
     target_link_libraries(${executable} PUBLIC m)
     target_link_libraries(${executable} PRIVATE ccl)
-    target_link_libraries(${executable} PRIVATE m)
+    target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release_mt/)
     target_link_libraries(${executable} PUBLIC mpi)
     target_link_libraries(${executable} PRIVATE ${COMPUTE_BACKEND_TARGET_NAME})
     install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/sycl OPTIONAL)
diff --git a/examples/sycl/sycl_allgatherv_custom_usm_test.cpp b/examples/sycl/sycl_allgatherv_custom_usm_test.cpp
index 9380a064f..935c6f1c0 100644
--- a/examples/sycl/sycl_allgatherv_custom_usm_test.cpp
+++ b/examples/sycl/sycl_allgatherv_custom_usm_test.cpp
@@ -98,24 +98,14 @@ int main(int argc, char *argv[]) {
         });
     });
 
-    /* create dependency vector */
-    vector<ccl::event> events;
-    // events.push_back(ccl::create_event(e));
-
-    if (!handle_exception(q))
-        return -1;
+    /* do not wait for kernel completion; pass its event as a dependency for the operation */
+    vector<ccl::event> deps;
+    deps.push_back(ccl::create_event(e));
 
     /* invoke allgatherv */
     auto attr = ccl::create_operation_attr<ccl::allgatherv_attr>();
-    ccl::allgatherv(send_buf,
-                    send_count,
-                    recv_buf,
-                    recv_counts,
-                    ccl::datatype::int32,
-                    comm,
-                    stream,
-                    attr,
-                    events)
+    ccl::allgatherv(
+        send_buf, send_count, recv_buf, recv_counts, ccl::datatype::int32, comm, stream, attr, deps)
         .wait();
 
     /* open recv_buf and check its correctness on the device side */
diff --git a/examples/sycl/sycl_allgatherv_test.cpp b/examples/sycl/sycl_allgatherv_test.cpp
index 7ac3a48b0..3b2c9f764 100644
--- a/examples/sycl/sycl_allgatherv_test.cpp
+++ b/examples/sycl/sycl_allgatherv_test.cpp
@@ -87,23 +87,18 @@ int main(int argc, char *argv[]) {
     }
 
     /* open send_buf and modify it on the device side */
-    auto e = q.submit([&](auto &h) {
+    q.submit([&](auto &h) {
         accessor send_buf_acc(send_buf, h, write_only);
         h.parallel_for(count, [=](auto id) {
             send_buf_acc[id] += 1;
         });
     });
 
-    /* create dependency vector */
-    vector<ccl::event> events;
-    //events.push_back(ccl::create_event(e));
-
     if (!handle_exception(q))
         return -1;
 
     /* invoke allgatherv */
-    auto attr = ccl::create_operation_attr<ccl::allgatherv_attr>();
-    ccl::allgatherv(send_buf, count, recv_buf, recv_counts, comm, stream, attr, events).wait();
+    ccl::allgatherv(send_buf, count, recv_buf, recv_counts, comm, stream).wait();
 
     /* open recv_buf and check its correctness on the device side */
     q.submit([&](auto &h) {
diff --git a/examples/sycl/sycl_allgatherv_usm_test.cpp b/examples/sycl/sycl_allgatherv_usm_test.cpp
index a6013485a..3e95c3d1c 100644
--- a/examples/sycl/sycl_allgatherv_usm_test.cpp
+++ b/examples/sycl/sycl_allgatherv_usm_test.cpp
@@ -90,16 +90,13 @@ int main(int argc, char *argv[]) {
         });
     });
 
-    /* create dependency vector */
-    vector<ccl::event> events;
-    // events.push_back(ccl::create_event(e));
-
-    if (!handle_exception(q))
-        return -1;
+    /* do not wait for kernel completion; pass its event as a dependency for the operation */
+    vector<ccl::event> deps;
+    deps.push_back(ccl::create_event(e));
 
     /* invoke allgatherv */
     auto attr = ccl::create_operation_attr<ccl::allgatherv_attr>();
-    ccl::allgatherv(send_buf, count, recv_buf, recv_counts, comm, stream, attr, events).wait();
+    ccl::allgatherv(send_buf, count, recv_buf, recv_counts, comm, stream, attr, deps).wait();
 
     /* open recv_buf and check its correctness on the device side */
     q.submit([&](auto &h) {
diff --git a/examples/sycl/sycl_allreduce_inplace_usm_test.cpp b/examples/sycl/sycl_allreduce_inplace_usm_test.cpp
index 4c0605ba2..55bfd3fd2 100644
--- a/examples/sycl/sycl_allreduce_inplace_usm_test.cpp
+++ b/examples/sycl/sycl_allreduce_inplace_usm_test.cpp
@@ -80,11 +80,13 @@ int main(int argc, char *argv[]) {
         });
     });
 
-    if (!handle_exception(q))
-        return -1;
+    /* do not wait for kernel completion; pass its event as a dependency for the operation */
+    vector<ccl::event> deps;
+    deps.push_back(ccl::create_event(e));
 
     /* invoke allreduce */
-    ccl::allreduce(buf, buf, count, ccl::reduction::sum, comm, stream).wait();
+    auto attr = ccl::create_operation_attr<ccl::allreduce_attr>();
+    ccl::allreduce(buf, buf, count, ccl::reduction::sum, comm, stream, attr, deps).wait();
 
     /* open recv_buf and check its correctness on the device side */
     buffer<int> check_buf(count);
diff --git a/examples/sycl/sycl_allreduce_test.cpp b/examples/sycl/sycl_allreduce_test.cpp
index 6200b3c33..1f94bfbdb 100644
--- a/examples/sycl/sycl_allreduce_test.cpp
+++ b/examples/sycl/sycl_allreduce_test.cpp
@@ -81,8 +81,7 @@ int main(int argc, char *argv[]) {
         });
     });
 
-    if (!handle_exception(q))
-        return -1;
+    /* do not wait for kernel completion; the dependency is resolved by sycl::buffer */
 
     /* invoke allreduce */
     ccl::allreduce(send_buf, recv_buf, count, ccl::reduction::sum, comm, stream).wait();
diff --git a/examples/sycl/sycl_allreduce_usm_test.cpp b/examples/sycl/sycl_allreduce_usm_test.cpp
index e2fceb44a..5065a3d39 100644
--- a/examples/sycl/sycl_allreduce_usm_test.cpp
+++ b/examples/sycl/sycl_allreduce_usm_test.cpp
@@ -82,11 +82,13 @@ int main(int argc, char *argv[]) {
         });
     });
 
-    if (!handle_exception(q))
-        return -1;
+    /* do not wait for kernel completion; pass its event as a dependency for the operation */
+    vector<ccl::event> deps;
+    deps.push_back(ccl::create_event(e));
 
     /* invoke allreduce */
-    ccl::allreduce(send_buf, recv_buf, count, ccl::reduction::sum, comm, stream).wait();
+    auto attr = ccl::create_operation_attr<ccl::allreduce_attr>();
+    ccl::allreduce(send_buf, recv_buf, count, ccl::reduction::sum, comm, stream, attr, deps).wait();
 
     /* open recv_buf and check its correctness on the device side */
     buffer<int> check_buf(count);
diff --git a/examples/sycl/sycl_alltoall_usm_test.cpp b/examples/sycl/sycl_alltoall_usm_test.cpp
index 8fa744a97..ecb75538f 100644
--- a/examples/sycl/sycl_alltoall_usm_test.cpp
+++ b/examples/sycl/sycl_alltoall_usm_test.cpp
@@ -75,18 +75,20 @@ int main(int argc, char *argv[]) {
     auto recv_buf = allocator.allocate(count * size, usm_alloc_type);
 
     /* open buffers and modify them on the device side */
-    q.submit([&](auto &h) {
+    auto e = q.submit([&](auto &h) {
         h.parallel_for(count * size, [=](auto id) {
             send_buf[id] = id / count + 1;
             recv_buf[id] = -1;
         });
     });
 
-    if (!handle_exception(q))
-        return -1;
+    /* do not wait for kernel completion; pass its event as a dependency for the operation */
+    vector<ccl::event> deps;
+    deps.push_back(ccl::create_event(e));
 
     /* invoke alltoall */
-    ccl::alltoall(send_buf, recv_buf, count, comm, stream).wait();
+    auto attr = ccl::create_operation_attr<ccl::alltoall_attr>();
+    ccl::alltoall(send_buf, recv_buf, count, comm, stream, attr, deps).wait();
 
     /* open recv_buf and check its correctness on the device side */
     buffer<int> check_buf(count * size);
diff --git a/examples/sycl/sycl_alltoallv_usm_test.cpp b/examples/sycl/sycl_alltoallv_usm_test.cpp
index 5f23ad973..36b2b2d1d 100644
--- a/examples/sycl/sycl_alltoallv_usm_test.cpp
+++ b/examples/sycl/sycl_alltoallv_usm_test.cpp
@@ -78,18 +78,20 @@ int main(int argc, char *argv[]) {
     vector<size_t> recv_counts(size, count);
 
     /* open buffers and modify them on the device side */
-    q.submit([&](auto &h) {
+    auto e = q.submit([&](auto &h) {
         h.parallel_for(count * size, [=](auto id) {
             send_buf[id] = id / count + 1;
             recv_buf[id] = -1;
         });
     });
 
-    if (!handle_exception(q))
-        return -1;
+    /* do not wait for kernel completion; pass its event as a dependency for the operation */
+    vector<ccl::event> deps;
+    deps.push_back(ccl::create_event(e));
 
     /* invoke alltoall */
-    ccl::alltoallv(send_buf, send_counts, recv_buf, recv_counts, comm, stream).wait();
+    auto attr = ccl::create_operation_attr<ccl::alltoallv_attr>();
+    ccl::alltoallv(send_buf, send_counts, recv_buf, recv_counts, comm, stream, attr, deps).wait();
 
     /* open recv_buf and check its correctness on the device side */
     buffer<int> check_buf(count * size);
diff --git a/examples/sycl/sycl_broadcast_test.cpp b/examples/sycl/sycl_broadcast_test.cpp
index 1976afdd5..d03d6e33d 100644
--- a/examples/sycl/sycl_broadcast_test.cpp
+++ b/examples/sycl/sycl_broadcast_test.cpp
@@ -63,24 +63,21 @@ int main(int argc, char *argv[]) {
     /* create buffers */
     buffer<int> buf(count);
 
-    {
+    if (rank == root_rank) {
         /* open buf and initialize it on the host side */
         host_accessor send_buf_acc(buf, write_only);
         for (i = 0; i < count; i++) {
-            if (rank == root_rank)
-                send_buf_acc[i] = rank + 10;
-            else
-                send_buf_acc[i] = 0;
+            send_buf_acc[i] = 10;
         }
-    }
 
-    /* open buf and modify it on the device side */
-    q.submit([&](auto &h) {
-        accessor send_buf_acc(buf, h, write_only);
-        h.parallel_for(count, [=](auto id) {
-            send_buf_acc[id] += 1;
+        /* open buf and modify it on the device side */
+        q.submit([&](auto &h) {
+            accessor send_buf_acc(buf, h, write_only);
+            h.parallel_for(count, [=](auto id) {
+                send_buf_acc[id] += 1;
+            });
         });
-    });
+    }
 
     if (!handle_exception(q))
         return -1;
@@ -92,7 +89,7 @@ int main(int argc, char *argv[]) {
     q.submit([&](auto &h) {
         accessor recv_buf_acc(buf, h, write_only);
         h.parallel_for(count, [=](auto id) {
-            if (recv_buf_acc[id] != root_rank + 11) {
+            if (recv_buf_acc[id] != 11) {
                 recv_buf_acc[id] = -1;
             }
         });
diff --git a/examples/sycl/sycl_broadcast_usm_test.cpp b/examples/sycl/sycl_broadcast_usm_test.cpp
index 78b95af82..1f47abfc8 100644
--- a/examples/sycl/sycl_broadcast_usm_test.cpp
+++ b/examples/sycl/sycl_broadcast_usm_test.cpp
@@ -74,31 +74,29 @@ int main(int argc, char *argv[]) {
     /* create buffers */
     auto buf = allocator.allocate(count, usm_alloc_type);
 
-    /* open buffers and modify them on the device side */
-    q.submit([&](auto &h) {
-        h.parallel_for(count, [=](auto id) {
-            if (rank == root_rank) {
-                buf[id] = root_rank + 10;
-            }
-            else {
-                buf[id] = 0;
-            }
-            buf[id] += 1;
+    /* do not wait for kernel completion; pass its event as a dependency for the operation */
+    vector<ccl::event> deps;
+
+    if (rank == root_rank) {
+        /* open buffers and modify them on the device side */
+        auto e = q.submit([&](auto &h) {
+            h.parallel_for(count, [=](auto id) {
+                buf[id] = 10;
+            });
         });
-    });
-
-    if (!handle_exception(q))
-        return -1;
+        deps.push_back(ccl::create_event(e));
+    }
 
     /* invoke broadcast */
-    ccl::broadcast(buf, count, root_rank, comm, stream).wait();
+    auto attr = ccl::create_operation_attr<ccl::broadcast_attr>();
+    ccl::broadcast(buf, count, root_rank, comm, stream, attr, deps).wait();
 
     /* open buf and check its correctness on the device side */
     buffer<int> check_buf(count * size);
     q.submit([&](auto &h) {
         accessor check_buf_acc(check_buf, h, write_only);
         h.parallel_for(count, [=](auto id) {
-            if (buf[id] != root_rank + 11) {
+            if (buf[id] != 10) {
                 check_buf_acc[id] = -1;
             }
         });
diff --git a/include/oneapi/ccl/communicator.hpp b/include/oneapi/ccl/communicator.hpp
index 194403d2c..b2046f235 100644
--- a/include/oneapi/ccl/communicator.hpp
+++ b/include/oneapi/ccl/communicator.hpp
@@ -102,7 +102,7 @@ class communicator final : public ccl_api_base_movable<communicator,
     template <class... attr_val_type>
     stream create_stream(attr_val_type&&... avs) {
         // return stream::create_stream_from_attr(get_device(), get_context(), std::forward<attr_val_type>(avs)...);
-        throw;
+        throw ccl::unsupported("API", "create_stream");
     }
 
     communicator split(const comm_split_attr& attr);
diff --git a/include/oneapi/ccl/config.h.in b/include/oneapi/ccl/config.h.in
index 3980df027..1f2ad4e5f 100644
--- a/include/oneapi/ccl/config.h.in
+++ b/include/oneapi/ccl/config.h.in
@@ -46,6 +46,3 @@
 
 /* Auto-generated configuration settings for multi GPU support*/
 #cmakedefine MULTI_GPU_SUPPORT
-
-/* Configuration setting for truncate v/s RNE rounding mode */
-#cmakedefine CCL_GPU_BF16_TRUNCATE
diff --git a/include/oneapi/ccl/environment.hpp b/include/oneapi/ccl/environment.hpp
index b087bffcc..3956dff89 100644
--- a/include/oneapi/ccl/environment.hpp
+++ b/include/oneapi/ccl/environment.hpp
@@ -179,12 +179,6 @@ class environment {
         return event::create_from_native(native_event);
     }
 
-    template <class event_handle_type,
-              class = typename std::enable_if<is_event_supported<event_handle_type>()>::type>
-    event create_event(event_handle_type& native_event_handle, event::context_t& context) {
-        return event::create_from_native(native_event_handle, context);
-    }
-
     /******************** STREAM ********************/
 
     template <class native_stream_type,
diff --git a/include/oneapi/ccl/exception.hpp b/include/oneapi/ccl/exception.hpp
index a5d03b400..6de627142 100644
--- a/include/oneapi/ccl/exception.hpp
+++ b/include/oneapi/ccl/exception.hpp
@@ -44,7 +44,7 @@ class exception : public std::exception {
         msg = std::string("oneCCL: ") + std::string(info);
     }
 
-    const char *what() const noexcept {
+    const char *what() const noexcept override {
         return msg.c_str();
     }
 };
diff --git a/include/oneapi/ccl/native_device_api/interop_utils.hpp b/include/oneapi/ccl/native_device_api/interop_utils.hpp
index 946babad0..02bd77083 100644
--- a/include/oneapi/ccl/native_device_api/interop_utils.hpp
+++ b/include/oneapi/ccl/native_device_api/interop_utils.hpp
@@ -36,6 +36,7 @@ using assoc_result = std::tuple<usm_support_mode, const void*, std::string>;
 enum assoc_result_index { SUPPORT_MODE = 0, POINTER_VALUE, ERROR_CAUSE };
 
 #if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
+// TODO: move to src
 assoc_result check_assoc_device_memory(const void* mem,
                                        const ccl::unified_device_type::ccl_native_t& device,
                                        const ccl::unified_context_type::ccl_native_t& ctx);
diff --git a/include/oneapi/ccl/native_device_api/l0/base.hpp b/include/oneapi/ccl/native_device_api/l0/base.hpp
index 74fa79d85..16cb6942d 100644
--- a/include/oneapi/ccl/native_device_api/l0/base.hpp
+++ b/include/oneapi/ccl/native_device_api/l0/base.hpp
@@ -34,6 +34,7 @@ namespace native {
  * Base RAII L0 handles wrapper
  * supports the serialize/deserialize concept
  */
+
 template <class handle_type, class resource_owner, class cl_context>
 class cl_base {
 public:
diff --git a/include/oneapi/ccl/native_device_api/l0/context.hpp b/include/oneapi/ccl/native_device_api/l0/context.hpp
index 24ae091be..4a49bdc96 100644
--- a/include/oneapi/ccl/native_device_api/l0/context.hpp
+++ b/include/oneapi/ccl/native_device_api/l0/context.hpp
@@ -25,6 +25,8 @@ struct ccl_device_platform;
 struct ccl_device_driver;
 struct ccl_subdevice;
 struct ccl_device;
+struct ccl_event_pool_holder;
+class ccl_event_pool;
 
 // TODO not thread-safe!!!
 struct ccl_context : public cl_base<ze_context_handle_t, ccl_device_platform, ccl_context>,
@@ -42,7 +44,10 @@ struct ccl_context : public cl_base<ze_context_handle_t, ccl_device_platform, cc
     template <class elem_t>
     using host_memory_ptr = std::shared_ptr<host_memory<elem_t>>;
 
+    using ccl_event_pool_ptr = std::shared_ptr<ccl_event_pool>;
+
     ccl_context(handle_t h, owner_ptr_t&& platform);
+    ~ccl_context();
 
     static const ze_host_mem_alloc_desc_t& get_default_host_alloc_desc();
 
@@ -71,12 +76,22 @@ struct ccl_context : public cl_base<ze_context_handle_t, ccl_device_platform, cc
         host_free_memory(static_cast<void*>(mem_handle));
     }
 
+    // event pool
+    ccl_event_pool_ptr create_event_pool(std::initializer_list<ccl_device*> devices,
+                                         const ze_event_pool_desc_t& descr);
+    std::vector<std::shared_ptr<ccl_event_pool>> get_shared_event_pool(
+        std::initializer_list<ccl_device*> devices = {});
+    std::vector<std::shared_ptr<ccl_event_pool>> get_shared_event_pool(
+        std::initializer_list<ccl_device*> devices = {}) const;
+
 private:
     void* host_alloc_memory(size_t bytes_count,
                             size_t alignment,
                             const ze_host_mem_alloc_desc_t& host_desc);
 
     void host_free_memory(void* mem_handle);
+
+    std::shared_ptr<ccl_event_pool_holder> pool_holder;
 };
 
 class context_array_t {
diff --git a/include/oneapi/ccl/native_device_api/l0/declarations.hpp b/include/oneapi/ccl/native_device_api/l0/declarations.hpp
index 07689c7a3..68e9eaa30 100644
--- a/include/oneapi/ccl/native_device_api/l0/declarations.hpp
+++ b/include/oneapi/ccl/native_device_api/l0/declarations.hpp
@@ -23,6 +23,7 @@
 
 #include "oneapi/ccl/native_device_api/l0/context.hpp"
 #include "oneapi/ccl/native_device_api/l0/device.hpp"
+#include "oneapi/ccl/native_device_api/l0/event_pool.hpp"
 #include "oneapi/ccl/native_device_api/l0/subdevice.hpp"
 #include "oneapi/ccl/native_device_api/l0/driver.hpp"
 #include "oneapi/ccl/native_device_api/l0/platform.hpp"
diff --git a/include/oneapi/ccl/native_device_api/l0/device.hpp b/include/oneapi/ccl/native_device_api/l0/device.hpp
index 0995d3e4f..d4f7e24cc 100644
--- a/include/oneapi/ccl/native_device_api/l0/device.hpp
+++ b/include/oneapi/ccl/native_device_api/l0/device.hpp
@@ -50,9 +50,9 @@ struct ccl_device : public cl_base<ze_device_handle_t, ccl_device_driver, ccl_co
     using const_subdevice_ptr = std::shared_ptr<const ccl_subdevice>;
     using sub_devices_container_type = std::map<ccl::index_type, subdevice_ptr>;
 
-    template <class elem_t>
+    template <class elem_t = uint8_t>
     using device_memory = memory<elem_t, ccl_device, ccl_context>;
-    template <class elem_t>
+    template <class elem_t = uint8_t>
     using device_memory_ptr = std::shared_ptr<memory<elem_t, ccl_device, ccl_context>>;
 
     using device_ipc_memory = ipc_memory<ccl_device, ccl_context>;
@@ -65,7 +65,7 @@ struct ccl_device : public cl_base<ze_device_handle_t, ccl_device_driver, ccl_co
     using device_cmd_list = cmd_list<ccl_device, ccl_context>;
     using device_module = module<ccl_device, ccl_context>;
     using device_module_ptr = std::shared_ptr<device_module>;
-    using device_event = event<ccl_device, ccl_context>;
+    using device_event = event;
     using indexed_handles = indexed_storage<handle_t>;
 
     ccl_device(handle_t h, owner_ptr_t&& parent, std::weak_ptr<ccl_context_holder>&& ctx);
diff --git a/include/oneapi/ccl/native_device_api/l0/event_pool.hpp b/include/oneapi/ccl/native_device_api/l0/event_pool.hpp
new file mode 100644
index 000000000..343415301
--- /dev/null
+++ b/include/oneapi/ccl/native_device_api/l0/event_pool.hpp
@@ -0,0 +1,103 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include <atomic>
+#include <mutex> // TODO: use shared mutex
+
+#include "oneapi/ccl/native_device_api/l0/base.hpp"
+#include "oneapi/ccl/native_device_api/l0/primitives.hpp"
+#include "oneapi/ccl/native_device_api/l0/utils.hpp"
+
+namespace native {
+struct ccl_context;
+struct ccl_device;
+class ccl_event_pool;
+
+class event_pool_array_t {
+public:
+    using value_type = std::vector<std::shared_ptr<ccl_event_pool>>;
+    using context_array_accessor = detail::unique_accessor<std::mutex, value_type>;
+    using const_context_array_accessor = detail::unique_accessor<std::mutex, const value_type>;
+
+    context_array_accessor access();
+    const_context_array_accessor access() const;
+
+private:
+    mutable std::mutex m;
+    value_type event_pools;
+};
+
+struct ccl_event_pool_holder {
+    ze_event_pool_handle_t get();
+    std::shared_ptr<ccl_event_pool> emplace(const std::initializer_list<ccl_device*>& devices,
+                                            std::shared_ptr<ccl_event_pool> pool);
+
+    std::vector<std::shared_ptr<ccl_event_pool>> get_event_pool_storage(
+        std::initializer_list<ccl_device*> devices);
+    std::vector<std::shared_ptr<ccl_event_pool>> get_event_pool_storage(
+        std::initializer_list<ccl_device*> devices) const;
+
+    void on_delete(ze_event_pool_handle_t pool_handle, ze_context_handle_t& ctx);
+
+private:
+    mutable std::mutex m;
+    std::map<const ccl_device*, event_pool_array_t> contexts_pool;
+};
+
+class ccl_event_pool : public cl_base<ze_event_pool_handle_t, ccl_event_pool_holder, ccl_context>,
+                       public std::enable_shared_from_this<ccl_event_pool> {
+public:
+    using base = cl_base<ze_event_pool_handle_t, ccl_event_pool_holder, ccl_context>;
+    using handle_t = base::handle_t;
+    using base::owner_t;
+    using base::owner_ptr_t;
+    using base::context_t;
+    using base::context_ptr_t;
+    using event_ptr = std::shared_ptr<event>;
+
+    static const ze_event_desc_t& get_default_event_desc() {
+        static ze_event_desc_t def = {
+            ZE_STRUCTURE_TYPE_EVENT_DESC,
+            nullptr,
+            0, // index
+            0, // no additional memory/cache coherency required on signal
+            ZE_EVENT_SCOPE_FLAG_HOST // ensure memory coherency across device and Host after event completes
+        };
+        return def;
+    }
+
+    ccl_event_pool(const ze_event_pool_desc_t& descr,
+                   handle_t h,
+                   owner_ptr_t&& holder,
+                   context_ptr_t&& ctx);
+    ~ccl_event_pool();
+
+    std::shared_ptr<ccl_event_pool> get_ptr() {
+        return this->shared_from_this();
+    }
+
+    event_ptr create_event(const ze_event_desc_t& descr = ccl_event_pool::get_default_event_desc());
+    void on_delete(ze_event_handle_t event_handle, ze_context_handle_t& ctx);
+
+    const ze_event_pool_desc_t& get_pool_description() const;
+    size_t get_allocated_events() const;
+
+private:
+    ze_event_pool_desc_t pool_description;
+    std::atomic<size_t> allocated_event_count;
+};
+} // namespace native
diff --git a/include/oneapi/ccl/native_device_api/l0/primitives.hpp b/include/oneapi/ccl/native_device_api/l0/primitives.hpp
index a22b2f782..303e15750 100644
--- a/include/oneapi/ccl/native_device_api/l0/primitives.hpp
+++ b/include/oneapi/ccl/native_device_api/l0/primitives.hpp
@@ -23,6 +23,8 @@
 namespace native {
 
 struct ccl_device_platform;
+class ccl_event_pool;
+struct ccl_context;
 
 std::string to_string(const ze_result_t result);
 std::string to_string(ze_memory_type_t type);
@@ -68,8 +70,18 @@ template <class elem_t,
 struct memory;
 */
 
-template <class resource_owner, class cl_context>
-using event = cl_base<ze_event_handle_t, resource_owner, cl_context>;
+struct event : private cl_base<ze_event_handle_t, ccl_event_pool, ccl_context> {
+    using base = cl_base<ze_event_handle_t, ccl_event_pool, ccl_context>;
+    using base::get_owner;
+    using base::get_ctx;
+    using base::handle;
+
+    using base::base;
+
+    bool wait(uint64_t nanosec = std::numeric_limits<uint64_t>::max()) const;
+    ze_result_t status() const;
+    void signal();
+};
 
 template <class elem_t, class resource_owner, class cl_context>
 struct memory /*<elem_t, resource_owner, cl_context>*/ : private cl_base<elem_t*,
@@ -80,6 +92,8 @@ struct memory /*<elem_t, resource_owner, cl_context>*/ : private cl_base<elem_t*
     using base::get_ctx;
     using base::handle;
 
+    using event_t = event;
+
     memory(elem_t* h,
            size_t count,
            std::weak_ptr<resource_owner>&& owner,
@@ -98,15 +112,16 @@ struct memory /*<elem_t, resource_owner, cl_context>*/ : private cl_base<elem_t*
     void enqueue_write_sync(const elem_t* src, int n);
 
     // async
-    queue_fence<resource_owner, cl_context> enqueue_write_async(
-        const std::vector<elem_t>& src,
-        queue<resource_owner, cl_context>& queue);
+    event_t enqueue_write_async(const std::vector<elem_t>& src,
+                                queue<resource_owner, cl_context>& queue);
+    event_t enqueue_write_async(typename std::vector<elem_t>::const_iterator first,
+                                typename std::vector<elem_t>::const_iterator last);
     template <int N>
-    queue_fence<resource_owner, cl_context> enqueue_write_async(
-        const std::array<elem_t, N>& src,
-        queue<resource_owner, cl_context>& queue);
-    queue_fence<resource_owner, cl_context>
-    enqueue_write_async(const elem_t* src, size_t n, queue<resource_owner, cl_context>& queue);
+    event_t enqueue_write_async(const std::array<elem_t, N>& src,
+                                queue<resource_owner, cl_context>& queue);
+    event_t enqueue_write_async(const elem_t* src,
+                                size_t n,
+                                queue<resource_owner, cl_context>& queue);
 
     // sync memory-copy read
     std::vector<elem_t> enqueue_read_sync(size_t requested_size = 0) const;
diff --git a/include/oneapi/ccl/native_device_api/l0/primitives_impl.hpp b/include/oneapi/ccl/native_device_api/l0/primitives_impl.hpp
index 0ccbf3bfb..76137fae4 100644
--- a/include/oneapi/ccl/native_device_api/l0/primitives_impl.hpp
+++ b/include/oneapi/ccl/native_device_api/l0/primitives_impl.hpp
@@ -27,6 +27,7 @@ namespace native {
 
 struct ccl_device;
 namespace detail {
+
 void copy_memory_sync_unsafe(void* dst,
                              const void* src,
                              size_t size,
@@ -37,6 +38,19 @@ void copy_memory_sync_unsafe(void* dst,
                              size_t size,
                              std::weak_ptr<ccl_context> ctx_weak,
                              std::shared_ptr<ccl_context> ctx);
+
+event copy_memory_async_unsafe(void* dst,
+                               const void* src,
+                               size_t size,
+                               std::weak_ptr<ccl_device> device_weak,
+                               std::shared_ptr<ccl_context> ctx,
+                               queue<ccl_device, ccl_context>& q);
+event copy_memory_async_unsafe(void* dst,
+                               const void* src,
+                               size_t size,
+                               std::weak_ptr<ccl_context> ctx_weak,
+                               std::shared_ptr<ccl_context> ctx,
+                               queue<ccl_device, ccl_context>& q);
 } // namespace detail
 
 template <TEMPLATE_DECL_ARG>
@@ -66,7 +80,82 @@ template <TEMPLATE_DECL_ARG>
 size_t memory<TEMPLATE_DEF_ARG>::size() const noexcept {
     return count() * sizeof(elem_t);
 }
+/*
+// async operations
+template <TEMPLATE_DECL_ARG>
+typename memory<TEMPLATE_DEF_ARG>::event_t
+memory<TEMPLATE_DEF_ARG>::enqueue_write_async(
+        const std::vector<elem_t>& src,
+        queue<resource_owner, cl_context>& queue)
+{
+    if (count() < src.size()) {
+        throw std::length_error(
+            std::string(__PRETTY_FUNCTION__) +
+            "\nCannot process 'enqueue_write_async', because memory has not enough size" +
+            ", expected: " + std::to_string(count()) +
+            ", requested: " + std::to_string(src.size()));
+    }
+
+    TODO
+}
 
+template <TEMPLATE_DECL_ARG>
+typename memory<TEMPLATE_DEF_ARG>::event_t
+memory<TEMPLATE_DEF_ARG>::enqueue_write_async(
+        typename std::vector<elem_t>::const_iterator first,
+        typename std::vector<elem_t>::const_iterator last,
+        queue<resource_owner, cl_context>& queue)
+{
+    size_t requested_size = std::distance(first, last);
+    if (count() < requested_size) {
+        throw std::length_error(
+            std::string(__PRETTY_FUNCTION__) +
+            "\nCannot process 'enqueue_write_async', because memory has not enough size" +
+            ", expected: " + std::to_string(count()) +
+            ", requested range size: " + std::to_string(requested_size));
+    }
+    TODO
+}
+
+template <TEMPLATE_DECL_ARG>
+template <int N>
+typename memory<TEMPLATE_DEF_ARG>::event_t
+memory<TEMPLATE_DEF_ARG>::enqueue_write_async(
+        const std::array<elem_t, N>& src,
+        queue<resource_owner, cl_context>& queue)
+{
+    if (count() < N) {
+        throw std::length_error(
+            std::string(__PRETTY_FUNCTION__) +
+            "\nCannot process 'enqueue_write_async', because memory has not enough size" +
+            ", expected: " + std::to_string(count()) +
+            ", requested array count: " + std::to_string(N));
+    }
+
+    TODO
+}
+
+template <TEMPLATE_DECL_ARG>
+typename memory<TEMPLATE_DEF_ARG>::event_t
+memory<TEMPLATE_DEF_ARG>::enqueue_write_async(const elem_t* src, size_t src_elem_count, queue<resource_owner, cl_context>& queue)
+{
+    if (!src) {
+        throw std::invalid_argument(
+            std::string(__PRETTY_FUNCTION__) +
+            "\nCannot process 'enqueue_write_async', because 'src' is 'nullptr'");
+    }
+
+    if (count() < src_elem_count) {
+        throw std::length_error(
+            std::string(__PRETTY_FUNCTION__) +
+            "\nCannot process 'enqueue_write_async', because memory has not enough size" +
+            ", expected: " + std::to_string(count()) +
+            ", requested c-array count: " + std::to_string(src_elem_count * sizeof(elem_t)));
+    }
+
+    TODO
+}
+*/
+// sync operations
 template <TEMPLATE_DECL_ARG>
 void memory<TEMPLATE_DEF_ARG>::enqueue_write_sync(const std::vector<elem_t>& src) {
     if (count() < src.size()) {
@@ -135,11 +224,11 @@ void memory<TEMPLATE_DEF_ARG>::enqueue_write_sync(const elem_t* src, size_t src_
             "\nCannot process 'enqueue_write_sync', because 'src' is 'nullptr'");
     }
 
-    if (count() < src_elem_count * sizeof(elem_t)) {
+    if (size() < src_elem_count * sizeof(elem_t)) {
         throw std::length_error(
             std::string(__PRETTY_FUNCTION__) +
             "\nCannot process 'enqueue_write_sync', because memory has not enough size" +
-            ", expected: " + std::to_string(count()) +
+            ", expected: " + std::to_string(size()) +
             ", requested: " + std::to_string(src_elem_count * sizeof(elem_t)));
     }
 
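The `count()` -> `size()` change in the hunk above fixes a units mismatch: `src_elem_count * sizeof(elem_t)` is a byte count, so it must be compared against `size()` (bytes), not `count()` (elements). For example, with `elem_t = float` and `count() == 8`, writing 8 elements requests 32 bytes; the old check (`8 < 32`) threw spuriously, while the new one (`32 < 32`) correctly accepts the write.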
diff --git a/include/oneapi/ccl/types_policy.hpp b/include/oneapi/ccl/types_policy.hpp
index 40e3c9228..416a65dfe 100644
--- a/include/oneapi/ccl/types_policy.hpp
+++ b/include/oneapi/ccl/types_policy.hpp
@@ -15,6 +15,8 @@
 */
 #pragma once
 
+#include <memory>
+
 namespace ccl {
 template <class impl_t>
 class non_copyable {
diff --git a/mpi/bin/hydra_bstrap_proxy b/mpi/bin/hydra_bstrap_proxy
deleted file mode 100755
index bb46998fe..000000000
Binary files a/mpi/bin/hydra_bstrap_proxy and /dev/null differ
diff --git a/mpi/bin/hydra_pmi_proxy b/mpi/bin/hydra_pmi_proxy
deleted file mode 100755
index 2b6be410f..000000000
Binary files a/mpi/bin/hydra_pmi_proxy and /dev/null differ
diff --git a/mpi/bin/mpiexec b/mpi/bin/mpiexec
deleted file mode 120000
index 482a69296..000000000
--- a/mpi/bin/mpiexec
+++ /dev/null
@@ -1 +0,0 @@
-mpiexec.hydra
\ No newline at end of file
diff --git a/mpi/bin/mpiexec.hydra b/mpi/bin/mpiexec.hydra
deleted file mode 100755
index 317678932..000000000
Binary files a/mpi/bin/mpiexec.hydra and /dev/null differ
diff --git a/mpi/etc/tuning_clx-ap_shm.dat b/mpi/etc/tuning_clx-ap_shm.dat
deleted file mode 100755
index 35991b50a..000000000
Binary files a/mpi/etc/tuning_clx-ap_shm.dat and /dev/null differ
diff --git a/mpi/lib/libmpi.so b/mpi/lib/libmpi.so
deleted file mode 120000
index 9e4b9f431..000000000
--- a/mpi/lib/libmpi.so
+++ /dev/null
@@ -1 +0,0 @@
-libmpi.so.12.0
\ No newline at end of file
diff --git a/mpi/lib/libmpi.so.12 b/mpi/lib/libmpi.so.12
deleted file mode 120000
index 5a0e391d4..000000000
--- a/mpi/lib/libmpi.so.12
+++ /dev/null
@@ -1 +0,0 @@
-libmpi.so.12.0.0
\ No newline at end of file
diff --git a/mpi/lib/libmpi.so.12.0 b/mpi/lib/libmpi.so.12.0
deleted file mode 120000
index 5a0e391d4..000000000
--- a/mpi/lib/libmpi.so.12.0
+++ /dev/null
@@ -1 +0,0 @@
-libmpi.so.12.0.0
\ No newline at end of file
diff --git a/mpi/lib/libmpicxx.so b/mpi/lib/libmpicxx.so
deleted file mode 120000
index 9e27e2a69..000000000
--- a/mpi/lib/libmpicxx.so
+++ /dev/null
@@ -1 +0,0 @@
-libmpicxx.so.12.0.0
\ No newline at end of file
diff --git a/mpi/lib/libmpicxx.so.12 b/mpi/lib/libmpicxx.so.12
deleted file mode 120000
index 9e27e2a69..000000000
--- a/mpi/lib/libmpicxx.so.12
+++ /dev/null
@@ -1 +0,0 @@
-libmpicxx.so.12.0.0
\ No newline at end of file
diff --git a/mpi/lib/libmpicxx.so.12.0 b/mpi/lib/libmpicxx.so.12.0
deleted file mode 120000
index 9e27e2a69..000000000
--- a/mpi/lib/libmpicxx.so.12.0
+++ /dev/null
@@ -1 +0,0 @@
-libmpicxx.so.12.0.0
\ No newline at end of file
diff --git a/mpi/lib/libmpifort.so b/mpi/lib/libmpifort.so
deleted file mode 120000
index 3dc64470d..000000000
--- a/mpi/lib/libmpifort.so
+++ /dev/null
@@ -1 +0,0 @@
-libmpifort.so.12.0.0
\ No newline at end of file
diff --git a/mpi/lib/libmpifort.so.12 b/mpi/lib/libmpifort.so.12
deleted file mode 120000
index 3dc64470d..000000000
--- a/mpi/lib/libmpifort.so.12
+++ /dev/null
@@ -1 +0,0 @@
-libmpifort.so.12.0.0
\ No newline at end of file
diff --git a/mpi/lib/libmpifort.so.12.0 b/mpi/lib/libmpifort.so.12.0
deleted file mode 120000
index 3dc64470d..000000000
--- a/mpi/lib/libmpifort.so.12.0
+++ /dev/null
@@ -1 +0,0 @@
-libmpifort.so.12.0.0
\ No newline at end of file
diff --git a/ofi/bin/fi_info b/ofi/bin/fi_info
deleted file mode 100755
index 1dc3a2707..000000000
Binary files a/ofi/bin/fi_info and /dev/null differ
diff --git a/ofi/lib/libfabric.so b/ofi/lib/libfabric.so
deleted file mode 120000
index 878a6164e..000000000
--- a/ofi/lib/libfabric.so
+++ /dev/null
@@ -1 +0,0 @@
-libfabric.so.1
\ No newline at end of file
diff --git a/ofi/lib/libfabric.so.1 b/ofi/lib/libfabric.so.1
deleted file mode 100755
index eea8f4068..000000000
Binary files a/ofi/lib/libfabric.so.1 and /dev/null differ
diff --git a/ofi/lib/prov/libpsmx2-fi.so b/ofi/lib/prov/libpsmx2-fi.so
deleted file mode 100755
index b2afe721e..000000000
Binary files a/ofi/lib/prov/libpsmx2-fi.so and /dev/null differ
diff --git a/ofi/lib/prov/librxm-fi.so b/ofi/lib/prov/librxm-fi.so
deleted file mode 100755
index 6fe75fd9c..000000000
Binary files a/ofi/lib/prov/librxm-fi.so and /dev/null differ
diff --git a/ofi/lib/prov/libshm-fi.so b/ofi/lib/prov/libshm-fi.so
deleted file mode 100755
index 71c0c931f..000000000
Binary files a/ofi/lib/prov/libshm-fi.so and /dev/null differ
diff --git a/ofi/lib/prov/libsockets-fi.so b/ofi/lib/prov/libsockets-fi.so
deleted file mode 100755
index d548a6979..000000000
Binary files a/ofi/lib/prov/libsockets-fi.so and /dev/null differ
diff --git a/ofi/lib/prov/libtcp-fi.so b/ofi/lib/prov/libtcp-fi.so
deleted file mode 100755
index f9873e173..000000000
Binary files a/ofi/lib/prov/libtcp-fi.so and /dev/null differ
diff --git a/ofi/lib/prov/libverbs-fi.so b/ofi/lib/prov/libverbs-fi.so
deleted file mode 100755
index 471d97946..000000000
Binary files a/ofi/lib/prov/libverbs-fi.so and /dev/null differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index ecf3e257c..356e0b49a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-#builds ccl
+# builds CCL
 
 set (EXTENSIONS_SRC)
 
@@ -36,6 +36,7 @@ list (APPEND EXTENSIONS_SRC
                     native_device_api/l0/base.cpp
                     native_device_api/l0/device.cpp
                     native_device_api/l0/context.cpp
+                    native_device_api/l0/event_pool.cpp
                     native_device_api/l0/subdevice.cpp
                     native_device_api/l0/driver.cpp
                     native_device_api/l0/export.cpp
@@ -80,16 +81,17 @@ list (APPEND EXTENSIONS_SRC
                     common/comm/l0/topology/ring/process_group_ring_creator.cpp
                     common/comm/l0/topology/topology_construction_utils.cpp
 
-                    common/comm/l0/context/scaling_ctx/ipc_ctx_session.cpp
-                    common/comm/l0/context/scaling_ctx/ipc_ctx_utils.cpp
-                    common/comm/l0/context/scaling_ctx/ipc_session_key.cpp
+                    common/comm/l0/context/scale/ipc/ipc_ctx_session.cpp
+                    common/comm/l0/context/scale/ipc/ipc_ctx_utils.cpp
+                    common/comm/l0/context/scale/ipc/ipc_session_key.cpp
 
-                    common/comm/l0/context/scaling_ctx/observer_ctx_session.cpp
-                    common/comm/l0/context/scaling_ctx/observer_session_key.cpp
+                    common/comm/l0/context/scale/base/base_session.cpp
+                    common/comm/l0/context/scale/scale_out/scale_out_session.cpp
 
                     common/comm/l0/gpu_comm_attr.cpp
                     common/comm/l0/modules/base_entry_module.cpp
-                    common/comm/l0/modules/modules_source_data.cpp)
+                    common/comm/l0/modules/modules_source_data.cpp
+                    common/comm/l0/modules/kernel_utils.cpp)
 endif(MULTI_GPU_SUPPORT)
 
 set(CCL_SRC
@@ -117,7 +119,6 @@ set(CCL_SRC
     ccl_empty_stream.cpp
     native_device_api/sycl_l0/export.cpp
     native_device_api/empty/export.cpp
-    atl/atl.cpp
     atl/atl_wrapper.cpp
     atl/mpi/atl_mpi.cpp
     atl/ofi/atl_ofi.cpp
@@ -147,10 +148,12 @@ set(CCL_SRC
     coll/ccl_reduce_scatter_op_attr.cpp
     coll/ccl_sparse_allreduce_op_attr.cpp
     coll/ccl_barrier_op_attr.cpp
+    coll/coll_param.cpp
     coll/algorithms/allgatherv.cpp
     coll/algorithms/allreduce/allreduce.cpp
     coll/algorithms/allreduce/allreduce_2d.cpp
     coll/algorithms/allreduce/allreduce_rma.cpp
+    coll/algorithms/algorithm_utils.cpp
     coll/algorithms/alltoall.cpp
     coll/algorithms/alltoallv.cpp
     coll/algorithms/barrier.cpp
@@ -173,19 +176,21 @@ set(CCL_SRC
     comp/comp.cpp
     comp/fp16/fp16.cpp
     comp/fp16/fp16_intrisics.cpp
+    hwloc/hwloc_wrapper.c
     sched/sched.cpp
     sched/extra_sched.cpp
     sched/master_sched.cpp
     sched/sched_base.cpp
     sched/cache/cache.cpp
     sched/cache/key.cpp
+    sched/queue/flow_control.cpp
     sched/queue/strict_queue.cpp
     sched/queue/queue.cpp
     sched/entry/coll/coll_entry.cpp
     sched/entry/coll/coll_entry_helper.cpp
+    sched/entry/copy/copy_helper.cpp
     sched/entry/entry.cpp
     sched/entry/factory/chunked_entry_factory.cpp
-    sched/entry/sycl_entry_helper.cpp
     exec/exec.cpp
     exec/thread/base_thread.cpp
     exec/thread/listener.cpp
@@ -220,18 +225,21 @@ set(CCL_SRC
     ${EXTENSIONS_SRC})
 
 list(APPEND CCL_INC_DIRS
-                 ${PROJECT_SOURCE_DIR}/include
-                 ${PROJECT_SOURCE_DIR}/mpi/include
-                 ${LIBFABRIC_INCLUDE_DIR}
-                 ${PROJECT_SOURCE_DIR}/src
-                 ${PROJECT_SOURCE_DIR}/src/atl)
-
-message(STATUS "CCL_INC_DIRS: ${CCL_INC_DIRS}")
-message(STATUS "oneCCL lib LIBFABRIC_LIB_DIR: ${LIBFABRIC_LIB_DIR}")
-message(STATUS "oneCCL lib LIBFABRIC_INCLUDE_DIR: ${LIBFABRIC_INCLUDE_DIR}")
-
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
+     ${PROJECT_SOURCE_DIR}/include
+     ${MPI_INCLUDE_DIR}
+     ${LIBFABRIC_INCLUDE_DIR}
+     ${HWLOC_INCLUDE_DIR}
+     ${PROJECT_SOURCE_DIR}/src
+     ${PROJECT_SOURCE_DIR}/src/atl)
+
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SRC_C_FLAGS} -pthread")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SRC_CXX_FLAGS} -pthread")
+set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${SRC_SHARED_LINKER_FLAGS}")
+
+message(STATUS "SRC C_FLAGS: ${CMAKE_C_FLAGS}")
+message(STATUS "SRC CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
+message(STATUS "SRC SHARED_LINKER_FLAGS: ${CMAKE_SHARED_LINKER_FLAGS}")
+message(STATUS "SRC INC_DIRS: ${CCL_INC_DIRS}")
 
 #special library that holds objects only
 add_library(ccl-objects OBJECT ${CCL_SRC})
@@ -243,19 +251,27 @@ if(COMPUTE_BACKEND_TARGET_NAME)
 endif()
 
 # add library search directory
-link_directories(${PROJECT_SOURCE_DIR}/mpi/lib)
+link_directories(${MPI_LIB_DIR})
 link_directories(${LIBFABRIC_LIB_DIR})
 
-#shared library
+# shared library
 add_library(ccl SHARED $<TARGET_OBJECTS:ccl-objects>)
 target_include_directories(ccl PUBLIC ${CCL_INC_DIRS})
 
 # link with release_mt libmpi.so for oneAPI Base toolkit
-# libccl.so -> cpu_icc/cpu_gpu_dpcpp -> lib -> latest -> ccl -> mpi -> ... 
+# libccl.so -> cpu_icc/cpu_gpu_dpcpp -> lib -> latest -> ccl -> mpi -> ...
 set(ONEAPI_IMPI_RPATH "'$ORIGIN'/../../../../mpi/latest/lib/release_mt/")
 set_target_properties(ccl PROPERTIES LINK_FLAGS "-Wl,-rpath,${ONEAPI_IMPI_RPATH}")
 
-target_link_libraries(ccl PUBLIC dl pthread ${EXTERNAL_LIBS} ${COMPUTE_BACKEND_TARGET_NAME} fabric mpi)
+target_link_libraries(ccl PUBLIC
+                      dl
+                      pthread
+                      fabric
+                      mpi
+                      ${HWLOC_LIB_DIR}/libhwloc.a
+                      ${EXTERNAL_LIBS}
+                      ${COMPUTE_BACKEND_TARGET_NAME})
+
 if (NOT LIB_SO_VERSION AND NOT LIB_MAJOR_VERSION)
         set_target_properties(ccl PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CCL_BUILD_DIR})
 else()
@@ -264,39 +280,40 @@ endif()
 
 install(TARGETS ccl LIBRARY DESTINATION ${CCL_INSTALL_LIB})
 install(FILES
-    "../cmake/FindOpenCL.cmake"
-    "../cmake/Findlevel_zero.cmake"
-    "../cmake/FindSYCL.cmake"
-    "../cmake/FindIntelSYCL.cmake"
-    "../cmake/FindIntelSYCL_level_zero.cmake"
-    "../cmake/FindComputeCpp.cmake"
+    "${PROJECT_SOURCE_DIR}/cmake/FindComputeCpp.cmake"
+    "${PROJECT_SOURCE_DIR}/cmake/FindIntelSYCL.cmake"
+    "${PROJECT_SOURCE_DIR}/cmake/FindIntelSYCL_level_zero.cmake"
+    "${PROJECT_SOURCE_DIR}/cmake/Findlevel_zero.cmake"
+    "${PROJECT_SOURCE_DIR}/cmake/FindNUMA.cmake"
+    "${PROJECT_SOURCE_DIR}/cmake/FindOpenCL.cmake"
+    "${PROJECT_SOURCE_DIR}/cmake/FindSYCL.cmake"
     DESTINATION ${CCL_INSTALL_LIB})
 
-#static library
+# static library
 add_library(ccl-static STATIC $<TARGET_OBJECTS:ccl-objects>)
 set_target_properties(ccl-static PROPERTIES OUTPUT_NAME ccl)
 set_target_properties(ccl-static PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CCL_BUILD_DIR})
 install(TARGETS ccl-static ARCHIVE DESTINATION ${CCL_INSTALL_LIB} OPTIONAL)
 
-#headers installation
+# API headers
 install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/
         DESTINATION ${CCL_INSTALL_INCLUDE} FILES_MATCHING REGEX ".*\\.(h|hpp)$")
 
-#mpi & ofi rt
-file(GLOB mpi_bins "${PROJECT_SOURCE_DIR}/mpi/bin/*")
+# MPI and OFI runtimes
+file(GLOB mpi_bins "${DEPS_DIR}/mpi/bin/*")
 install(PROGRAMS ${mpi_bins} DESTINATION ${CCL_INSTALL_BIN})
 
-install(DIRECTORY ${PROJECT_SOURCE_DIR}/ofi/lib/
+install(DIRECTORY ${DEPS_DIR}/ofi/lib/
         DESTINATION ${CCL_INSTALL_LIB})
 
-install(DIRECTORY ${PROJECT_SOURCE_DIR}/mpi/include/
+install(DIRECTORY ${DEPS_DIR}/mpi/include/
         DESTINATION ${CCL_INSTALL_INCLUDE})
 
-install(DIRECTORY ${PROJECT_SOURCE_DIR}/mpi/lib/
+install(DIRECTORY ${DEPS_DIR}/mpi/lib/
         DESTINATION ${CCL_INSTALL_LIB})
 
-install(DIRECTORY ${PROJECT_SOURCE_DIR}/mpi/etc/
+install(DIRECTORY ${DEPS_DIR}/mpi/etc/
         DESTINATION ${CCL_INSTALL_ETC})
 
-install(DIRECTORY ${PROJECT_SOURCE_DIR}/mpi/licensing/
+install(DIRECTORY ${DEPS_DIR}/mpi/licensing/
         DESTINATION ${CCL_INSTALL_LICENSE}/mpi/)
diff --git a/src/atl/atl.cpp b/src/atl/atl.cpp
deleted file mode 100644
index b43525f1f..000000000
--- a/src/atl/atl.cpp
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include <algorithm>
-#include <assert.h>
-#include <cstring>
-#include <dirent.h>
-#include <dlfcn.h>
-
-#include "atl/atl.h"
-
-#define LIB_SUFFIX     ".so"
-#define ATL_LIB_PREFIX "libccl_atl_"
-
-static int initialized = 0;
-static int should_reserve_addr = 0;
-
-static int atl_lib_filter(const struct dirent* entry) {
-    size_t entry_len = strlen(entry->d_name);
-    size_t sfx_len = strlen(LIB_SUFFIX);
-    const char* sfx_ptr;
-
-    if (entry_len > sfx_len) {
-        sfx_ptr = strstr((entry->d_name), LIB_SUFFIX);
-
-        if (strstr((entry->d_name), ATL_LIB_PREFIX) && sfx_ptr && (strlen(sfx_ptr) == sfx_len))
-            return 1;
-        else
-            return 0;
-    }
-    else
-        return 0;
-}
-
-static void atl_ini_dir(const char* transport_name,
-                        int* argc,
-                        char*** argv,
-                        atl_attr_t* attr,
-                        atl_ctx_t** ctx,
-                        const char* dir,
-                        const char* main_addr) {
-    CCL_THROW("unexpected path");
-
-    int n = 0;
-    char* lib;
-    void* dlhandle;
-    struct dirent** liblist = NULL;
-    typedef atl_status_t (*init_f)(atl_transport_t*);
-    init_f init_func;
-    size_t transport_name_len = strlen(transport_name);
-
-    n = scandir(dir, &liblist, atl_lib_filter, NULL);
-    if (n < 0)
-        goto libdl_done;
-
-    while (n--) {
-        if (asprintf(&lib, "%s/%s", dir, liblist[n]->d_name) < 0)
-            goto libdl_done;
-
-        LOG_DEBUG("opening lib ", lib);
-        dlhandle = dlopen(lib, RTLD_NOW);
-        free(liblist[n]);
-        if (dlhandle == NULL) {
-            LOG_ERROR("can't open lib ", lib, ", error ", dlerror());
-            free(lib);
-            continue;
-        }
-
-        init_func = reinterpret_cast<init_f>(dlsym(dlhandle, "atl_ini"));
-        if (init_func == NULL) {
-            dlclose(dlhandle);
-            free(lib);
-        }
-        else {
-            LOG_DEBUG("lib ", lib, " contains necessary symbol");
-            free(lib);
-
-            atl_transport_t transport;
-            atl_status_t ret;
-
-            if ((init_func)(&transport) != ATL_STATUS_SUCCESS) {
-                dlclose(dlhandle);
-                continue;
-            }
-
-            if (strncmp(transport.name,
-                        transport_name,
-                        std::min(transport_name_len, strlen(transport.name)))) {
-                dlclose(dlhandle);
-                continue;
-            }
-
-            if (should_reserve_addr) {
-                ret = transport.reserve_addr(const_cast<char*>(main_addr));
-            }
-            else {
-                ret = transport.init(argc, argv, attr, ctx, main_addr, nullptr /* pmi */);
-            }
-            if (ret != ATL_STATUS_SUCCESS) {
-                dlclose(dlhandle);
-                continue;
-            }
-
-            break;
-        }
-    }
-
-libdl_done:
-    while (n-- > 0)
-        free(liblist[n]);
-    free(liblist);
-}
-
-/*
-    Split the given string "s" using the specified delimiter(s) in the string
-    "delim" and return an array of strings. The array is terminated with a NULL
-    pointer. Returned array should be freed with ofi_free_string_array().
-
-    Returns NULL on failure.
- */
-static char** atl_split_and_alloc(const char* s, const char* delim, size_t* count) {
-    int i, n;
-    char* tmp;
-    char* dup = NULL;
-    char** arr = NULL;
-
-    if (!s || !delim)
-        return NULL;
-
-    dup = strdup(s);
-    if (!dup)
-        return NULL;
-
-    /* compute the array size */
-    n = 1;
-    for (tmp = dup; *tmp != '\0'; ++tmp) {
-        for (i = 0; delim[i] != '\0'; ++i) {
-            if (*tmp == delim[i]) {
-                ++n;
-                break;
-            }
-        }
-    }
-
-    /* +1 to leave space for NULL terminating pointer */
-    arr = static_cast<char**>(calloc(n + 1, sizeof(*arr)));
-    if (!arr)
-        goto cleanup;
-
-    /* set array elts to point inside the dup'ed string */
-    for (tmp = dup, i = 0; tmp != NULL; ++i)
-        arr[i] = strsep(&tmp, delim);
-
-    assert(i == n);
-
-    if (count)
-        *count = n;
-
-    return arr;
-
-cleanup:
-    free(dup);
-    return NULL;
-}
-
-/* see atl_split_and_alloc() */
-static void atl_free_string_array(char** s) {
-    /* all strings are allocated from the same strdup'ed slab, so just free
-     * the first element */
-    if (s != NULL)
-        free(s[0]);
-
-    /* and then the actual array of pointers */
-    free(s);
-}
-
-atl_status_t atl_init(const char* transport_name,
-                      int* argc,
-                      char*** argv,
-                      atl_attr_t* attr,
-                      atl_ctx_t** ctx,
-                      const char* main_addr) {
-    CCL_THROW("unexpected path");
-
-    const char* transport_dl_dir = NULL;
-    int n = 0;
-    char** dirs;
-    void* dlhandle;
-
-    if (initialized)
-        return ATL_STATUS_FAILURE;
-
-    dlhandle = dlopen(NULL, RTLD_NOW);
-    if (dlhandle == NULL)
-        goto err_dlopen;
-
-    dlclose(dlhandle);
-
-    dirs = atl_split_and_alloc(transport_dl_dir, ":", NULL);
-    if (dirs) {
-        for (n = 0; dirs[n]; ++n) {
-            atl_ini_dir(transport_name, argc, argv, attr, ctx, dirs[n], main_addr);
-        }
-        atl_free_string_array(dirs);
-    }
-
-    return ATL_STATUS_SUCCESS;
-
-err_dlopen:
-    return ATL_STATUS_FAILURE;
-}
-
-void atl_main_addr_reserve(char* main_addr) {
-    CCL_THROW("unexpected path");
-    should_reserve_addr = 1;
-    atl_init("ofi", NULL, NULL, NULL, NULL, main_addr);
-    should_reserve_addr = 0;
-}
diff --git a/src/atl/atl.h b/src/atl/atl.h
index 66895cd61..c515fa483 100644
--- a/src/atl/atl.h
+++ b/src/atl/atl.h
@@ -30,7 +30,7 @@ class iatl {
 
     virtual atl_status_t atl_init(int* argc,
                                   char*** argv,
-                                  atl_attr_t* att,
+                                  atl_attr_t* attr,
                                   const char* main_addr,
                                   std::unique_ptr<ipmi>& pmi) = 0;
 
diff --git a/src/atl/atl_def.h b/src/atl/atl_def.h
index 8675046c1..5fd6ab20c 100644
--- a/src/atl/atl_def.h
+++ b/src/atl/atl_def.h
@@ -48,6 +48,15 @@
 #define ATL_OFI_INI ATL_EXT_INI
 #define ATL_MPI_INI ATL_EXT_INI
 
+#define ATL_CALL(func, err_action) \
+    do { \
+        atl_status_t status = func; \
+        if (status != ATL_STATUS_SUCCESS) { \
+            LOG_ERROR(#func " failed with status: ", status); \
+            err_action; \
+        } \
+    } while (0)
+
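+/* usage sketch (hypothetical caller), propagating failure on error:
+   ATL_CALL(transport->atl_init(argc, argv, &attr, main_addr, pmi),
+            return ATL_STATUS_FAILURE); */
+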
 class ipmi;
 
 typedef struct atl_ctx atl_ctx_t;
@@ -100,15 +109,29 @@ typedef enum {
     ATL_REDUCTION_CUSTOM
 } atl_reduction_t;
 
+typedef enum { ATL_MNIC_NONE, ATL_MNIC_LOCAL, ATL_MNIC_GLOBAL } atl_mnic_t;
+
 typedef struct {
-    size_t ep_count;
-    int enable_shm;
-    size_t tag_bits;
-    uint64_t max_tag;
-    int enable_rma;
-    size_t max_order_waw_size;
-    int sync_coll;
-    int extra_ep;
+    struct {
+        int enable_shm;
+        int enable_rma;
+        int enable_device_buf;
+        int enable_sync_coll;
+        int enable_extra_ep;
+        size_t ep_count;
+        atl_mnic_t mnic_type;
+        size_t mnic_count;
+    } in;
+    struct {
+        int enable_shm;
+        int enable_rma;
+        int enable_device_buf;
+        atl_mnic_t mnic_type;
+        size_t mnic_count;
+        size_t tag_bits;
+        uint64_t max_tag;
+        size_t max_order_waw_size;
+    } out;
 } atl_attr_t;
 
 typedef struct {
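Splitting `atl_attr_t` into `in` (requested) and `out` (granted) halves makes the capability negotiation explicit: callers fill `in`, and the transport reports what it actually provides in `out`. A minimal sketch of the intended calling pattern, with hypothetical values and `transport` standing for any `iatl` implementation:

    atl_attr_t attr{};
    attr.in.enable_shm = 1;
    attr.in.ep_count = 4;
    attr.in.mnic_type = ATL_MNIC_LOCAL;
    attr.in.mnic_count = 2;

    if (transport->atl_init(&argc, &argv, &attr, main_addr, pmi) == ATL_STATUS_SUCCESS) {
        // consume the granted values, not the requested ones
        uint64_t max_tag = attr.out.max_tag;
        size_t mnic_count = attr.out.mnic_count;
    }
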
diff --git a/src/atl/atl_wrapper.cpp b/src/atl/atl_wrapper.cpp
index 5ceaea8f0..fc1bed457 100644
--- a/src/atl/atl_wrapper.cpp
+++ b/src/atl/atl_wrapper.cpp
@@ -28,14 +28,20 @@ static std::list<std::shared_ptr<iatl>> transports{};
 static ccl_executor* executor;
 
 atl_attr_t atl_wrapper::attr = {
-    1, /* ep_count */
-    1, /* enable_shm */
-    64, /* tag_bits */
-    0xFFFFFFFFFFFFFFFF, /* max_tag */
-    0, /* enable_rma */
-    0, /* max_order_waw_size */
-    0, /* sync_coll */
-    0 /* extra_ep */
+    /* in */
+    {
+        0, /* enable_shm */
+        0, /* enable_rma */
+        0, /* enable_device_buf */
+        0, /* enable_sync_coll */
+        0, /* enable_extra_ep */
+        1, /* ep_count */
+        ATL_MNIC_NONE, /* mnic_type */
+        1 /* mnic_count */
+    },
+
+    /* out */
+    {}
 };
 
 void atl_wrapper::set_internal_env(const atl_attr_t& attr) {
@@ -149,7 +155,7 @@ atl_wrapper::atl_wrapper(int total_rank_count,
     init_transport();
 }
 void atl_wrapper::init_transport() {
-    LOG_DEBUG("init ATL, requested ep_count ", attr.ep_count);
+    LOG_DEBUG("init ATL, requested ep_count ", attr.in.ep_count);
     static std::mutex memory_mutex;
     {
         std::lock_guard<std::mutex> lock(memory_mutex);
@@ -160,7 +166,7 @@ void atl_wrapper::init_transport() {
         }
     }
     eps = transport->atl_get_eps();
-    tag = std::unique_ptr<ccl_atl_tag>(new ccl_atl_tag(attr.tag_bits, attr.max_tag));
+    tag = std::unique_ptr<ccl_atl_tag>(new ccl_atl_tag(attr.out.tag_bits, attr.out.max_tag));
 
     if (pmi) {
         threads_per_process = pmi->get_threads_per_process();
@@ -177,13 +183,25 @@ void atl_wrapper::init_transport() {
 
     if (rank == 0) {
         tag->print();
-        LOG_INFO("atl-parameters:");
-        LOG_INFO("  ep_count: ", attr.ep_count);
-        LOG_INFO("  enable_shm: ", attr.enable_shm);
-        LOG_INFO("  enable_rma: ", attr.enable_rma);
-        LOG_INFO("  max_order_waw_size: ", attr.max_order_waw_size);
-        LOG_INFO("  sync_coll: ", attr.sync_coll);
-        LOG_INFO("  extra_ep: ", attr.extra_ep);
+        LOG_INFO("atl-in-attrs:");
+        LOG_INFO("  enable_shm: ", attr.in.enable_shm);
+        LOG_INFO("  enable_rma: ", attr.in.enable_rma);
+        LOG_INFO("  enable_device_buf: ", attr.in.enable_device_buf);
+        LOG_INFO("  enable_sync_coll: ", attr.in.enable_sync_coll);
+        LOG_INFO("  enable_extra_ep: ", attr.in.enable_extra_ep);
+        LOG_INFO("  ep_count: ", attr.in.ep_count);
+        LOG_INFO("  mnic_type: ", attr.in.mnic_type);
+        LOG_INFO("  mnic_count: ", attr.in.mnic_count);
+
+        LOG_INFO("atl-out-attrs:");
+        LOG_INFO("  enable_shm: ", attr.out.enable_shm);
+        LOG_INFO("  enable_rma: ", attr.out.enable_rma);
+        LOG_INFO("  enable_device_buf: ", attr.out.enable_device_buf);
+        LOG_INFO("  mnic_type: ", attr.out.mnic_type);
+        LOG_INFO("  mnic_count: ", attr.out.mnic_count);
+        LOG_INFO("  tag_bits: ", attr.out.tag_bits);
+        LOG_INFO("  max_tag: ", attr.out.max_tag);
+        LOG_INFO("  max_order_waw_size: ", attr.out.max_order_waw_size);
     }
 
     if ((!pmi) || (pmi && pmi->get_local_thread_idx() == 0)) {
diff --git a/src/atl/atl_wrapper.h b/src/atl/atl_wrapper.h
index aee1ad744..5edad0aa3 100644
--- a/src/atl/atl_wrapper.h
+++ b/src/atl/atl_wrapper.h
@@ -40,14 +40,6 @@ class atl_wrapper {
                 const std::vector<int>& ranks,
                 std::shared_ptr<ikvs_wrapper> k);
 
-    //    atl_status_t
-    //    atl_init(int* argc, char*** argv,
-    //             atl_attr_t* att,
-    //             const char* main_addr)
-    //    {
-    //        return transport->atl_init(argc, argv, att, main_addr, pmi);
-    //    }
-
     atl_status_t atl_main_addr_reserve(char* main_addr) {
         if (!pmi)
             return ATL_STATUS_UNSUPPORTED;
diff --git a/src/atl/mpi/atl_mpi.hpp b/src/atl/mpi/atl_mpi.hpp
index ab02a5da8..1a98e3419 100644
--- a/src/atl/mpi/atl_mpi.hpp
+++ b/src/atl/mpi/atl_mpi.hpp
@@ -24,7 +24,7 @@ class atl_mpi final : public iatl {
 
     atl_status_t atl_init(int* argc,
                           char*** argv,
-                          atl_attr_t* att,
+                          atl_attr_t* attr,
                           const char* main_addr,
                           std::unique_ptr<ipmi>& pmi) override;
 
diff --git a/src/atl/mpi/atl_mpi_impl.cpp b/src/atl/mpi/atl_mpi_impl.cpp
index 9a312d514..923d538bb 100644
--- a/src/atl/mpi/atl_mpi_impl.cpp
+++ b/src/atl/mpi/atl_mpi_impl.cpp
@@ -33,16 +33,22 @@
 
 #define ATL_MPI_PM_KEY "atl-mpi"
 
-#define EP_IDX_KEY          "ep_idx"
-#define NIC_IDX_KEY         "pref_nic"
-#define NIC_COUNT_KEY       "num_nics"
-#define CLOSE_NIC_IDX_KEY   "pref_close_nic"
-#define CLOSE_NIC_COUNT_KEY "num_close_nics"
+#define EP_IDX_KEY "ep_idx"
+
+#define GLOBAL_NIC_IDX_KEY   "pref_nic"
+#define GLOBAL_NIC_COUNT_KEY "num_nics"
+#define LOCAL_NIC_IDX_KEY    "pref_close_nic"
+#define LOCAL_NIC_COUNT_KEY  "num_close_nics"
 
 #define RET2ATL(ret) (ret != MPI_SUCCESS) ? ATL_STATUS_FAILURE : ATL_STATUS_SUCCESS
 
 typedef enum { ATL_MPI_LIB_IMPI, ATL_MPI_LIB_MPICH, ATL_MPI_LIB_NONE } atl_mpi_lib_type_t;
 
+typedef struct {
+    atl_mpi_lib_type_t type;
+    int device_buf;
+} atl_mpi_lib_attr_t;
+
 typedef struct {
     atl_mpi_lib_type_t type;
     const char* name;
@@ -56,6 +62,9 @@ typedef struct {
     /* minimal expected version of library, mandatory */
     int min_version_value;
 
+    /* minimal expected version of library with device_buf support, mandatory */
+    int min_device_buf_version_value;
+
     /* string prefix before library kind, optional */
     const char* kind_prefix;
 
@@ -66,9 +75,16 @@ typedef struct {
 #define MPI_LIB_INFO_MAX_COUNT 3
 
 static atl_mpi_lib_info_t mpi_lib_infos[MPI_LIB_INFO_MAX_COUNT] = {
-    { ATL_MPI_LIB_IMPI, "impi", "Intel(R) MPI Library", NULL, 2019, "library kind:", "release_mt" },
-    { ATL_MPI_LIB_MPICH, "mpich", "MPICH Custom Information:", "drop", 34, NULL, NULL },
-    { ATL_MPI_LIB_NONE, "none", "", NULL, 0, NULL, NULL },
+    { ATL_MPI_LIB_IMPI,
+      "impi",
+      "Intel(R) MPI Library",
+      NULL,
+      2019,
+      2021,
+      "library kind:",
+      "release_mt" },
+    { ATL_MPI_LIB_MPICH, "mpich", "MPICH Custom Information:", "drop", 34, -1, NULL, NULL },
+    { ATL_MPI_LIB_NONE, "none", "", NULL, 0, -1, NULL, NULL },
 };
 
 #ifdef CCL_BF16_COMPILER
@@ -102,20 +118,22 @@ typedef struct {
 typedef struct atl_mpi_global_data {
     int is_external_init;
     size_t ctx_count;
-    atl_mpi_lib_type_t mpi_lib_type;
     int extra_ep;
-    size_t nic_count;
-    size_t close_nic_count;
+    atl_mnic_t mnic_type;
+    size_t mnic_count;
+    atl_mpi_lib_attr_t mpi_lib_attr;
     atl_mpi_bf16_data_t bf16;
     atl_mpi_fp16_data_t fp16;
 
     atl_mpi_global_data()
             : is_external_init(0),
               ctx_count(0),
-              mpi_lib_type(ATL_MPI_LIB_NONE),
               extra_ep(0),
-              nic_count(1),
-              close_nic_count(1) {
+              mnic_type(ATL_MNIC_NONE),
+              mnic_count(1) {
+        mpi_lib_attr.type = ATL_MPI_LIB_NONE;
+        mpi_lib_attr.device_buf = 0;
+
         bf16.dtype = MPI_DATATYPE_NULL;
         bf16.sum_op = MPI_OP_NULL;
         bf16.prod_op = MPI_OP_NULL;
@@ -536,8 +554,9 @@ static MPI_Op atl2mpi_op(atl_reduction_t rtype, MPI_Datatype dtype) {
     }
 }
 
-atl_mpi_lib_type_t atl_mpi_get_lib_type() {
-    atl_mpi_lib_type_t lib_type = ATL_MPI_LIB_NONE;
+atl_mpi_lib_attr_t atl_mpi_get_lib_attr() {
+    atl_mpi_lib_attr_t lib_attr = { ATL_MPI_LIB_NONE, 0 };
+
     char mpi_version[MPI_MAX_LIBRARY_VERSION_STRING] = { 0 };
     int mpi_version_len = -1, i;
     atl_mpi_lib_info_t* final_info = NULL;
@@ -548,7 +567,7 @@ atl_mpi_lib_type_t atl_mpi_get_lib_type() {
     if ((ret != MPI_SUCCESS) || (mpi_version_len < 0) ||
         (mpi_version_len > MPI_MAX_LIBRARY_VERSION_STRING)) {
         LOG_WARN("can not retrieve MPI version, mpi_version_len ", mpi_version_len, ", ret", ret);
-        return ATL_MPI_LIB_NONE;
+        return lib_attr;
     }
 
     /* remove trailing spaces at the end for more compact log */
@@ -557,12 +576,25 @@ atl_mpi_lib_type_t atl_mpi_get_lib_type() {
 
     LOG_DEBUG("MPI version: ", mpi_version);
 
+    /* for filtering */
+    char* lib_type_env = getenv("CCL_ATL_MPI");
+
     for (i = 0; i < MPI_LIB_INFO_MAX_COUNT; i++) {
         atl_mpi_lib_info_t* info = &(mpi_lib_infos[i]);
 
         if (info->type == ATL_MPI_LIB_NONE)
             continue;
 
+        if (lib_type_env) {
+            if (strcmp(lib_type_env, info->name)) {
+                LOG_DEBUG("library ", info->name, " is filtered out by user input ", lib_type_env);
+                continue;
+            }
+            else {
+                LOG_DEBUG("use lib_type = ", lib_type_env, " because it is requested explicitly");
+            }
+        }
+
         CCL_THROW_IF_NOT(info->version_prefix_1, "empty version_prefix_1");
         CCL_THROW_IF_NOT(info->min_version_value >= 0, "unexpected minimal version");
 
@@ -628,7 +660,6 @@ atl_mpi_lib_type_t atl_mpi_get_lib_type() {
                                  " (min version) ",
                                  (info->kind_value ? info->kind_value : ""),
                                  "\n");
-                        continue;
                     }
                 }
                 else {
@@ -649,50 +680,49 @@ atl_mpi_lib_type_t atl_mpi_get_lib_type() {
                       version_value,
                       ") is higher or equal to minimal expected version (",
                       info->min_version_value,
-                      ") "
-                      "and kind matches with expected kind");
-            break;
-        }
-    }
+                      ")");
 
-    /* user input has higher priority */
-    char* lib_type_env = NULL;
-    if ((lib_type_env = getenv("CCL_ATL_MPI")) != NULL) {
-        final_info = NULL;
-        for (i = 0; i < MPI_LIB_INFO_MAX_COUNT; i++) {
-            atl_mpi_lib_info_t* info = &(mpi_lib_infos[i]);
+            lib_attr.type = final_info->type;
+            lib_attr.device_buf =
+                ((final_info->min_device_buf_version_value >= 0) &&
+                 (version_value >= final_info->min_device_buf_version_value))
+                    ? 1
+                    : 0;
 
-            if (!strcmp(lib_type_env, info->name)) {
-                final_info = info;
-                LOG_DEBUG("set lib_type = ", lib_type_env, " because it is requested explicitly");
-                break;
-            }
+            break;
         }
     }
 
     if (final_info) {
         LOG_DEBUG("MPI library type: ", final_info->name);
-        lib_type = final_info->type;
     }
     else {
         LOG_DEBUG("MPI library type: none");
-        lib_type = ATL_MPI_LIB_NONE;
     }
 
-    return lib_type;
+    return lib_attr;
 }
 
 size_t atl_mpi_get_ep_count(const atl_attr_t& attr) {
-    size_t mpi_ep_count = attr.ep_count;
-    if (attr.extra_ep)
-        mpi_ep_count += attr.extra_ep;
+    size_t mpi_ep_count = attr.in.ep_count;
+    if (attr.in.enable_extra_ep)
+        mpi_ep_count += attr.in.enable_extra_ep;
     return mpi_ep_count;
 }
 
+size_t atl_mpi_get_ep_idx(size_t ep_idx) {
+    size_t mpi_ep_idx = ep_idx;
+    if (global_data.extra_ep)
+        mpi_ep_idx += global_data.extra_ep;
+    return mpi_ep_idx;
+}
+
 /* set these knobs without detection of MPI library type */
 atl_status_t atl_mpi_set_base_env(const atl_attr_t& attr) {
     setenv("PSM2_MULTI_EP", "1", 0);
     setenv("FI_OFI_RXM_USE_HASH", "0", 0);
+
+#ifdef CCL_ENABLE_SYCL
+    setenv("FI_SHM_DISABLE_CMA", "1", 0);
+#endif /* CCL_ENABLE_SYCL */
+
     setenv("MPIR_CVAR_DEFAULT_THREAD_LEVEL", "MPI_THREAD_MULTIPLE", 0);
 
     /* request IMPI level append library kind into MPI_Get_library_version output */
@@ -701,24 +731,34 @@ atl_status_t atl_mpi_set_base_env(const atl_attr_t& attr) {
     return ATL_STATUS_SUCCESS;
 }
 
-atl_status_t atl_mpi_set_impi_env(const atl_attr_t& attr) {
+atl_status_t atl_mpi_set_impi_env(const atl_attr_t& attr, const atl_mpi_lib_attr_t& lib_attr) {
     char ep_count_str[MPI_MAX_INFO_VAL] = { 0 };
     snprintf(ep_count_str, MPI_MAX_INFO_VAL, "%zu", atl_mpi_get_ep_count(attr));
 
+    if (attr.in.ep_count)
+        setenv("I_MPI_OFI_ISEND_INJECT_THRESHOLD", "0", 0);
+
+#ifdef CCL_ENABLE_SYCL
+    setenv("I_MPI_SHM_CMA", "0", 0);
+    if (attr.in.enable_device_buf && lib_attr.device_buf) {
+        setenv("I_MPI_OFFLOAD", "2", 0);
+        setenv("I_MPI_OFFLOAD_TOPOLIB", "l0", 0);
+        setenv("I_MPI_OFFLOAD_QUEUE_CACHE", "1", 0);
+        setenv("I_MPI_OFFLOAD_LIST_CACHE", "1", 0);
+        if (attr.in.ep_count > 1) {
+            /* try to set global lock level before vci level
+               because setenv is invoked with overwrite=0 */
+            setenv("I_MPI_THREAD_LOCK_LEVEL", "global", 0);
+        }
+    }
+#endif /* CCL_ENABLE_SYCL */
+
     setenv("I_MPI_THREAD_SPLIT", "1", 0);
     setenv("I_MPI_THREAD_RUNTIME", "generic", 0);
     setenv("I_MPI_THREAD_MAX", ep_count_str, 0);
     setenv("I_MPI_THREAD_ID_KEY", EP_IDX_KEY, 0);
     setenv("I_MPI_THREAD_LOCK_LEVEL", "vci", 0);
 
-    if (attr.ep_count)
-        setenv("I_MPI_OFI_ISEND_INJECT_THRESHOLD", "0", 0);
-
-    auto& env = ccl::global_data::env();
-    if (env.log_level >= ccl_log_level::info) {
-        setenv("I_MPI_DEBUG", "4", 0);
-    }
-
     return ATL_STATUS_SUCCESS;
 }
 
@@ -756,18 +796,20 @@ atl_status_t atl_mpi_set_mpich_env(const atl_attr_t& attr) {
     setenv("MPIR_CVAR_CH4_OFI_MAX_VCIS", ep_count_str, 0);
     setenv("MPIR_CVAR_CH4_ASYNC_PROGRESS_ID_KEY", EP_IDX_KEY, 0);
     setenv("MPIR_CVAR_CH4_OFI_ENABLE_SCALABLE_ENDPOINTS", "1", 0);
-    setenv("MPIR_CVAR_CH4_OFI_ENABLE_NIC_SELECTION", "1", 0);
+
+    if (attr.in.mnic_type != ATL_MNIC_NONE) {
+        setenv("MPIR_CVAR_CH4_OFI_ENABLE_NIC_SELECTION", "1", 0);
+        auto& env = ccl::global_data::env();
+        if (env.log_level >= ccl_log_level::info) {
+            setenv("MPIR_CVAR_CH4_OFI_DUMP_NIC_SETTINGS", "1", 0);
+        }
+    }
 
     setenv("FI_PSM2_DELAY", "0", 0);
     setenv("FI_PSM2_TIMEOUT", "0", 0);
     setenv("FI_PSM2_NAME_SERVER", "0", 0);
     setenv("HFI_NO_CPUAFFINITY", "1", 0);
 
-    auto& env = ccl::global_data::env();
-    if (env.log_level >= ccl_log_level::info) {
-        setenv("MPIR_CVAR_CH4_OFI_DUMP_NIC_SETTINGS", "1", 0);
-    }
-
     return ATL_STATUS_SUCCESS;
 }
 
@@ -781,12 +823,12 @@ atl_status_t atl_mpi_check_mpich_env(const atl_attr_t& attr) {
 }
 
 atl_status_t atl_mpi_set_env(const atl_attr_t& attr) {
-    if (global_data.mpi_lib_type != ATL_MPI_LIB_NONE) {
+    if (global_data.mpi_lib_attr.type != ATL_MPI_LIB_NONE) {
         /* library type was already detected and env was set, make sanity check */
-        if (global_data.mpi_lib_type == ATL_MPI_LIB_IMPI) {
+        if (global_data.mpi_lib_attr.type == ATL_MPI_LIB_IMPI) {
             return atl_mpi_check_impi_env(attr);
         }
-        else if (global_data.mpi_lib_type == ATL_MPI_LIB_MPICH) {
+        else if (global_data.mpi_lib_attr.type == ATL_MPI_LIB_MPICH) {
             return atl_mpi_check_mpich_env(attr);
         }
         return ATL_STATUS_SUCCESS;
@@ -794,18 +836,17 @@ atl_status_t atl_mpi_set_env(const atl_attr_t& attr) {
 
     atl_mpi_set_base_env(attr);
 
-    atl_mpi_lib_type_t type = atl_mpi_get_lib_type();
+    atl_mpi_lib_attr_t mpi_lib_attr = atl_mpi_get_lib_attr();
 
-    if (type == ATL_MPI_LIB_NONE) {
-        /* nothing to do */
+    if (mpi_lib_attr.type == ATL_MPI_LIB_NONE) {
         return ATL_STATUS_SUCCESS;
     }
 
-    if (type == ATL_MPI_LIB_IMPI) {
-        atl_mpi_set_impi_env(attr);
+    if (mpi_lib_attr.type == ATL_MPI_LIB_IMPI) {
+        atl_mpi_set_impi_env(attr, mpi_lib_attr);
         atl_mpi_check_impi_env(attr);
     }
-    else if (type == ATL_MPI_LIB_MPICH) {
+    else if (mpi_lib_attr.type == ATL_MPI_LIB_MPICH) {
         atl_mpi_set_mpich_env(attr);
         atl_mpi_check_mpich_env(attr);
     }
@@ -819,7 +860,7 @@ atl_status_t atl_mpi_set_env(const atl_attr_t& attr) {
         LOG_DEBUG("set CCL-MPI specific environment");
     }
 
-    global_data.mpi_lib_type = type;
+    global_data.mpi_lib_attr = mpi_lib_attr;
 
     return ATL_STATUS_SUCCESS;
 }
@@ -838,9 +879,6 @@ atl_mpi_comm_info_t atl_mpi_get_comm_info(MPI_Comm comm, const char* key) {
 }
 
 size_t atl_mpi_get_nic_count(const char* nic_count_key) {
-    if (global_data.mpi_lib_type != ATL_MPI_LIB_MPICH)
-        return 1;
-
     size_t count = 1;
     atl_mpi_comm_info_t info = atl_mpi_get_comm_info(MPI_COMM_WORLD, nic_count_key);
     CCL_THROW_IF_NOT(info.found, "MPI comm key ", nic_count_key, " was not set");
@@ -867,7 +905,7 @@ void atl_mpi_check_comm_info(MPI_Comm comm, const char* key, const char* expecte
 }
 
 void atl_mpi_check_comm_ep_idx(MPI_Comm comm, size_t expected_idx) {
-    if (global_data.mpi_lib_type == ATL_MPI_LIB_NONE)
+    if (global_data.mpi_lib_attr.type == ATL_MPI_LIB_NONE)
         return;
 
     char expected_idx_str[MPI_MAX_INFO_VAL] = { 0 };
@@ -876,9 +914,6 @@ void atl_mpi_check_comm_ep_idx(MPI_Comm comm, size_t expected_idx) {
 }
 
 void atl_mpi_check_comm_nic_idx(MPI_Comm comm, size_t expected_idx, const char* nic_idx_key) {
-    if (global_data.mpi_lib_type != ATL_MPI_LIB_MPICH)
-        return;
-
     char expected_idx_str[MPI_MAX_INFO_VAL] = { 0 };
     snprintf(expected_idx_str, MPI_MAX_INFO_VAL, "%zu", expected_idx);
     atl_mpi_check_comm_info(comm, nic_idx_key, expected_idx_str);
@@ -887,7 +922,7 @@ void atl_mpi_check_comm_nic_idx(MPI_Comm comm, size_t expected_idx, const char*
 #ifdef ENABLE_DEBUG
 inline void atl_mpi_check_ep(atl_ep_t* ep) {
     atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
-    atl_mpi_check_comm_ep_idx(mpi_ep->mpi_comm, ep->idx);
+    atl_mpi_check_comm_ep_idx(mpi_ep->mpi_comm, atl_mpi_get_ep_idx(ep->idx));
 }
 #else
 #define atl_mpi_check_ep(ep)
@@ -1090,6 +1125,7 @@ static atl_status_t atl_mpi_ep_allreduce(atl_ep_t* ep,
         mpi_req->native_req = MPI_REQUEST_NULL;
     }
     else {
         ret = MPI_Iallreduce((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
                              recv_buf,
                              count,
@@ -1441,13 +1477,14 @@ static atl_comp_ops_t atl_mpi_ep_comp_ops = { .wait = atl_mpi_ep_wait,
 
 static atl_status_t atl_mpi_ep_init(atl_mpi_ctx_t* mpi_ctx, size_t idx, atl_ep_t** ep) {
     int ret;
-    ssize_t mpi_ep_idx = idx;
 
-    /* select NIC index from local NICs only */
-    size_t nic_idx = (idx % global_data.close_nic_count);
+    ssize_t mpi_ep_idx = atl_mpi_get_ep_idx(idx);
+    char mpi_ep_idx_str[MPI_MAX_INFO_VAL] = { 0 };
 
+    size_t nic_idx = 0;
     char nic_idx_str[MPI_MAX_INFO_VAL] = { 0 };
-    char mpi_ep_idx_str[MPI_MAX_INFO_VAL] = { 0 };
+    const char* nic_idx_key =
+        (global_data.mnic_type == ATL_MNIC_GLOBAL) ? GLOBAL_NIC_IDX_KEY : LOCAL_NIC_IDX_KEY;
 
     atl_mpi_ep_t* mpi_ep = (atl_mpi_ep_t*)calloc(1, sizeof(atl_mpi_ep_t));
     if (!mpi_ep)
@@ -1460,16 +1497,17 @@ static atl_status_t atl_mpi_ep_init(atl_mpi_ctx_t* mpi_ctx, size_t idx, atl_ep_t
     MPI_Info info;
     MPI_Info_create(&info);
 
-    /* set NIC index */
-    snprintf(nic_idx_str, MPI_MAX_INFO_VAL, "%zu", nic_idx);
-    MPI_Info_set(info, CLOSE_NIC_IDX_KEY, nic_idx_str);
-
     /* set EP index */
-    if (global_data.extra_ep)
-        mpi_ep_idx += global_data.extra_ep;
     snprintf(mpi_ep_idx_str, MPI_MAX_INFO_VAL, "%zu", mpi_ep_idx);
     MPI_Info_set(info, EP_IDX_KEY, mpi_ep_idx_str);
 
+    if (global_data.mnic_type != ATL_MNIC_NONE) {
+        /* set NIC index */
+        nic_idx = (idx % global_data.mnic_count);
+        snprintf(nic_idx_str, MPI_MAX_INFO_VAL, "%zu", nic_idx);
+        MPI_Info_set(info, nic_idx_key, nic_idx_str);
+    }
+
     MPI_Comm_set_info(mpi_ep->mpi_comm, info);
 
     if (mpi_ctx->progress_mode == ATL_PROGRESS_POLL) {
@@ -1479,14 +1517,18 @@ static atl_status_t atl_mpi_ep_init(atl_mpi_ctx_t* mpi_ctx, size_t idx, atl_ep_t
         MPI_Comm_set_info(mpi_ep->dummy_comm, info);
         MPI_Irecv(NULL, 0, MPI_CHAR, 0, 0, mpi_ep->dummy_comm, &(mpi_ep->dummy_req.native_req));
 
-        atl_mpi_check_comm_nic_idx(mpi_ep->dummy_comm, nic_idx, CLOSE_NIC_IDX_KEY);
         atl_mpi_check_comm_ep_idx(mpi_ep->dummy_comm, mpi_ep_idx);
+        if (global_data.mnic_type != ATL_MNIC_NONE) {
+            atl_mpi_check_comm_nic_idx(mpi_ep->dummy_comm, nic_idx, nic_idx_key);
+        }
     }
 
     MPI_Info_free(&info);
 
-    atl_mpi_check_comm_nic_idx(mpi_ep->mpi_comm, nic_idx, CLOSE_NIC_IDX_KEY);
     atl_mpi_check_comm_ep_idx(mpi_ep->mpi_comm, mpi_ep_idx);
+    if (global_data.mnic_type != ATL_MNIC_NONE) {
+        atl_mpi_check_comm_nic_idx(mpi_ep->mpi_comm, nic_idx, nic_idx_key);
+    }
 
     LOG_DEBUG("atl-mpi-ep: ", idx, ", ep_idx ", mpi_ep_idx, ", nic_idx ", nic_idx);
 
@@ -1541,7 +1583,7 @@ static atl_status_t atl_mpi_init(int* argc,
         if (!global_data.is_external_init) {
             ret = MPI_Init_thread(argc, argv, required_thread_level, &provided_thread_level);
             if (provided_thread_level < required_thread_level) {
-                LOG_ERROR("unexpected MPI thread level: requested ",
+                LOG_ERROR("unexpected MPI thread level: required ",
                           required_thread_level,
                           ", provided ",
                           provided_thread_level);
@@ -1552,23 +1594,40 @@ static atl_status_t atl_mpi_init(int* argc,
             LOG_DEBUG("MPI was initialized externaly");
             MPI_Query_thread(&provided_thread_level);
             if (provided_thread_level < required_thread_level) {
-                LOG_ERROR("MPI was initialized externaly but with unexpected thread level: "
-                          "requested ",
-                          required_thread_level,
-                          ", provided ",
-                          provided_thread_level);
-                goto err_init;
+                LOG_WARN("MPI was initialized externaly but with unexpected thread level: "
+                         "required ",
+                         required_thread_level,
+                         ", provided ",
+                         provided_thread_level);
             }
         }
 
         if (ret)
             goto err_init;
 
-        if (global_data.mpi_lib_type == ATL_MPI_LIB_NONE)
-            global_data.mpi_lib_type = atl_mpi_get_lib_type();
-        global_data.extra_ep = attr->extra_ep;
-        global_data.nic_count = atl_mpi_get_nic_count(NIC_COUNT_KEY);
-        global_data.close_nic_count = atl_mpi_get_nic_count(CLOSE_NIC_COUNT_KEY);
+        if (global_data.mpi_lib_attr.type == ATL_MPI_LIB_NONE)
+            global_data.mpi_lib_attr = atl_mpi_get_lib_attr();
+
+        global_data.extra_ep = attr->in.enable_extra_ep;
+
+        global_data.mnic_type = attr->in.mnic_type;
+        if (global_data.mpi_lib_attr.type != ATL_MPI_LIB_MPICH) {
+            /* only MPICH supports multi-NIC */
+            global_data.mnic_type = ATL_MNIC_NONE;
+        }
+
+        if (global_data.mnic_type == ATL_MNIC_LOCAL) {
+            global_data.mnic_count = atl_mpi_get_nic_count(LOCAL_NIC_COUNT_KEY);
+        }
+        else if (global_data.mnic_type == ATL_MNIC_GLOBAL) {
+            global_data.mnic_count = atl_mpi_get_nic_count(GLOBAL_NIC_COUNT_KEY);
+        }
+        else if (global_data.mnic_type == ATL_MNIC_NONE) {
+            global_data.mnic_count = 1;
+        }
+        global_data.mnic_count = std::min(global_data.mnic_count, attr->in.mnic_count);
+        global_data.mnic_count = std::min(global_data.mnic_count, attr->in.ep_count);
+        global_data.mnic_count = std::max(global_data.mnic_count, (size_t)(1));
 
         if (atl_mpi_bf16_init() == ATL_STATUS_FAILURE) {
             atl_mpi_bf16_finalize();
@@ -1597,8 +1656,8 @@ static atl_status_t atl_mpi_init(int* argc,
 
     ctx->ops = &atl_mpi_ops;
     ctx->mr_ops = &atl_mpi_mr_ops;
-    ctx->ep_count = attr->ep_count;
-    ctx->eps = (atl_ep_t**)calloc(1, sizeof(void*) * attr->ep_count);
+    ctx->ep_count = attr->in.ep_count;
+    ctx->eps = (atl_ep_t**)calloc(1, sizeof(void*) * attr->in.ep_count);
     if (!ctx->eps)
         goto err_after_init;
 
@@ -1610,23 +1669,25 @@ static atl_status_t atl_mpi_init(int* argc,
     else {
         mpi_ctx->progress_mode = ATL_PROGRESS_CHECK;
     }
-    mpi_ctx->sync_coll = attr->sync_coll;
+    mpi_ctx->sync_coll = attr->in.enable_sync_coll;
 
     if (coord->global_idx == 0) {
         if (global_data.ctx_count == 1) {
             LOG_INFO("atl-mpi-global:")
             LOG_INFO("  is_external_init: ", global_data.is_external_init);
-            LOG_INFO("  mpi_lib_type: ", mpi_lib_infos[global_data.mpi_lib_type].name);
+            LOG_INFO("  mpi_lib_attr.type: ", mpi_lib_infos[global_data.mpi_lib_attr.type].name);
+            LOG_INFO("  mpi_lib_attr.device_buf: ", global_data.mpi_lib_attr.device_buf);
             LOG_INFO("  extra_ep: ", global_data.extra_ep);
-            LOG_INFO("  nic_count: ", global_data.nic_count);
-            LOG_INFO("  close_nic_count: ", global_data.close_nic_count);
+            LOG_INFO("  mnic_type: ", global_data.mnic_type);
+            if (global_data.mnic_type != ATL_MNIC_NONE)
+                LOG_INFO("  mnic_count: ", global_data.mnic_count);
         }
         LOG_INFO("atl-mpi-ctx: ", (global_data.ctx_count - 1));
         LOG_INFO("  progress_mode: ", mpi_ctx->progress_mode);
         LOG_INFO("  sync_coll: ", mpi_ctx->sync_coll);
     }
 
-    for (i = 0; i < attr->ep_count; i++) {
+    for (i = 0; i < attr->in.ep_count; i++) {
         ret = atl_mpi_ep_init(mpi_ctx, i, &(ctx->eps[i]));
         if (ret)
             goto err_ep_dup;
@@ -1636,15 +1697,20 @@ static atl_status_t atl_mpi_init(int* argc,
 
     MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_TAG_UB, &tag_ub_ptr, &is_tag_ub_set);
 
-    attr->tag_bits = 32;
-    attr->max_tag = (is_tag_ub_set) ? *((int*)tag_ub_ptr) : 0;
-    attr->enable_rma = 0;
-    attr->max_order_waw_size = 0;
+    /* report actual attributes back to upper level */
+    attr->out.enable_shm = 0;
+    attr->out.enable_rma = 0;
+    attr->out.enable_device_buf = attr->in.enable_device_buf & global_data.mpi_lib_attr.device_buf;
+    attr->out.mnic_type = global_data.mnic_type;
+    attr->out.mnic_count = global_data.mnic_count;
+    attr->out.tag_bits = 32;
+    attr->out.max_tag = (is_tag_ub_set) ? *((int*)tag_ub_ptr) : 0;
+    attr->out.max_order_waw_size = 0;
 
     return ATL_STATUS_SUCCESS;
 
 err_ep_dup:
-    for (i = 0; i < attr->ep_count; i++) {
+    for (i = 0; i < attr->in.ep_count; i++) {
         atl_mpi_ep_t* mpi_ep = container_of(ctx->eps[i], atl_mpi_ep_t, ep);
 
         if (ctx->eps[i] && mpi_ep) {
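Note how the multi-NIC count is negotiated at the end of `atl_mpi_init`: the detected count is capped by the user request and the endpoint count, and floored at one. A standalone sketch of that clamp, as a hypothetical free function mirroring the three `std::min`/`std::max` lines above:

    #include <algorithm>
    #include <cstddef>

    size_t clamp_mnic_count(size_t detected, size_t requested, size_t ep_count) {
        size_t n = std::min(detected, requested); // user request caps detection
        n = std::min(n, ep_count);                // no more NICs than endpoints
        return std::max(n, (size_t)1);            // always at least one NIC
    }
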
diff --git a/src/atl/ofi/atl_ofi_impl.cpp b/src/atl/ofi/atl_ofi_impl.cpp
index 77f8391d5..ea34fef84 100644
--- a/src/atl/ofi/atl_ofi_impl.cpp
+++ b/src/atl/ofi/atl_ofi_impl.cpp
@@ -14,12 +14,14 @@
  limitations under the License.
 */
 #include <assert.h>
+#include <dlfcn.h>
 #include <inttypes.h>
 #include <math.h>
 #include <rdma/fabric.h>
 #include <rdma/fi_cm.h>
 #include <rdma/fi_tagged.h>
 #include <rdma/fi_rma.h>
+#include <sstream>
 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>
@@ -30,23 +32,30 @@
 #include <errno.h>
 
 #include "atl.h"
+#include "hwloc/hwloc_wrapper.h"
+
+#define ATL_OFI_BASE_PM_KEY     "atl-ofi"
+#define ATL_OFI_FI_ADDR_PM_KEY  ATL_OFI_BASE_PM_KEY "-fiaddr"
+#define ATL_OFI_HOSTNAME_PM_KEY ATL_OFI_BASE_PM_KEY "-hostname"
 
-#define ATL_OFI_BASE_PM_KEY         "atl-ofi"
-#define ATL_OFI_FI_ADDR_PM_KEY      ATL_OFI_BASE_PM_KEY "-fiaddr"
-#define ATL_OFI_HOSTNAME_PM_KEY     ATL_OFI_BASE_PM_KEY "-hostname"
 #define ATL_OFI_TIMEOUT_SEC_ENV     "ATL_OFI_TIMEOUT_SEC"
 #define ATL_OFI_MAX_RETRY_COUNT_ENV "ATL_OFI_MAX_RETRY_COUNT"
+
 #define ATL_OFI_DEFAULT_TIMEOUT_SEC 60
 #define ATL_OFI_MAX_RETRY_COUNT     10000
 #define ATL_OFI_MAX_HOSTNAME_LEN    64
 #define ATL_OFI_WAIT_SEC            10
 #define ATL_OFI_CQ_READ_ITERS       10000
 #define ATL_OFI_CQ_BUNCH_SIZE       8
+
 #define ATL_OFI_MAX_PROV_ENV_LEN    128
 #define ATL_OFI_PMI_PROV_MULTIPLIER 100
 #define ATL_OFI_PMI_PROC_MULTIPLIER (ATL_OFI_PMI_PROV_MULTIPLIER * 10)
-#define ATL_OFI_MAX_PROV_COUNT      2 /* NW and SHM providers */
-#define ATL_OFI_SHM_PROV_NAME       "shm"
+#define ATL_OFI_MAX_NW_PROV_COUNT   32
+#define ATL_OFI_MAX_PROV_COUNT      (ATL_OFI_MAX_NW_PROV_COUNT + 1) /* NW and SHM providers */
+#define ATL_OFI_MAX_ACTIVE_PROV_COUNT \
+    2 /* in the current scheme each EP may use at most SHM + 1 NW prov */
+#define ATL_OFI_SHM_PROV_NAME "shm"
 
 #ifndef PRId64
 #define PRId64 "lld"
@@ -150,6 +159,7 @@ typedef struct {
 } atl_ofi_prov_ep_t;
 
 typedef struct {
+    size_t idx;
     struct fi_info* info;
     struct fid_fabric* fabric;
     struct fid_domain* domain;
@@ -167,11 +177,15 @@ typedef struct {
     fi_addr_t* addr_table;
     size_t addr_len;
     int first_proc_idx;
-
 } atl_ofi_prov_t;
 
 typedef struct {
     atl_ep_t ep;
+
+    /* used to make progressing only for really used providers */
+    size_t active_prov_count;
+    size_t active_prov_idxs[ATL_OFI_MAX_ACTIVE_PROV_COUNT];
+
 } atl_ofi_ep_t;
 
 typedef struct {
@@ -179,10 +193,13 @@ typedef struct {
     pm_rt_desc_t* pm_rt;
     atl_ofi_prov_t provs[ATL_OFI_MAX_PROV_COUNT];
     size_t prov_count;
+    size_t nw_prov_count;
+    size_t nw_prov_first_idx;
     size_t shm_prov_idx;
-    size_t nw_prov_idx;
     size_t max_retry_count;
     atl_progress_mode_t progress_mode;
+    atl_mnic_t mnic_type;
+    size_t mnic_count;
 } atl_ofi_ctx_t;
 
 typedef struct {
@@ -196,9 +213,10 @@ typedef struct {
 typedef struct atl_ofi_global_data {
     size_t ctx_count;
     int is_env_inited;
+    void* dlhandle;
     char prov_env_copy[ATL_OFI_MAX_PROV_ENV_LEN];
 
-    atl_ofi_global_data() : ctx_count(0), is_env_inited(0) {
+    atl_ofi_global_data() : ctx_count(0), is_env_inited(0), dlhandle(NULL) {
         memset(prov_env_copy, 0, sizeof(prov_env_copy));
     }
 } atl_ofi_global_data_t;
@@ -217,28 +235,48 @@ static void atl_ofi_print_coord(atl_proc_coord_t* coord) {
               "]");
 }
 
+static std::string atl_ofi_get_nic_name(const struct fi_info* prov) {
+    std::stringstream ss;
+    ss << prov->fabric_attr->prov_name << ":" << prov->domain_attr->name;
+    return ss.str();
+}
+
 static inline atl_ofi_prov_t* atl_ofi_get_prov(atl_ep_t* ep, int peer_proc_idx, size_t msg_size) {
     size_t prov_idx;
     atl_ofi_ctx_t* ofi_ctx = container_of(ep->ctx, atl_ofi_ctx_t, ctx);
 
-    if (ofi_ctx->prov_count == 1) {
-        prov_idx = 0;
-    }
-    else {
-        CCL_THROW_IF_NOT(ofi_ctx->prov_count == ATL_OFI_MAX_PROV_COUNT,
-                         "unexpected prov_count ",
-                         ofi_ctx->prov_count);
+    CCL_THROW_IF_NOT(ofi_ctx->prov_count <= ATL_OFI_MAX_PROV_COUNT,
+                     "unexpected prov_count ",
+                     ofi_ctx->prov_count);
+
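+    /* node indices are derived from the global rank, assuming a uniform local_count per node */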
+    atl_proc_coord_t* coord = &(ep->ctx->coord);
+    int my_node_idx = coord->global_idx / coord->local_count;
+    int peer_node_idx = peer_proc_idx / coord->local_count;
 
-        atl_proc_coord_t* coord = &(ep->ctx->coord);
-        int my_node_idx = coord->global_idx / coord->local_count;
-        int peer_node_idx = peer_proc_idx / coord->local_count;
+    int has_shm = (ofi_ctx->prov_count == ofi_ctx->nw_prov_count + 1) ? 1 : 0;
 
-        if ((my_node_idx == peer_node_idx) &&
-            (msg_size <= ofi_ctx->provs[ofi_ctx->shm_prov_idx].max_msg_size))
-            prov_idx = ofi_ctx->shm_prov_idx;
-        else
-            prov_idx = ofi_ctx->nw_prov_idx;
+    if (has_shm && (my_node_idx == peer_node_idx) &&
+        (msg_size <= ofi_ctx->provs[ofi_ctx->shm_prov_idx].max_msg_size)) {
+        prov_idx = ofi_ctx->shm_prov_idx;
     }
+    else {
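+        /* round-robin endpoints across NW providers: prov_idx = first_idx + (ep_idx % nw_prov_count) */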
+        size_t nw_prov_offset = ep->idx % ofi_ctx->nw_prov_count;
+        prov_idx = ofi_ctx->nw_prov_first_idx + nw_prov_offset;
+    }
+
+    LOG_DEBUG("get_prov: ep_idx ",
+              ep->idx,
+              ", prov_idx ",
+              prov_idx,
+              ", my_node_idx ",
+              my_node_idx,
+              ", peer_node_idx ",
+              peer_node_idx,
+              ", msg_size ",
+              msg_size,
+              ", has_shm ",
+              has_shm);
 
     /* TODO: add segmentation logic */
     CCL_THROW_IF_NOT(msg_size <= ofi_ctx->provs[prov_idx].max_msg_size,
@@ -377,7 +415,7 @@ static atl_status_t atl_ofi_prov_update_addr_table(atl_ofi_ctx_t* ofi_ctx,
         return ATL_STATUS_SUCCESS;
 
     LOG_DEBUG("name ",
-              prov->info->fabric_attr->prov_name,
+              atl_ofi_get_nic_name(prov->info),
               ", is_shm ",
               prov->is_shm,
               ", addr_len ",
@@ -791,7 +829,7 @@ static atl_status_t atl_ofi_prov_ep_init(atl_ofi_prov_t* prov, size_t ep_idx) {
 
 err:
     atl_ofi_prov_ep_destroy(prov, ep);
-    return RET2ATL(ret);
+    return ATL_STATUS_FAILURE;
 }
 
 static atl_status_t atl_ofi_try_to_drain_cq_err(struct fid_cq* cq) {
@@ -903,7 +941,7 @@ static atl_status_t atl_ofi_adjust_env(const atl_attr_t& attr) {
         memcpy(global_data.prov_env_copy, prov_env, strlen(prov_env));
     }
 
-    if (attr.enable_shm) {
+    if (attr.in.enable_shm) {
         /* add shm provider in the list of allowed providers */
         if (prov_env && !strstr(prov_env, ATL_OFI_SHM_PROV_NAME)) {
             /* whether single provider will be in the final env variable */
@@ -925,7 +963,7 @@ static atl_status_t atl_ofi_adjust_env(const atl_attr_t& attr) {
                 snprintf(prov_env_new, prov_env_new_size, "%s,%s", prov_env, ATL_OFI_SHM_PROV_NAME);
             }
 
-            LOG_INFO("ATL/SHM is requested, modify FI_PROVIDER: old value: ",
+            LOG_INFO("atl-ofi-shm is requested, modify FI_PROVIDER: old value: ",
                      prov_env,
                      ", new value: ",
                      prov_env_new);
@@ -951,6 +989,23 @@ static atl_status_t atl_ofi_set_env(const atl_attr_t& attr) {
     setenv("HFI_NO_CPUAFFINITY", "1", 0);
     setenv("PSM2_MULTI_EP", "1", 0);
 
+    setenv("FI_PSM3_DELAY", "0", 0);
+    setenv("FI_PSM3_TIMEOUT", "0", 0);
+    setenv("FI_PSM3_LOCK_LEVEL", "1", 0);
+    setenv("FI_PSM3_NAME_SERVER", "0", 0);
+    setenv("PSM3_NO_CPUAFFINITY", "1", 0);
+    setenv("PSM3_RDMA", "2", 0);
+    setenv("PSM3_MR_CACHE_MODE", "0", 0); //TODO temporary
+    setenv("PSM3_MULTI_EP", "1", 0);
+    if (attr.in.mnic_type == ATL_MNIC_NONE)
+        setenv("PSM3_NIC", "any", 0);
+
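+    /* reuse the Hydra launcher job UUID for the PSM2/PSM3 providers, if available */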
+    char* hydra_uuid_env = getenv("I_MPI_HYDRA_UUID");
+    if (hydra_uuid_env) {
+        setenv("FI_PSM2_UUID", hydra_uuid_env, 0);
+        setenv("FI_PSM3_UUID", hydra_uuid_env, 0);
+    }
+
     setenv("FI_OFI_RXM_USE_HASH", "0", 0);
     setenv("FI_OFI_RXM_RX_SIZE", "8192", 0);
     setenv("FI_OFI_RXM_TX_SIZE", "8192", 0);
@@ -960,8 +1015,22 @@ static atl_status_t atl_ofi_set_env(const atl_attr_t& attr) {
     setenv("FI_SHM_TX_SIZE", "8192", 0);
     setenv("FI_SHM_RX_SIZE", "8192", 0);
 
+#ifdef CCL_ENABLE_SYCL
+    setenv("FI_SHM_DISABLE_CMA", "1", 0);
+#endif /* CCL_ENABLE_SYCL */
+
     atl_ofi_adjust_env(attr);
 
+    /*
+       load libfabric symbols into the global namespace
+       to work around undefined symbol issues with
+       out-of-tree providers, like OFI/PSM3
+    */
+    global_data.dlhandle = dlopen("libfabric.so", RTLD_GLOBAL | RTLD_NOW);
+    if (global_data.dlhandle == NULL) {
+        CCL_THROW("dlopen (libfabric.so): ", dlerror());
+    }
+
     global_data.is_env_inited = 1;
 
     return ATL_STATUS_SUCCESS;
@@ -989,6 +1058,14 @@ static atl_status_t atl_ofi_finalize(atl_ctx_t* ctx) {
     }
 
     if (global_data.ctx_count == 0) {
+        if (global_data.dlhandle) {
+            dlclose(global_data.dlhandle);
+        }
+
+        if (hwloc_is_initialized()) {
+            CCL_THROW_IF_NOT(hwloc_finalize() == HWLOC_SUCCESS, "failed to finalize hwloc");
+        }
+
         if (ctx->coord.global_idx == 0) {
             LOG_INFO("finalized last atl-ofi ctx");
         }
@@ -1420,17 +1497,14 @@ static inline atl_status_t atl_ofi_ep_progress(atl_ep_t* ep, atl_ofi_req_t* req
     ssize_t ret;
     size_t idx;
     struct fi_cq_tagged_entry entries[ATL_OFI_CQ_BUNCH_SIZE];
+    atl_ofi_ep_t* ofi_ep = container_of(ep, atl_ofi_ep_t, ep);
+    atl_ofi_ctx_t* ofi_ctx = container_of(ep->ctx, atl_ofi_ctx_t, ctx);
+    size_t ep_idx = ep->idx;
 
-    atl_ofi_ctx_t* ofi_ctx;
-    size_t ep_idx;
-
-    ofi_ctx = container_of(ep->ctx, atl_ofi_ctx_t, ctx);
-    ep_idx = ep->idx;
-
-    /* ensure progress for all providers */
-    for (idx = 0; idx < ofi_ctx->prov_count; idx++) {
+    /* ensure progress for all active providers */
+    for (idx = 0; idx < ofi_ep->active_prov_count; idx++) {
         atl_ofi_prov_ep_t* prov_ep;
-        prov_ep = &(ofi_ctx->provs[idx].eps[ep_idx]);
+        prov_ep = &(ofi_ctx->provs[ofi_ep->active_prov_idxs[idx]].eps[ep_idx]);
         do {
             ret = fi_cq_read(prov_ep->cq, entries, ATL_OFI_CQ_BUNCH_SIZE);
             if (ret > 0)
@@ -1513,27 +1587,297 @@ static atl_comp_ops_t atl_ofi_ep_comp_ops = { .wait = atl_ofi_ep_wait,
                                               .poll = atl_ofi_ep_poll,
                                               .check = atl_ofi_ep_check };
 
+static atl_status_t atl_ofi_get_prov_list(atl_ctx_t* ctx,
+                                          const char* prov_name,
+                                          struct fi_info* base_hints,
+                                          struct fi_info** out_prov_list) {
+    struct fi_info* hints = NULL;
+    struct fi_info* prov_list = NULL;
+    ssize_t ret = 0;
+    int fi_version = FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION);
+    const char* prov_name_str = (prov_name) ? prov_name : "<default>";
+
+    hints = fi_dupinfo(base_hints);
+    if (!hints) {
+        LOG_ERROR("fi_dupinfo error");
+        goto err;
+    }
+
+    *out_prov_list = NULL;
+
+    LOG_DEBUG("request providers with name: ", prov_name_str);
+
+    hints->fabric_attr->prov_name = (prov_name) ? strdup(prov_name) : NULL;
+
+    ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, hints, &prov_list);
+    if (ret || !prov_list) {
+        LOG_ERROR("fi_getinfo error: ret ", ret, ", providers ", (void*)prov_list);
+        goto err;
+    }
+
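+    /* first pass: probe the provider to learn how many tx/rx contexts it supports */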
+    if (prov_list->domain_attr->max_ep_tx_ctx > 1) {
+        hints->ep_attr->tx_ctx_cnt = ctx->ep_count;
+        hints->ep_attr->rx_ctx_cnt = ctx->ep_count;
+    }
+    else {
+        hints->ep_attr->tx_ctx_cnt = 1;
+        hints->ep_attr->rx_ctx_cnt = 1;
+    }
+
+    fi_freeinfo(prov_list);
+    prov_list = NULL;
+
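+    /* second pass: re-query with the desired tx/rx context counts in the hints */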
+    ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, hints, &prov_list);
+    if (ret || !prov_list) {
+        LOG_ERROR("fi_getinfo error, prov_name ", prov_name_str);
+        goto err;
+    }
+
+    fi_freeinfo(hints);
+    hints = NULL;
+
+    *out_prov_list = prov_list;
+    return ATL_STATUS_SUCCESS;
+
+err:
+    if (prov_list) {
+        fi_freeinfo(prov_list);
+    }
+    if (hints) {
+        fi_freeinfo(hints);
+    }
+    LOG_ERROR("can't create providers for name ", prov_name_str);
+    return ATL_STATUS_FAILURE;
+}
+
+static atl_status_t atl_ofi_prov_init(atl_ctx_t* ctx,
+                                      struct fi_info* info,
+                                      atl_ofi_prov_t* prov,
+                                      ipmi* pmi) {
+    struct fi_av_attr av_attr;
+    size_t ep_idx = 0;
+    ssize_t ret = 0;
+
+    memset(&av_attr, 0, sizeof(av_attr));
+
+    atl_ofi_ctx_t* ofi_ctx = container_of(ctx, atl_ofi_ctx_t, ctx);
+
+    if (ctx->coord.global_idx == 0) {
+        LOG_INFO("provider: ", info->fabric_attr->prov_name);
+        LOG_INFO("  nic: ", atl_ofi_get_nic_name(info));
+        LOG_INFO("  mr_mode: ", info->domain_attr->mr_mode);
+        LOG_INFO("  threading: ", info->domain_attr->threading);
+        LOG_INFO("  tx_ctx_cnt: ", info->domain_attr->tx_ctx_cnt);
+        LOG_INFO("  max_ep_tx_ctx: ", info->domain_attr->max_ep_tx_ctx);
+        LOG_INFO("  max_msg_size: ", info->ep_attr->max_msg_size);
+    }
+
+    prov->info = fi_dupinfo(info);
+
+    if (!prov->info) {
+        LOG_ERROR("fi_dupinfo error");
+        goto err;
+    }
+
+    prov->max_msg_size = info->ep_attr->max_msg_size;
+
+    ATL_OFI_CALL(fi_fabric(info->fabric_attr, &prov->fabric, NULL), ret, goto err);
+
+    ATL_OFI_CALL(fi_domain(prov->fabric, info, &prov->domain, NULL), ret, goto err);
+
+    av_attr.type = FI_AV_TABLE;
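+    /* reserve enough address bits to index each rx context of a scalable endpoint */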
+    av_attr.rx_ctx_bits = prov->rx_ctx_bits = (int)ceil(log2(prov->info->ep_attr->rx_ctx_cnt));
+
+    ATL_OFI_CALL(fi_av_open(prov->domain, &av_attr, &prov->av, NULL), ret, goto err);
+
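+    /* use a scalable endpoint (SEP) when the provider exposes multiple tx contexts */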
+    if (info->domain_attr->max_ep_tx_ctx > 1) {
+        ATL_OFI_CALL(fi_scalable_ep(prov->domain, info, &prov->sep, NULL), ret, goto err);
+        ATL_OFI_CALL(fi_scalable_ep_bind(prov->sep, &prov->av->fid, 0), ret, goto err);
+    }
+
+    prov->eps = (atl_ofi_prov_ep_t*)calloc(1, sizeof(atl_ofi_prov_ep_t) * ctx->ep_count);
+    if (!prov->eps) {
+        LOG_ERROR("can't allocate prov->eps");
+        goto err;
+    }
+
+    for (ep_idx = 0; ep_idx < ctx->ep_count; ep_idx++) {
+        ret = atl_ofi_prov_ep_init(prov, ep_idx);
+        if (ret) {
+            LOG_ERROR("atl_ofi_prov_ep_init error");
+            goto err;
+        }
+    }
+
+    if (prov->sep) {
+        fi_enable(prov->sep);
+    }
+
+    /* TODO: make separate function to be called on CCL comm creation */
+    ret = atl_ofi_prov_eps_connect(ofi_ctx, prov->idx, pmi);
+    if (ret) {
+        LOG_ERROR("atl_ofi_prov_eps_connect error, prov_idx ", prov->idx);
+        goto err;
+    }
+
+    return ATL_STATUS_SUCCESS;
+
+err:
+    LOG_ERROR("can't init provider ", atl_ofi_get_nic_name(info));
+    return ATL_STATUS_FAILURE;
+}
+
+/* determine whether this NIC is already included in the others list */
+static int atl_ofi_nic_already_used(const struct fi_info* prov,
+                                    struct fi_info** others,
+                                    size_t nic_count) {
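+    /* compare by PCI address when both NICs are on PCI; otherwise fall back to the domain name */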
+    for (size_t i = 0; i < nic_count; i++) {
+        if (prov->nic->bus_attr->bus_type == FI_BUS_PCI &&
+            others[i]->nic->bus_attr->bus_type == FI_BUS_PCI) {
+            struct fi_pci_attr pci = prov->nic->bus_attr->attr.pci;
+            struct fi_pci_attr other_pci = others[i]->nic->bus_attr->attr.pci;
+            LOG_DEBUG("compare nic ",
+                      prov->fabric_attr->prov_name,
+                      " pci ",
+                      (int)pci.domain_id,
+                      ":",
+                      (int)pci.bus_id,
+                      ":",
+                      (int)pci.device_id,
+                      ":",
+                      (int)pci.function_id,
+                      " with nic ",
+                      others[i]->fabric_attr->prov_name,
+                      " pci ",
+                      (int)other_pci.domain_id,
+                      ":",
+                      (int)other_pci.bus_id,
+                      ":",
+                      (int)other_pci.device_id,
+                      ":",
+                      (int)other_pci.function_id);
+            if (pci.domain_id == other_pci.domain_id && pci.bus_id == other_pci.bus_id &&
+                pci.device_id == other_pci.device_id && pci.function_id == other_pci.function_id)
+                return 1;
+        }
+        else {
+            LOG_DEBUG("compare nic ",
+                      atl_ofi_get_nic_name(prov),
+                      " with nic ",
+                      atl_ofi_get_nic_name(others[i]));
+            if (!strcmp(prov->domain_attr->name, others[i]->domain_attr->name))
+                return 1;
+        }
+    }
+    return 0;
+}
+
+/* return true if the NIC is bound to the same socket as the calling process */
+static int atl_ofi_is_nic_local(struct fi_info* info) {
+    if (info->nic->bus_attr->bus_type == FI_BUS_PCI) {
+        struct fi_pci_attr pci = info->nic->bus_attr->attr.pci;
+        return hwloc_is_dev_close_by_pci(pci.domain_id, pci.bus_id, pci.device_id, pci.function_id);
+    }
+    return 0;
+}
+
+static atl_status_t atl_ofi_open_nw_provs(atl_ctx_t* ctx, struct fi_info* base_hints, ipmi* pmi) {
+    struct fi_info* prov_list = NULL;
+    size_t idx = 0, prov_idx = 0;
+    char* prov_name = NULL;
+    atl_ofi_prov_t* prov = NULL;
+
+    atl_ofi_ctx_t* ofi_ctx = container_of(ctx, atl_ofi_ctx_t, ctx);
+
+    if (strlen(global_data.prov_env_copy) && !strstr(global_data.prov_env_copy, ","))
+        prov_name = global_data.prov_env_copy;
+    else
+        prov_name = NULL;
+
+    ATL_CALL(atl_ofi_get_prov_list(ctx, prov_name, base_hints, &prov_list), goto err);
+
+    if (ofi_ctx->mnic_type == ATL_MNIC_NONE) {
+        prov_idx = ofi_ctx->nw_prov_first_idx;
+        prov = &ofi_ctx->provs[prov_idx];
+        prov->idx = prov_idx;
+        prov->is_shm = 0;
+        ATL_CALL(atl_ofi_prov_init(ctx, prov_list, prov, pmi), goto err);
+        ofi_ctx->nw_prov_count++;
+    }
+    else {
+        /* filter the provider list down to unique NICs that match mnic_type, up to mnic_count */
+        struct fi_info* prov_iter = prov_list;
+        struct fi_info* filtered_prov_list[ATL_OFI_MAX_NW_PROV_COUNT];
+        size_t nic_count = 0;
+        struct fid_nic* nic = NULL;
+
+        while (prov_iter && (nic_count < ofi_ctx->mnic_count)) {
+            nic = prov_iter->nic;
+            if (nic) {
+                LOG_DEBUG("check nic ", atl_ofi_get_nic_name(prov_iter));
+                if (!atl_ofi_nic_already_used(prov_iter, filtered_prov_list, nic_count)) {
+                    int is_local = atl_ofi_is_nic_local(prov_iter);
+                    LOG_DEBUG("nic ", atl_ofi_get_nic_name(prov_iter), ", is_local ", is_local);
+
+                    if (ofi_ctx->mnic_type == ATL_MNIC_GLOBAL ||
+                        (ofi_ctx->mnic_type == ATL_MNIC_LOCAL && is_local)) {
+                        LOG_INFO("found suitable nic ", atl_ofi_get_nic_name(prov_iter));
+                        filtered_prov_list[nic_count] = fi_dupinfo(prov_iter);
+                        nic_count++;
+                    }
+                }
+                else {
+                    LOG_DEBUG("nic ", atl_ofi_get_nic_name(prov_iter), " already used");
+                }
+            }
+            prov_iter = prov_iter->next;
+        }
+
+        if (nic_count == 0) {
+            LOG_INFO("can not find nic(s) according to mnic_type ",
+                     ofi_ctx->mnic_type,
+                     ", use first available nic ",
+                     atl_ofi_get_nic_name(prov_list));
+            ofi_ctx->nw_prov_count = 1;
+            filtered_prov_list[0] = fi_dupinfo(prov_list);
+        }
+        else {
+            LOG_INFO("found ", nic_count, " nic(s) according to mnic_type ", ofi_ctx->mnic_type);
+            ofi_ctx->nw_prov_count = nic_count;
+        }
+
+        for (idx = 0; idx < ofi_ctx->nw_prov_count; idx++) {
+            prov_idx = ofi_ctx->nw_prov_first_idx + idx;
+            prov = &ofi_ctx->provs[prov_idx];
+            prov->idx = prov_idx;
+            prov->is_shm = 0;
+            ATL_CALL(atl_ofi_prov_init(ctx, filtered_prov_list[idx], prov, pmi), goto err);
+        }
+
+        for (idx = 0; idx < ofi_ctx->nw_prov_count; idx++) {
+            fi_freeinfo(filtered_prov_list[idx]);
+        }
+    }
+    ofi_ctx->prov_count += ofi_ctx->nw_prov_count;
+
+    fi_freeinfo(prov_list);
+
+    return ATL_STATUS_SUCCESS;
+
+err:
+    if (prov_list) {
+        fi_freeinfo(prov_list);
+    }
+    LOG_ERROR("cannot open network providers");
+    return ATL_STATUS_FAILURE;
+}
+
 static atl_status_t atl_ofi_init(int* argc,
                                  char*** argv,
                                  atl_attr_t* attr,
                                  atl_ctx_t** out_ctx,
                                  const char* main_addr,
                                  ipmi* pmi) {
-    struct fi_info *providers, *base_hints, *prov_hints;
-    struct fi_av_attr av_attr;
+    struct fi_info *prov_list = NULL, *base_hints = NULL, *prov_hints = NULL;
     int fi_version;
-    ssize_t ret;
-    size_t idx, ep_idx;
-
-    providers = NULL;
-    base_hints = NULL;
-    prov_hints = NULL;
-
-    memset(&av_attr, 0, sizeof(av_attr));
-
-    ret = 0;
-    idx = 0;
-    ep_idx = 0;
+    ssize_t ret = 0;
+    size_t idx = 0, ep_idx = 0, prov_idx = 0;
+    char* prov_name = NULL;
+    atl_ofi_prov_t* prov = NULL;
+    char *max_retry_count_env = NULL, *progress_mode_env = NULL;
+    int open_nw_provs = 1;
+    int enable_shm = 0;
 
     CCL_THROW_IF_NOT((sizeof(atl_ofi_req_t) <= sizeof(atl_req_t) - offsetof(atl_req_t, internal)),
                      "unexpected offset: atl_ofi_request size ",
@@ -1561,8 +1905,8 @@ static atl_status_t atl_ofi_init(int* argc,
 
     ctx->ops = &atl_ofi_ops;
     ctx->mr_ops = &atl_ofi_mr_ops;
-    ctx->ep_count = attr->ep_count;
-    ctx->eps = (atl_ep**)calloc(1, sizeof(void*) * attr->ep_count);
+    ctx->ep_count = attr->in.ep_count;
+    ctx->eps = (atl_ep**)calloc(1, sizeof(void*) * attr->in.ep_count);
     if (!ctx->eps)
         goto err;
 
@@ -1609,233 +1953,78 @@ static atl_status_t atl_ofi_init(int* argc,
             goto err;
         }
 
-        if (!attr->enable_shm) {
+        if (!attr->in.enable_shm) {
             LOG_ERROR(
                 "shm provider is requested through FI_PROVIDER but not requested from CCL level");
             goto err;
         }
     }
+
     atl_ofi_print_coord(coord);
 
-    if (attr->enable_shm) {
+    enable_shm = attr->in.enable_shm;
+    if (enable_shm) {
         prov_hints = fi_dupinfo(base_hints);
         prov_hints->fabric_attr->prov_name = strdup(ATL_OFI_SHM_PROV_NAME);
-        ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, prov_hints, &providers);
-        if (ret || !providers) {
-            attr->enable_shm = 0;
+        ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, prov_hints, &prov_list);
+        if (ret || !prov_list) {
+            enable_shm = 0;
             LOG_INFO("shm provider is requested but not available");
         }
         else {
             LOG_INFO("shm provider is requested and available");
         }
 
-        fi_freeinfo(providers);
-        providers = NULL;
+        fi_freeinfo(prov_list);
+        prov_list = NULL;
 
         fi_freeinfo(prov_hints);
         prov_hints = NULL;
-
-        if (attr->enable_shm) {
-            /* TODO: tmp code to detect CMA, remove when OFI/shm will have runtime detection */
-            int scope = 0, fret;
-            FILE* file;
-            file = fopen("/proc/sys/kernel/yama/ptrace_scope", "r");
-            if (file) {
-                fret = fscanf(file, "%d", &scope);
-                if (fret != 1) {
-                    LOG_ERROR("error getting value from ptrace_scope");
-                    scope = 1;
-                }
-                fret = fclose(file);
-                if (fret) {
-                    LOG_ERROR("error closing ptrace_scope file");
-                    scope = 1;
-                }
-            }
-
-            if (!file && (errno != ENOENT)) {
-                LOG_ERROR("can't open ptrace_scope file, disable shm provider");
-                attr->enable_shm = 0;
-            }
-            else if (scope) {
-                LOG_ERROR("ptrace_scope > 0, disable shm provider");
-                attr->enable_shm = 0;
-            }
-        }
     }
 
-    attr->tag_bits = 64;
-    attr->max_tag = 0xFFFFFFFFFFFFFFFF;
-
-    if (coord->global_count == coord->local_count) {
-        ofi_ctx->prov_count = 1;
-        ofi_ctx->provs[0].is_shm = (attr->enable_shm) ? 1 : 0;
-    }
-    else {
-        if (attr->enable_shm) {
-            ofi_ctx->prov_count = 2;
-            ofi_ctx->shm_prov_idx = 0;
-            ofi_ctx->nw_prov_idx = 1;
-            ofi_ctx->provs[ofi_ctx->shm_prov_idx].is_shm = 1;
-            ofi_ctx->provs[ofi_ctx->nw_prov_idx].is_shm = 0;
-        }
-        else {
-            ofi_ctx->prov_count = 1;
-            ofi_ctx->provs[0].is_shm = 0;
+    ofi_ctx->prov_count = 0;
+    ofi_ctx->nw_prov_count = 0;
+    ofi_ctx->shm_prov_idx = 0;
+    ofi_ctx->nw_prov_first_idx = (enable_shm) ? 1 : 0;
+    ofi_ctx->mnic_type = attr->in.mnic_type;
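+    /* clamp mnic_count to the range [1, min(ATL_OFI_MAX_NW_PROV_COUNT, ep_count)] */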
+    ofi_ctx->mnic_count = std::min(attr->in.mnic_count, (size_t)(ATL_OFI_MAX_NW_PROV_COUNT));
+    ofi_ctx->mnic_count = std::min(ofi_ctx->mnic_count, attr->in.ep_count);
+    ofi_ctx->mnic_count = std::max(ofi_ctx->mnic_count, (size_t)(1));
+
+    if ((ofi_ctx->mnic_type != ATL_MNIC_NONE) && !hwloc_is_initialized()) {
+        hwloc_status_t hwloc_status = hwloc_init();
+        if (hwloc_status != HWLOC_SUCCESS) {
+            ofi_ctx->mnic_type = ATL_MNIC_NONE;
+            ofi_ctx->mnic_count = 1;
+            LOG_WARN("can't init hwloc, disable multi-nic");
         }
     }
 
-    if (attr->enable_rma && (ofi_ctx->prov_count > 1)) {
-        LOG_INFO("RMA and multiple providers requested both, disable RMA");
-        attr->enable_rma = 0;
+    /* open SHM provider */
+    if (enable_shm) {
+        prov_idx = ofi_ctx->shm_prov_idx;
+        prov_name = strdup(ATL_OFI_SHM_PROV_NAME);
+        prov = &ofi_ctx->provs[prov_idx];
+        prov->idx = prov_idx;
+        prov->is_shm = 1;
+        ATL_CALL(atl_ofi_get_prov_list(ctx, prov_name, base_hints, &prov_list), goto err);
+        ATL_CALL(atl_ofi_prov_init(ctx, prov_list, prov, pmi), goto err);
+        free(prov_name);
+        fi_freeinfo(prov_list);
+        ofi_ctx->prov_count++;
     }
 
-    if (coord->global_idx == 0)
-        LOG_INFO("prov_count ", ofi_ctx->prov_count);
-
-    for (idx = 0; idx < ofi_ctx->prov_count; idx++) {
-        atl_ofi_prov_t* prov;
-        prov = &ofi_ctx->provs[idx];
-
-        prov_hints = fi_dupinfo(base_hints);
-
-        char* prov_name = NULL;
-
-        if (prov->is_shm)
-            prov_name = strdup(ATL_OFI_SHM_PROV_NAME);
-        else {
-            if (strlen(global_data.prov_env_copy) && !strstr(global_data.prov_env_copy, ","))
-                prov_name = strdup(global_data.prov_env_copy);
-            else
-                prov_name = NULL;
-        }
-
-        LOG_DEBUG("request provider: idx ",
-                  idx,
-                  ", name ",
-                  (prov_name) ? prov_name : "<default>",
-                  ", is_shm ",
-                  prov->is_shm);
-
-        prov_hints->fabric_attr->prov_name = prov_name;
-
-        ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, prov_hints, &providers);
-
-        if (ret || !providers) {
-            LOG_ERROR(
-                "fi_getinfo erro: ret ", ret, ", providers ", (void*)providers, ", prov_idx ", idx);
-            goto err;
-        }
-
-        if (providers->domain_attr->max_ep_tx_ctx > 1) {
-            prov_hints->ep_attr->tx_ctx_cnt = attr->ep_count;
-            prov_hints->ep_attr->rx_ctx_cnt = attr->ep_count;
-        }
-        else {
-            prov_hints->ep_attr->tx_ctx_cnt = 1;
-            prov_hints->ep_attr->rx_ctx_cnt = 1;
-        }
-
-        fi_freeinfo(providers);
-        providers = NULL;
-
-        if (attr->enable_rma) {
-            LOG_INFO("try to enable RMA");
-            prov_hints->caps |= FI_RMA | FI_READ | FI_WRITE | FI_REMOTE_READ | FI_REMOTE_WRITE;
-            prov_hints->domain_attr->mr_mode = FI_MR_UNSPEC;
-            // TODO:
-            //hints->domain_attr->mr_mode = FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR | FI_MR_LOCAL | FI_MR_BASIC;
-        }
-
-        ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, prov_hints, &providers);
-        if (ret || !providers) {
-            if (attr->enable_rma) {
-                attr->enable_rma = 0;
-                LOG_INFO("try without RMA");
-                prov_hints->caps = FI_TAGGED;
-                prov_hints->domain_attr->mr_mode = FI_MR_UNSPEC;
-                ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, prov_hints, &providers);
-                if (ret || !providers) {
-                    LOG_ERROR("fi_getinfo error (rma fallback), prov_idx ", idx);
-                    goto err;
-                }
-            }
-            else {
-                LOG_ERROR("fi_getinfo error (main path), prov_idx ", idx);
-                goto err;
-            }
-        }
-
-        /* use first provider from the list of providers */
-        prov->info = fi_dupinfo(providers);
-        struct fi_info* prov_info;
-        prov_info = prov->info;
-        if (!prov_info) {
-            LOG_ERROR("fi_dupinfo error");
-            goto err;
-        }
-
-        fi_freeinfo(providers);
-        providers = NULL;
-
-        attr->max_order_waw_size =
-            (idx == 0) ? prov_info->ep_attr->max_order_waw_size
-                       : MIN(attr->max_order_waw_size, prov_info->ep_attr->max_order_waw_size);
-
-        prov->max_msg_size = prov_info->ep_attr->max_msg_size;
-
-        if (coord->global_idx == 0) {
-            LOG_INFO("provider: ", prov_info->fabric_attr->prov_name);
-            LOG_INFO("  mr_mode: ", prov_info->domain_attr->mr_mode);
-            LOG_INFO("  threading: ", prov_info->domain_attr->threading);
-            LOG_INFO("  tx_ctx_cnt: ", prov_info->domain_attr->tx_ctx_cnt);
-            LOG_INFO("  max_ep_tx_ctx: ", prov_info->domain_attr->max_ep_tx_ctx);
-            LOG_INFO("  max_msg_size: ", prov_info->ep_attr->max_msg_size);
-        }
-
-        ATL_OFI_CALL(fi_fabric(prov_info->fabric_attr, &prov->fabric, NULL), ret, goto err);
-
-        ATL_OFI_CALL(fi_domain(prov->fabric, prov_info, &prov->domain, NULL), ret, goto err);
-
-        av_attr.type = FI_AV_TABLE;
-        av_attr.rx_ctx_bits = prov->rx_ctx_bits = (int)ceil(log2(prov_hints->ep_attr->rx_ctx_cnt));
-
-        ATL_OFI_CALL(fi_av_open(prov->domain, &av_attr, &prov->av, NULL), ret, goto err);
-
-        if (prov_info->domain_attr->max_ep_tx_ctx > 1) {
-            ATL_OFI_CALL(fi_scalable_ep(prov->domain, prov_info, &prov->sep, NULL), ret, goto err);
-            ATL_OFI_CALL(fi_scalable_ep_bind(prov->sep, &prov->av->fid, 0), ret, goto err);
-        }
-
-        prov->eps = (atl_ofi_prov_ep_t*)calloc(1, sizeof(atl_ofi_prov_ep_t) * attr->ep_count);
-        if (!prov->eps) {
-            LOG_ERROR("can't allocate prov->eps");
-            goto err;
-        }
-
-        for (ep_idx = 0; ep_idx < attr->ep_count; ep_idx++) {
-            ret = atl_ofi_prov_ep_init(prov, ep_idx);
-            if (ret) {
-                LOG_ERROR("atl_ofi_prov_ep_init error");
-                goto err;
-            }
-        }
-
-        if (prov->sep) {
-            fi_enable(prov->sep);
-        }
-
-        ret = atl_ofi_prov_eps_connect(ofi_ctx, idx, pmi);
-        if (ret) {
-            LOG_ERROR("atl_ofi_prov_eps_connect error, prov_idx ", idx);
-            goto err;
-        }
+    /* open NW provider(s) */
+    if (prov_env && !strcmp(prov_env, ATL_OFI_SHM_PROV_NAME) && enable_shm) {
+        open_nw_provs = 0;
+    }
 
-        fi_freeinfo(prov_hints);
-        prov_hints = NULL;
-    } /* prov loop */
+    if (open_nw_provs) {
+        ATL_CALL(atl_ofi_open_nw_provs(ctx, base_hints, pmi), goto err);
+        ofi_ctx->mnic_count = ofi_ctx->nw_prov_count;
+    }
 
-    for (ep_idx = 0; ep_idx < attr->ep_count; ep_idx++) {
+    for (ep_idx = 0; ep_idx < ctx->ep_count; ep_idx++) {
         atl_ofi_ep_t* ofi_ep;
         ofi_ep = (atl_ofi_ep_t*)calloc(1, sizeof(atl_ofi_ep_t));
         if (!ofi_ep) {
@@ -1852,12 +2041,31 @@ static atl_status_t atl_ofi_init(int* argc,
         ep->rma_ops = &atl_ofi_ep_rma_ops;
         ep->comp_ops = &atl_ofi_ep_comp_ops;
 
+        ofi_ep->active_prov_count = 0;
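+        /* each endpoint drives SHM (if enabled) plus exactly one NW provider */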
+        if (enable_shm) {
+            ofi_ep->active_prov_idxs[ofi_ep->active_prov_count] = ofi_ctx->shm_prov_idx;
+            ofi_ep->active_prov_count++;
+        }
+        if (open_nw_provs) {
+            ofi_ep->active_prov_idxs[ofi_ep->active_prov_count] =
+                ofi_ctx->nw_prov_first_idx + ep_idx % ofi_ctx->nw_prov_count;
+            ofi_ep->active_prov_count++;
+        }
+        CCL_THROW_IF_NOT(ofi_ep->active_prov_count, "no active providers for ep_idx ", ep_idx);
+
+        if (coord->global_idx == 0) {
+            std::stringstream ss;
+            for (idx = 0; idx < ofi_ep->active_prov_count; idx++) {
+                ss << ofi_ep->active_prov_idxs[idx] << " ";
+            }
+            LOG_INFO("ep_idx: ", ep_idx, ", active_prov_idxs: ", ss.str());
+        }
+
         ctx->eps[ep_idx] = ep;
     }
 
     pmi->pmrt_barrier();
 
-    char* max_retry_count_env;
     max_retry_count_env = getenv(ATL_OFI_MAX_RETRY_COUNT_ENV);
     if (max_retry_count_env) {
         ofi_ctx->max_retry_count = safe_c_strtol(max_retry_count_env, NULL, 10);
@@ -1873,7 +2081,6 @@ static atl_status_t atl_ofi_init(int* argc,
         ofi_ctx->progress_mode = ATL_PROGRESS_POLL;
     }
 
-    char* progress_mode_env;
     progress_mode_env = getenv(ATL_PROGRESS_MODE_ENV);
     if (progress_mode_env) {
         ofi_ctx->progress_mode = static_cast<atl_progress_mode_t>(atoi(progress_mode_env));
@@ -1882,6 +2089,12 @@ static atl_status_t atl_ofi_init(int* argc,
     if (coord->global_idx == 0) {
         LOG_INFO("atl-ofi-ctx:");
         LOG_INFO("  new ctx_count: ", global_data.ctx_count);
+        LOG_INFO("  prov_count: ", ofi_ctx->prov_count);
+        LOG_INFO("  nw_prov_count: ", ofi_ctx->nw_prov_count);
+        LOG_INFO("  nw_prov_first_idx: ", ofi_ctx->nw_prov_first_idx);
+        LOG_INFO("  mnic_type: ", ofi_ctx->mnic_type);
+        if (ofi_ctx->mnic_type != ATL_MNIC_NONE)
+            LOG_INFO("  mnic_count: ", ofi_ctx->mnic_count);
         LOG_INFO("  max_retry_count: ", ofi_ctx->max_retry_count);
         LOG_INFO("  progress_mode: ", ofi_ctx->progress_mode);
     }
@@ -1891,13 +2104,23 @@ static atl_status_t atl_ofi_init(int* argc,
     fi_freeinfo(base_hints);
     base_hints = NULL;
 
+    /* report actual attributes back to upper level */
+    attr->out.enable_shm = enable_shm;
+    attr->out.enable_rma = 0;
+    attr->out.enable_device_buf = 0;
+    attr->out.mnic_type = ofi_ctx->mnic_type;
+    attr->out.mnic_count = ofi_ctx->mnic_count;
+    attr->out.tag_bits = 64;
+    attr->out.max_tag = 0xFFFFFFFFFFFFFFFF;
+    attr->out.max_order_waw_size = 0;
+
     return ATL_STATUS_SUCCESS;
 
 err:
     LOG_ERROR("can't find suitable provider");
 
-    if (providers) {
-        fi_freeinfo(providers);
+    if (prov_list) {
+        fi_freeinfo(prov_list);
     }
 
     if (base_hints) {
@@ -1911,7 +2134,7 @@ static atl_status_t atl_ofi_init(int* argc,
     if (ctx != NULL)
         atl_ofi_finalize(ctx);
 
-    return RET2ATL(ret);
+    return ATL_STATUS_FAILURE;
 }
 
 atl_status_t atl_ofi_main_addr_reserve(char* main_addr) {
diff --git a/src/atl/util/pm/pmi_rt/pmi/CMakeLists.txt b/src/atl/util/pm/pmi_rt/pmi/CMakeLists.txt
old mode 100755
new mode 100644
diff --git a/src/ccl_api_functions.cpp b/src/ccl_api_functions.cpp
index a8979efa9..280a720b0 100644
--- a/src/ccl_api_functions.cpp
+++ b/src/ccl_api_functions.cpp
@@ -53,12 +53,6 @@ void register_gpu_module(std::string kernels_path) {
 
     LOG_INFO("SPIRV kernels directory: ", kernels_path);
 
-    /*
-     * TODO:
-     * Important: Fix kernels data types generations, then uncoment
-     * the registration module.
-     */
-
     load_gpu_module(
         kernels_path + "ring_allgatherv.spv", ccl::device_topology_type::ring, ccl_coll_allgatherv);
     load_gpu_module(
@@ -214,7 +208,6 @@ event allgatherv(const void* send_buf,
                  const stream& op_stream,
                  const allgatherv_attr& attr,
                  const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->allgatherv(
         send_buf, send_count, recv_buf, recv_counts, dtype, disp(op_stream), attr, deps);
@@ -228,7 +221,6 @@ event allgatherv(const void* send_buf,
                  const communicator& comm,
                  const allgatherv_attr& attr,
                  const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->allgatherv(
         send_buf, send_count, recv_buf, recv_counts, dtype, disp(default_stream), attr, deps);
@@ -243,7 +235,6 @@ event allgatherv(const void* send_buf,
                  const stream& op_stream,
                  const allgatherv_attr& attr,
                  const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->allgatherv(
         send_buf, send_count, recv_bufs, recv_counts, dtype, disp(op_stream), attr, deps);
@@ -257,7 +248,6 @@ event allgatherv(const void* send_buf,
                  const communicator& comm,
                  const allgatherv_attr& attr,
                  const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->allgatherv(
         send_buf, send_count, recv_bufs, recv_counts, dtype, disp(default_stream), attr, deps);
@@ -272,7 +262,6 @@ event allgatherv(const BufferType* send_buf,
                  const stream& op_stream,
                  const allgatherv_attr& attr,
                  const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->allgatherv(
         send_buf, send_count, recv_buf, recv_counts, disp(op_stream), attr, deps);
@@ -286,7 +275,6 @@ event allgatherv(const BufferType* send_buf,
                  const communicator& comm,
                  const allgatherv_attr& attr,
                  const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->allgatherv(
         send_buf, send_count, recv_buf, recv_counts, disp(default_stream), attr, deps);
@@ -301,7 +289,6 @@ event allgatherv(const BufferType* send_buf,
                  const stream& op_stream,
                  const allgatherv_attr& attr,
                  const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->allgatherv(
         send_buf, send_count, recv_bufs, recv_counts, disp(op_stream), attr, deps);
@@ -315,7 +302,6 @@ event allgatherv(const BufferType* send_buf,
                  const communicator& comm,
                  const allgatherv_attr& attr,
                  const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->allgatherv(
         send_buf, send_count, recv_bufs, recv_counts, disp(default_stream), attr, deps);
@@ -330,7 +316,6 @@ event allgatherv(const BufferObjectType& send_buf,
                  const stream& op_stream,
                  const allgatherv_attr& attr,
                  const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->allgatherv(
         send_buf, send_count, recv_buf, recv_counts, disp(op_stream), attr, deps);
@@ -344,7 +329,6 @@ event allgatherv(const BufferObjectType& send_buf,
                  const communicator& comm,
                  const allgatherv_attr& attr,
                  const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->allgatherv(
         send_buf, send_count, recv_buf, recv_counts, disp(default_stream), attr, deps);
@@ -359,7 +343,6 @@ event allgatherv(const BufferObjectType& send_buf,
                  const stream& op_stream,
                  const allgatherv_attr& attr,
                  const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->allgatherv(
         send_buf, send_count, recv_bufs, recv_counts, disp(op_stream), attr, deps);
@@ -373,7 +356,6 @@ event allgatherv(const BufferObjectType& send_buf,
                  const communicator& comm,
                  const allgatherv_attr& attr,
                  const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->allgatherv(
         send_buf, send_count, recv_bufs, recv_counts, disp(default_stream), attr, deps);
@@ -389,7 +371,6 @@ event allreduce(const void* send_buf,
                 const stream& op_stream,
                 const allreduce_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->allreduce(
         send_buf, recv_buf, count, dtype, reduction, disp(op_stream), attr, deps);
@@ -403,7 +384,6 @@ event allreduce(const void* send_buf,
                 const communicator& comm,
                 const allreduce_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->allreduce(
         send_buf, recv_buf, count, dtype, reduction, disp(default_stream), attr, deps);
@@ -418,7 +398,6 @@ event allreduce(const BufferType* send_buf,
                 const stream& op_stream,
                 const allreduce_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->allreduce(send_buf, recv_buf, count, reduction, disp(op_stream), attr, deps);
 }
@@ -431,7 +410,6 @@ event allreduce(const BufferType* send_buf,
                 const communicator& comm,
                 const allreduce_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->allreduce(
         send_buf, recv_buf, count, reduction, disp(default_stream), attr, deps);
@@ -446,7 +424,6 @@ event allreduce(const BufferObjectType& send_buf,
                 const stream& op_stream,
                 const allreduce_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->allreduce(send_buf, recv_buf, count, reduction, disp(op_stream), attr, deps);
 }
@@ -459,7 +436,6 @@ event allreduce(const BufferObjectType& send_buf,
                 const communicator& comm,
                 const allreduce_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->allreduce(
         send_buf, recv_buf, count, reduction, disp(default_stream), attr, deps);
@@ -474,7 +450,6 @@ event alltoall(const void* send_buf,
                const stream& op_stream,
                const alltoall_attr& attr,
                const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoall(send_buf, recv_buf, count, dtype, disp(op_stream), attr, deps);
 }
@@ -486,7 +461,6 @@ event alltoall(const void* send_buf,
                const communicator& comm,
                const alltoall_attr& attr,
                const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoall(send_buf, recv_buf, count, dtype, disp(default_stream), attr, deps);
 }
@@ -499,7 +473,6 @@ event alltoall(const vector_class<void*>& send_buf,
                const stream& op_stream,
                const alltoall_attr& attr,
                const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoall(send_buf, recv_buf, count, dtype, disp(op_stream), attr, deps);
 }
@@ -512,7 +485,6 @@ event alltoall(const BufferType* send_buf,
                const stream& op_stream,
                const alltoall_attr& attr,
                const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoall(send_buf, recv_buf, count, disp(op_stream), attr, deps);
 }
@@ -524,7 +496,6 @@ event alltoall(const BufferType* send_buf,
                const communicator& comm,
                const alltoall_attr& attr,
                const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoall(send_buf, recv_buf, count, disp(default_stream), attr, deps);
 }
@@ -537,7 +508,6 @@ event alltoall(const vector_class<BufferType*>& send_buf,
                const stream& op_stream,
                const alltoall_attr& attr,
                const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoall(send_buf, recv_buf, count, disp(op_stream), attr, deps);
 }
@@ -549,7 +519,6 @@ event alltoall(const vector_class<BufferType*>& send_buf,
                const communicator& comm,
                const alltoall_attr& attr,
                const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoall(send_buf, recv_buf, count, disp(default_stream), attr, deps);
 }
@@ -562,7 +531,6 @@ event alltoall(const BufferObjectType& send_buf,
                const stream& op_stream,
                const alltoall_attr& attr,
                const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoall(send_buf, recv_buf, count, disp(op_stream), attr, deps);
 }
@@ -574,7 +542,6 @@ event alltoall(const BufferObjectType& send_buf,
                const communicator& comm,
                const alltoall_attr& attr,
                const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoall(send_buf, recv_buf, count, disp(default_stream), attr, deps);
 }
@@ -587,7 +554,6 @@ event alltoall(const vector_class<reference_wrapper_class<BufferObjectType>>& se
                const stream& op_stream,
                const alltoall_attr& attr,
                const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoall(send_buf, recv_buf, count, disp(op_stream), attr, deps);
 }
@@ -599,7 +565,6 @@ event alltoall(const vector_class<reference_wrapper_class<BufferObjectType>>& se
                const communicator& comm,
                const alltoall_attr& attr,
                const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoall(send_buf, recv_buf, count, disp(default_stream), attr, deps);
 }
@@ -614,7 +579,6 @@ event alltoallv(const void* send_buf,
                 const stream& op_stream,
                 const alltoallv_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoallv(
         send_buf, send_counts, recv_buf, recv_counts, dtype, disp(op_stream), attr, deps);
@@ -628,7 +592,6 @@ event alltoallv(const void* send_buf,
                 const communicator& comm,
                 const alltoallv_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoallv(
         send_buf, send_counts, recv_buf, recv_counts, dtype, disp(default_stream), attr, deps);
@@ -643,7 +606,6 @@ event alltoallv(const vector_class<void*>& send_bufs,
                 const stream& op_stream,
                 const alltoallv_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoallv(
         send_bufs, send_counts, recv_bufs, recv_counts, dtype, disp(op_stream), attr, deps);
@@ -657,7 +619,6 @@ event alltoallv(const vector_class<void*>& send_bufs,
                 const communicator& comm,
                 const alltoallv_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoallv(
         send_bufs, send_counts, recv_bufs, recv_counts, dtype, disp(default_stream), attr, deps);
@@ -672,7 +633,6 @@ event alltoallv(const BufferType* send_buf,
                 const stream& op_stream,
                 const alltoallv_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoallv(
         send_buf, send_counts, recv_buf, recv_counts, disp(op_stream), attr, deps);
@@ -686,7 +646,6 @@ event alltoallv(const BufferType* send_buf,
                 const communicator& comm,
                 const alltoallv_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoallv(
         send_buf, send_counts, recv_buf, recv_counts, disp(default_stream), attr, deps);
@@ -701,7 +660,6 @@ event alltoallv(const vector_class<BufferType*>& send_bufs,
                 const stream& op_stream,
                 const alltoallv_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoallv(
         send_bufs, send_counts, recv_bufs, recv_counts, disp(op_stream), attr, deps);
@@ -715,7 +673,6 @@ event alltoallv(const vector_class<BufferType*>& send_bufs,
                 const communicator& comm,
                 const alltoallv_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoallv(
         send_bufs, send_counts, recv_bufs, recv_counts, disp(default_stream), attr, deps);
@@ -730,7 +687,6 @@ event alltoallv(const BufferObjectType& send_buf,
                 const stream& op_stream,
                 const alltoallv_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoallv(
         send_buf, send_counts, recv_buf, recv_counts, disp(op_stream), attr, deps);
@@ -744,7 +700,6 @@ event alltoallv(const BufferObjectType& send_buf,
                 const communicator& comm,
                 const alltoallv_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoallv(
         send_buf, send_counts, recv_buf, recv_counts, disp(default_stream), attr, deps);
@@ -759,7 +714,6 @@ event alltoallv(const vector_class<reference_wrapper_class<BufferObjectType>>& s
                 const stream& op_stream,
                 const alltoallv_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoallv(
         send_bufs, send_counts, recv_bufs, recv_counts, disp(op_stream), attr, deps);
@@ -773,7 +727,6 @@ event alltoallv(const vector_class<reference_wrapper_class<BufferObjectType>>& s
                 const communicator& comm,
                 const alltoallv_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->alltoallv(
         send_bufs, send_counts, recv_bufs, recv_counts, disp(default_stream), attr, deps);
@@ -784,13 +737,11 @@ event barrier(const communicator& comm,
               const stream& op_stream,
               const barrier_attr& attr,
               const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->barrier(disp(op_stream), attr, deps);
 }
 
 event barrier(const communicator& comm, const barrier_attr& attr, const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->barrier(disp(default_stream), attr, deps);
 }
@@ -804,7 +755,6 @@ event broadcast(void* buf,
                 const stream& op_stream,
                 const broadcast_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->bcast(buf, count, dtype, root, disp(op_stream), attr, deps);
 }
@@ -816,7 +766,6 @@ event broadcast(void* buf,
                 const communicator& comm,
                 const broadcast_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->bcast(buf, count, dtype, root, disp(default_stream), attr, deps);
 }
@@ -831,7 +780,6 @@ event broadcast(BufferType* buf,
                 const vector_class<event>& deps)
 
 {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->bcast(buf, count, root, disp(op_stream), attr, deps);
 }
@@ -845,7 +793,6 @@ event broadcast(BufferType* buf,
                 const vector_class<event>& deps)
 
 {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->bcast(buf, count, root, disp(default_stream), attr, deps);
 }
@@ -858,7 +805,6 @@ event broadcast(BufferObjectType& buf,
                 const stream& op_stream,
                 const broadcast_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->bcast(buf, count, root, disp(op_stream), attr, deps);
 }
@@ -870,7 +816,6 @@ event broadcast(BufferObjectType& buf,
                 const communicator& comm,
                 const broadcast_attr& attr,
                 const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->bcast(buf, count, root, disp(default_stream), attr, deps);
 }
@@ -886,7 +831,6 @@ event reduce(const void* send_buf,
              const stream& op_stream,
              const reduce_attr& attr,
              const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->reduce(
         send_buf, recv_buf, count, dtype, reduction, root, disp(op_stream), attr, deps);
@@ -901,7 +845,6 @@ event reduce(const void* send_buf,
              const communicator& comm,
              const reduce_attr& attr,
              const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->reduce(
         send_buf, recv_buf, count, dtype, reduction, root, disp(default_stream), attr, deps);
@@ -917,7 +860,6 @@ event reduce(const BufferType* send_buf,
              const stream& op_stream,
              const reduce_attr& attr,
              const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->reduce(
         send_buf, recv_buf, count, reduction, root, disp(op_stream), attr, deps);
@@ -932,7 +874,6 @@ event reduce(const BufferType* send_buf,
              const communicator& comm,
              const reduce_attr& attr,
              const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->reduce(
         send_buf, recv_buf, count, reduction, root, disp(default_stream), attr, deps);
@@ -948,7 +889,6 @@ event reduce(const BufferObjectType& send_buf,
              const stream& op_stream,
              const reduce_attr& attr,
              const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->reduce(
         send_buf, recv_buf, count, reduction, root, disp(op_stream), attr, deps);
@@ -963,7 +903,6 @@ event reduce(const BufferObjectType& send_buf,
              const communicator& comm,
              const reduce_attr& attr,
              const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->reduce(
         send_buf, recv_buf, count, reduction, root, disp(default_stream), attr, deps);
@@ -979,7 +918,6 @@ event reduce_scatter(const void* send_buf,
                      const stream& op_stream,
                      const reduce_scatter_attr& attr,
                      const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->reduce_scatter(
         send_buf, recv_buf, recv_count, dtype, reduction, disp(op_stream), attr, deps);
@@ -993,7 +931,6 @@ event reduce_scatter(const void* send_buf,
                      const communicator& comm,
                      const reduce_scatter_attr& attr,
                      const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->reduce_scatter(
         send_buf, recv_buf, recv_count, dtype, reduction, disp(default_stream), attr, deps);
@@ -1008,7 +945,6 @@ event reduce_scatter(const BufferType* send_buf,
                      const stream& op_stream,
                      const reduce_scatter_attr& attr,
                      const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->reduce_scatter(
         send_buf, recv_buf, recv_count, reduction, disp(op_stream), attr, deps);
@@ -1022,7 +958,6 @@ event reduce_scatter(const BufferType* send_buf,
                      const communicator& comm,
                      const reduce_scatter_attr& attr,
                      const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->reduce_scatter(
         send_buf, recv_buf, recv_count, reduction, disp(default_stream), attr, deps);
@@ -1037,7 +972,6 @@ event reduce_scatter(const BufferObjectType& send_buf,
                      const stream& op_stream,
                      const reduce_scatter_attr& attr,
                      const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->reduce_scatter(
         send_buf, recv_buf, recv_count, reduction, disp(op_stream), attr, deps);
@@ -1051,7 +985,6 @@ event reduce_scatter(const BufferObjectType& send_buf,
                      const communicator& comm,
                      const reduce_scatter_attr& attr,
                      const vector_class<event>& deps) {
-    CHECK_DEPS(deps);
     impl_dispatch disp;
     return disp(comm)->reduce_scatter(
         send_buf, recv_buf, recv_count, reduction, disp(default_stream), attr, deps);
@@ -1077,7 +1010,6 @@ ccl::event sparse_allreduce(const void* send_ind_buf,
                             const ccl::stream& op_stream,
                             const ccl::sparse_allreduce_attr& attr,
                             const ccl::vector_class<ccl::event>& deps) {
-    CHECK_DEPS(deps);
     ccl::impl_dispatch disp;
     return disp(comm)->sparse_allreduce(send_ind_buf,
                                         send_ind_count,
@@ -1109,7 +1041,6 @@ ccl::event sparse_allreduce(const void* send_ind_buf,
                             const ccl::communicator& comm,
                             const ccl::sparse_allreduce_attr& attr,
                             const ccl::vector_class<ccl::event>& deps) {
-    CHECK_DEPS(deps);
     ccl::impl_dispatch disp;
     return disp(comm)->sparse_allreduce(send_ind_buf,
                                         send_ind_count,
@@ -1141,7 +1072,6 @@ ccl::event sparse_allreduce(const IndexBufferType* send_ind_buf,
                             const ccl::stream& op_stream,
                             const ccl::sparse_allreduce_attr& attr,
                             const ccl::vector_class<ccl::event>& deps) {
-    CHECK_DEPS(deps);
     ccl::impl_dispatch disp;
     return disp(comm)->sparse_allreduce(send_ind_buf,
                                         send_ind_count,
@@ -1170,7 +1100,6 @@ ccl::event sparse_allreduce(const IndexBufferType* send_ind_buf,
                             const ccl::communicator& comm,
                             const ccl::sparse_allreduce_attr& attr,
                             const ccl::vector_class<ccl::event>& deps) {
-    CHECK_DEPS(deps);
     ccl::impl_dispatch disp;
     return disp(comm)->sparse_allreduce(send_ind_buf,
                                         send_ind_count,
@@ -1202,7 +1131,6 @@ ccl::event sparse_allreduce(const IndexBufferType* send_ind_buf,
 //                  const ccl::sparse_allreduce_attr& attr,
 //                  const ccl::vector_class<ccl::event>& deps)
 // {
-//     CHECK_DEPS(deps);
 //     ccl::impl_dispatch disp;
 //     return disp(comm)->sparse_allreduce(send_ind_buf, send_ind_count,
 //                                         send_val_buf, send_val_count,
@@ -1227,7 +1155,6 @@ ccl::event sparse_allreduce(const IndexBufferType* send_ind_buf,
 //                  const ccl::sparse_allreduce_attr& attr,
 //                  const ccl::vector_class<ccl::event>& deps)
 // {
-//     CHECK_DEPS(deps);
 //     ccl::impl_dispatch disp;
 //     return disp(comm)->sparse_allreduce(send_ind_buf, send_ind_count,
 //                                         send_val_buf, send_val_count,
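
Note on the CHECK_DEPS removals above: a non-empty deps vector is no longer
rejected up front; dependencies are instead forwarded into the operation (see
the copy_deps plumbing in src/coll/coll_param.cpp further down). A minimal
sketch of what this enables at the public API level, assuming a SYCL-enabled
build; buffer, communicator, and stream setup are elided as in any oneCCL
example:

    #include "oneapi/ccl.hpp"

    // Sketch only: chain two collectives through the deps argument.
    ccl::event run_chained(ccl::communicator& comm,
                           ccl::stream& stream,
                           float* buf_a,
                           float* buf_b,
                           size_t count) {
        auto attr = ccl::create_operation_attr<ccl::allreduce_attr>();

        ccl::event first =
            ccl::allreduce(buf_a, buf_a, count, ccl::reduction::sum, comm, stream, attr);

        // the second collective declares a dependency on the first;
        // previously a non-empty deps vector made CHECK_DEPS throw
        ccl::vector_class<ccl::event> deps;
        deps.push_back(std::move(first));
        return ccl::allreduce(buf_b, buf_b, count, ccl::reduction::sum, comm, stream, attr, deps);
    }
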
diff --git a/src/ccl_app_api_event.cpp b/src/ccl_app_api_event.cpp
index 2b0dba1e9..28c699c05 100644
--- a/src/ccl_app_api_event.cpp
+++ b/src/ccl_app_api_event.cpp
@@ -74,15 +74,6 @@ event CCL_API event::create_from_native(native_t& native_event) {
     return impl_value_t(new native_event_impl(std::move(ev)));
 }
 
-event CCL_API event::create_from_native(native_handle_t native_event_handle, context_t context) {
-    auto version = utils::get_library_version();
-
-    auto ev = std::unique_ptr<ccl_event>(new ccl_event(native_event_handle, context, version));
-    ev->build_from_params();
-
-    return impl_value_t(new native_event_impl(std::move(ev)));
-}
-
 } // namespace v1
 
 } // namespace ccl
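
With the handle-plus-context overload removed, create_from_native is only
callable with the native event object itself (the ccl::create_event helper,
visible in copy_deps below, wraps the same path). A short sketch, assuming a
SYCL build where event::native_t is sycl::event; header and namespace
spellings vary across DPC++ versions:

    #include <sycl/sycl.hpp>
    #include "oneapi/ccl.hpp"

    ccl::event wrap_native(sycl::queue& q) {
        sycl::event sev = q.submit([](sycl::handler& h) { h.single_task([] {}); });
        return ccl::event::create_from_native(sev);
    }
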
diff --git a/src/coll/algorithms/algorithm_utils.cpp b/src/coll/algorithms/algorithm_utils.cpp
new file mode 100644
index 000000000..98214594a
--- /dev/null
+++ b/src/coll/algorithms/algorithm_utils.cpp
@@ -0,0 +1,42 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/algorithms_enum.hpp"
+
+bool ccl_coll_type_is_reduction(ccl_coll_type ctype) {
+    switch (ctype) {
+        case ccl_coll_allreduce:
+        case ccl_coll_reduce:
+        case ccl_coll_reduce_scatter: return true;
+        default: return false;
+    }
+}
+
+const char* ccl_coll_type_to_str(ccl_coll_type type) {
+    switch (type) {
+        case ccl_coll_allgatherv: return "allgatherv";
+        case ccl_coll_allreduce: return "allreduce";
+        case ccl_coll_alltoall: return "alltoall";
+        case ccl_coll_alltoallv: return "alltoallv";
+        case ccl_coll_barrier: return "barrier";
+        case ccl_coll_bcast: return "bcast";
+        case ccl_coll_reduce: return "reduce";
+        case ccl_coll_reduce_scatter: return "reduce_scatter";
+        case ccl_coll_sparse_allreduce: return "sparse_allreduce";
+        case ccl_coll_internal: return "internal";
+        default: return "unknown";
+    }
+    return "unknown";
+}
diff --git a/src/coll/algorithms/algorithms_enum.hpp b/src/coll/algorithms/algorithms_enum.hpp
index 07e725e8d..7d52cbc30 100644
--- a/src/coll/algorithms/algorithms_enum.hpp
+++ b/src/coll/algorithms/algorithms_enum.hpp
@@ -16,6 +16,8 @@
 #pragma once
 #include "common/utils/enums.hpp"
 
+#include "oneapi/ccl/types.hpp"
+
 #define CCL_COLL_LIST \
     ccl_coll_allgatherv, ccl_coll_allreduce, ccl_coll_alltoall, ccl_coll_alltoallv, \
         ccl_coll_barrier, ccl_coll_bcast, ccl_coll_reduce, ccl_coll_reduce_scatter, \
@@ -117,6 +119,24 @@ enum ccl_coll_type {
     ccl_coll_last_value
 };
 
+// Currently ccl_coll_type is used in both compile-time and run-time contexts, so
+// we need both versions of the check.
+// A single constexpr function would also work, but it requires C++14 features
+// (e.g. multiple return statements in a constexpr function)
+
+template <ccl_coll_type ctype, class Enable = void>
+struct is_reduction_coll_type : std::false_type {};
+
+// Reduction types
+template <ccl_coll_type ctype>
+struct is_reduction_coll_type<
+    ctype,
+    typename std::enable_if<ctype == ccl_coll_allreduce || ctype == ccl_coll_reduce ||
+                            ctype == ccl_coll_reduce_scatter>::type> : std::true_type {};
+
+bool ccl_coll_type_is_reduction(ccl_coll_type ctype);
+const char* ccl_coll_type_to_str(ccl_coll_type type);
+
 #define CCL_COLL_TYPE_LIST \
     ccl_coll_type::ccl_coll_allgatherv, ccl_coll_type::ccl_coll_allreduce, \
         ccl_coll_type::ccl_coll_alltoall, ccl_coll_type::ccl_coll_alltoallv, \
@@ -124,23 +144,6 @@ enum ccl_coll_type {
         ccl_coll_type::ccl_coll_reduce, ccl_coll_type::ccl_coll_reduce_scatter, \
         ccl_coll_type::ccl_coll_sparse_allreduce
 
-inline const char* ccl_coll_type_to_str(ccl_coll_type type) {
-    switch (type) {
-        case ccl_coll_allgatherv: return "allgatherv";
-        case ccl_coll_allreduce: return "allreduce";
-        case ccl_coll_alltoall: return "alltoall";
-        case ccl_coll_alltoallv: return "alltoallv";
-        case ccl_coll_barrier: return "barrier";
-        case ccl_coll_bcast: return "bcast";
-        case ccl_coll_reduce: return "reduce";
-        case ccl_coll_reduce_scatter: return "reduce_scatter";
-        case ccl_coll_sparse_allreduce: return "sparse_allreduce";
-        case ccl_coll_internal: return "internal";
-        default: return "unknown";
-    }
-    return "unknown";
-}
-
 enum ccl_coll_reduction {
     sum,
     prod,
@@ -152,11 +155,10 @@ enum ccl_coll_reduction {
 };
 
 #define REDUCE_TYPES \
-    ccl_coll_reduction::sum, ccl_coll_reduction::prod, ccl_coll_reduction::min, \
-        ccl_coll_reduction::max /*, ccl_coll_reduction::custom*/
+    ccl::reduction::sum, ccl::reduction::prod, ccl::reduction::min, \
+        ccl::reduction::max /*, ccl::reduction::custom*/
 
-using ccl_coll_reductions = utils::enum_to_str<static_cast<int>(ccl_coll_reduction::last_value)>;
-inline const std::string reduction_to_str(ccl_coll_reduction reduction_type) {
-    return ccl_coll_reductions({ "sum", "prod", "min", "max" })
-        .choose(reduction_type, "INVALID_VALUE");
+using ccl_reductions = utils::enum_to_str<static_cast<int>(ccl::reduction::custom)>;
+inline const std::string reduction_to_str(ccl::reduction reduction_type) {
+    return ccl_reductions({ "sum", "prod", "min", "max" }).choose(reduction_type, "INVALID_VALUE");
 }
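
The trait and the free function answer the same question in different phases:
is_reduction_coll_type for compile-time template dispatch,
ccl_coll_type_is_reduction for values only known at run time. A short sketch,
assuming the internal header is on the include path:

    #include <cstdio>
    #include "coll/algorithms/algorithms_enum.hpp"

    // compile time: usable in static_assert / enable_if dispatch
    static_assert(is_reduction_coll_type<ccl_coll_allreduce>::value, "allreduce carries a reduction");
    static_assert(!is_reduction_coll_type<ccl_coll_bcast>::value, "bcast does not");

    // run time: for values produced during execution
    void log_coll(ccl_coll_type ctype) {
        printf("%s%s\n",
               ccl_coll_type_to_str(ctype),
               ccl_coll_type_is_reduction(ctype) ? " (reduction)" : "");
    }
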
diff --git a/src/coll/algorithms/allreduce/allreduce_rma.cpp b/src/coll/algorithms/allreduce/allreduce_rma.cpp
index a74c95d92..a91357f6d 100644
--- a/src/coll/algorithms/allreduce/allreduce_rma.cpp
+++ b/src/coll/algorithms/allreduce/allreduce_rma.cpp
@@ -354,7 +354,7 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
         entry->set_field_fn<ccl_sched_entry_field_dst_mr>(
             rma_ring_allreduce_get_remote_rs_dst_buf_mr, ar_handler);
 
-        if (block_count * dtype.size() > atl_wrapper::attr.max_order_waw_size)
+        if (block_count * dtype.size() > atl_wrapper::attr.out.max_order_waw_size)
             sched->add_barrier();
 
         entry = entry_factory::make_entry<write_entry>(
@@ -415,7 +415,7 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
         entry->set_field_fn<ccl_sched_entry_field_dst_mr>(rma_ring_allreduce_get_remote_recv_buf_mr,
                                                           ar_handler);
 
-        if (block_count * dtype.size() > atl_wrapper::attr.max_order_waw_size)
+        if (block_count * dtype.size() > atl_wrapper::attr.out.max_order_waw_size)
             sched->add_barrier();
 
         entry = entry_factory::make_entry<write_entry>(
diff --git a/src/coll/algorithms/alltoallv.cpp b/src/coll/algorithms/alltoallv.cpp
index f9675483b..a411440d7 100644
--- a/src/coll/algorithms/alltoallv.cpp
+++ b/src/coll/algorithms/alltoallv.cpp
@@ -40,6 +40,17 @@ ccl::status ccl_coll_build_direct_alltoallv(ccl_sched* sched,
     return ccl::status::success;
 }
 
+ccl::status ccl_coll_add_scatter_alltoallv_plain_barriers(std::vector<ccl_sched*>& scheds) {
+    if (ccl::global_data::env().alltoall_scatter_plain) {
+        ssize_t max_ops = ccl::global_data::env().alltoall_scatter_max_ops;
+        for (auto s : scheds) {
+            if (s->entries_count() % max_ops == 0)
+                s->add_barrier();
+        }
+    }
+    return ccl::status::success;
+}
+
 ccl::status ccl_coll_add_scatter_alltoallv_barriers(std::vector<ccl_sched*>& scheds,
                                                     size_t sched_idx) {
     ssize_t max_ops = ccl::global_data::env().alltoall_scatter_max_ops;
@@ -48,12 +59,7 @@ ccl::status ccl_coll_add_scatter_alltoallv_barriers(std::vector<ccl_sched*>& sch
         if (scheds[sched_idx]->entries_count() % max_ops == 0)
             scheds[sched_idx]->add_barrier();
 
-        if (ccl::global_data::env().alltoall_scatter_plain) {
-            for (auto s : scheds) {
-                if (s->entries_count() % max_ops == 0)
-                    s->add_barrier();
-            }
-        }
+        ccl_coll_add_scatter_alltoallv_plain_barriers(scheds);
     }
 
     return ccl::status::success;
@@ -277,8 +283,6 @@ ccl::status ccl_coll_build_scatter_alltoallv(ccl_master_sched* main_sched,
 
         entry_factory::make_chunked_recv_entry(
             scheds, sched_idx, recv_buf, recv_counts[src], dtype, src, comm);
-
-        ccl_coll_add_scatter_alltoallv_barriers(scheds, sched_idx);
     }
 
     for (int idx = 0; idx < comm_size; idx++) {
@@ -300,8 +304,6 @@ ccl::status ccl_coll_build_scatter_alltoallv(ccl_master_sched* main_sched,
                                                dtype,
                                                dst,
                                                comm);
-
-        ccl_coll_add_scatter_alltoallv_barriers(scheds, sched_idx);
     }
 
     if (!inplace)
@@ -345,6 +347,13 @@ ccl::status ccl_coll_build_scatter_barrier_alltoallv(ccl_master_sched* main_sche
     size_t total_send_count = 0, total_recv_count = 0;
     size_t total_send_bytes = 0, total_recv_bytes = 0;
 
+    ssize_t max_ops = ccl::global_data::env().alltoall_scatter_max_ops;
+    if (max_ops != CCL_ENV_SIZET_NOT_SPECIFIED) {
+        for (size_t idx = 0; idx < sched_count; idx++) {
+            scheds[idx]->flow_control.set_max_credits(max_ops);
+        }
+    }
+
     bool inplace =
         (coll_param.send_buf && (coll_param.send_buf == coll_param.recv_buf)) ? true : false;
 
@@ -419,8 +428,6 @@ ccl::status ccl_coll_build_scatter_barrier_alltoallv(ccl_master_sched* main_sche
 
         entry_factory::make_chunked_recv_entry(
             recv_scheds, sched_idx, recv_buf, recv_counts[src], dtype, src, comm);
-
-        ccl_coll_add_scatter_alltoallv_barriers(recv_scheds, sched_idx);
     }
 
     for (int idx = 0; idx < comm_size; idx++) {
@@ -442,8 +449,6 @@ ccl::status ccl_coll_build_scatter_barrier_alltoallv(ccl_master_sched* main_sche
                                                dtype,
                                                dst,
                                                comm);
-
-        ccl_coll_add_scatter_alltoallv_barriers(send_scheds, sched_idx);
     }
 
     if (!inplace)
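
Two throttling mechanisms are separated here: the plain-scatter path keeps
inserting explicit barriers every alltoall_scatter_max_ops entries (factored
into ccl_coll_add_scatter_alltoallv_plain_barriers), while the scatter-barrier
path now relies on per-schedule flow-control credits via set_max_credits. The
flow_control class itself is not shown in this diff; the sketch below only
illustrates the credit idea and is not the library's implementation:

    #include <cstddef>

    // Hypothetical credit counter: at most max_credits operations in flight;
    // posting an entry takes a credit, completion returns it.
    class flow_control_sketch {
        size_t max_credits;
        size_t credits;

    public:
        explicit flow_control_sketch(size_t n) : max_credits(n), credits(n) {}

        void set_max_credits(size_t n) {
            max_credits = credits = n;
        }

        // called before posting a send/recv entry
        bool try_take_credit() {
            if (credits == 0)
                return false; // caller must progress completions first
            --credits;
            return true;
        }

        // called when an entry completes
        void return_credit() {
            if (credits < max_credits)
                ++credits;
        }
    };
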
diff --git a/src/coll/algorithms/sparse_allreduce/sparse_allreduce.hpp b/src/coll/algorithms/sparse_allreduce/sparse_allreduce.hpp
index 2daba7276..4e24e172a 100644
--- a/src/coll/algorithms/sparse_allreduce/sparse_allreduce.hpp
+++ b/src/coll/algorithms/sparse_allreduce/sparse_allreduce.hpp
@@ -333,7 +333,8 @@ ccl::status sparse_reduce_ring(const void* ctx) {
     for (size_t idx = 0; idx < sa_handler->send_count[0]; idx++) {
         auto it = sa_handler->iv_map->find(rcv_i[idx]);
         if (it != sa_handler->iv_map->end()) {
-            ccl_comp_reduce((void*)(rcv_v + idx * sa_handler->val_dim_cnt),
+            ccl_comp_reduce(sa_handler->sched,
+                            (void*)(rcv_v + idx * sa_handler->val_dim_cnt),
                             sa_handler->val_dim_cnt,
                             snd_v + it->second[0],
                             nullptr,
@@ -548,7 +549,7 @@ ccl::status ccl_coll_build_sparse_allreduce_ring(ccl_sched* sched,
     /* get value dimension */
     size_t val_dim_cnt = send_val_count / send_ind_count;
 
-    CCL_ASSERT(recv_ind_buf && recv_ind_buf, "recv buffers are null");
+    CCL_ASSERT(recv_ind_buf && recv_val_buf, "recv buffers are null");
     CCL_ASSERT(recv_ind_count && recv_val_count, "recv counts are null");
 
     void** r_ind_buf = recv_ind_buf;
@@ -767,7 +768,7 @@ ccl::status ccl_coll_build_sparse_allreduce_mask(ccl_sched* sched,
     /* get value dimension */
     size_t val_dim_cnt = send_val_count / send_ind_count;
 
-    CCL_ASSERT(recv_ind_buf && recv_ind_buf, "recv buffers are null");
+    CCL_ASSERT(recv_ind_buf && recv_val_buf, "recv buffers are null");
     CCL_ASSERT(recv_ind_count && recv_val_count, "recv counts are null");
 
     void** r_ind_buf = recv_ind_buf;
@@ -1077,7 +1078,7 @@ ccl::status ccl_coll_build_sparse_allreduce_3_allgatherv(ccl_sched* sched,
     /* get value dimension */
     size_t val_dim_cnt = send_val_count / send_ind_count;
 
-    CCL_ASSERT(recv_ind_buf && recv_ind_buf, "recv buffers are null");
+    CCL_ASSERT(recv_ind_buf && recv_val_buf, "recv buffers are null");
     CCL_ASSERT(recv_ind_count && recv_val_count, "recv counts are null");
 
     void** r_ind_buf = recv_ind_buf;
diff --git a/src/coll/coll.cpp b/src/coll/coll.cpp
index 5e19b3901..fe1622f9e 100644
--- a/src/coll/coll.cpp
+++ b/src/coll/coll.cpp
@@ -13,6 +13,8 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
+#include <numeric>
+
 #include "oneapi/ccl/types.hpp"
 #include "oneapi/ccl/aliases.hpp"
 
@@ -45,10 +47,12 @@
 #include "coll/ccl_reduce_op_attr.hpp"
 #include "coll/ccl_reduce_scatter_op_attr.hpp"
 #include "coll/ccl_sparse_allreduce_op_attr.hpp"
+#include "coll/coll_param.hpp"
 
 #include "common/global/global.hpp"
 
 #include "coll/algorithms/algorithms.hpp"
+#include "coll/algorithms/algorithms_enum.hpp"
 #include "coll/algorithms/allreduce/allreduce_2d.hpp"
 #include "coll/algorithms/sparse_allreduce/sparse_allreduce.hpp"
 #include "coll/selection/selection.hpp"
@@ -116,8 +120,66 @@ ccl_coll_attr::ccl_coll_attr(const ccl::sparse_allreduce_attr& attr) {
     sparse_coalesce_mode = attr.get<ccl::sparse_allreduce_attr_id::coalesce_mode>();
 }
 
+static void ccl_coll_validate_and_adjust(const ccl_coll_param& param) {
+    // not a SYCL stream, so no validation is needed
+    if (param.stream == nullptr) {
+        return;
+    }
+
+    // skip validation if it was requested explicitly (e.g. for sycl::buffer)
+    if (param.skip_validation) {
+        return;
+    }
+
+#ifdef CCL_ENABLE_SYCL
+    std::vector<void*> bufs = {};
+
+    switch (param.ctype) {
+        case ccl_coll_alltoallv: {
+            // if the sum of the counts is 0, the buf pointer could be anything,
+            // including nullptr or an invalid pointer, so we must neither validate nor dereference it.
+            // TODO: make const void*
+            if (std::accumulate(param.send_counts, param.send_counts + param.comm->size(), 0) > 0) {
+                bufs.push_back((void*)(param.send_buf));
+            }
+
+            if (std::accumulate(param.recv_counts, param.recv_counts + param.comm->size(), 0) > 0) {
+                bufs.push_back((void*)(param.recv_buf));
+            }
+            break;
+        }
+        case ccl_coll_allreduce:
+        case ccl_coll_allgatherv:
+        case ccl_coll_alltoall:
+        case ccl_coll_reduce:
+        case ccl_coll_reduce_scatter:
+            bufs = { (void*)param.send_buf, (void*)param.recv_buf };
+            break;
+        case ccl_coll_bcast: bufs = { (void*)param.recv_buf }; break;
+        case ccl_coll_sparse_allreduce:
+            bufs = { (void*)param.sparse_param.send_ind_buf,
+                     (void*)param.sparse_param.send_val_buf,
+                     (void*)param.sparse_param.recv_ind_buf,
+                     (void*)param.sparse_param.recv_val_buf };
+            break;
+        default:
+            // anything that carries no user buffers (e.g. barrier) doesn't require validation
+            return;
+    }
+
+    auto q = param.stream->get_native_stream();
+    CCL_THROW_IF_NOT(
+        native::detail::check_assoc_device_memory(bufs, q.get_device(), q.get_context()) !=
+            native::detail::usm_support_mode::prohibited,
+        "unsupported usm type");
+#endif /* CCL_ENABLE_SYCL */
+}
+
 /* param is not const because param.comm can be updated for unordered colls */
 static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr& attr) {
+    // perform validation and adjustment if necessary
+    ccl_coll_validate_and_adjust(param);
+
     ccl::global_data& data = ccl::global_data::get();
 
     /* 1. decide whether schedule should be postponed (this includes caching and starting) */
@@ -689,7 +751,9 @@ ccl_request* ccl_allgatherv_impl(const void* send_buf,
                                  ccl::datatype dtype,
                                  const ccl_coll_attr& attr,
                                  ccl_comm* comm,
-                                 const ccl_stream* stream) {
+                                 const ccl_stream* stream,
+                                 const std::vector<ccl::event>& deps,
+                                 bool skip_validation) {
     ccl_coll_param param{};
 
     param.ctype = ccl_coll_allgatherv;
@@ -700,6 +764,8 @@ ccl_request* ccl_allgatherv_impl(const void* send_buf,
     param.dtype = ccl::global_data::get().dtypes->get(dtype);
     param.stream = stream;
     param.comm = comm;
+    param.skip_validation = skip_validation;
+    copy_deps(deps, param.deps);
 
     auto req = ccl_coll_create(param, attr);
     LOG_DEBUG("coll ", ccl_coll_type_to_str(param.ctype), " created, req ", req);
@@ -713,7 +779,9 @@ ccl_request* ccl_allreduce_impl(const void* send_buf,
                                 ccl::reduction reduction,
                                 const ccl_coll_attr& attr,
                                 ccl_comm* comm,
-                                const ccl_stream* stream) {
+                                const ccl_stream* stream,
+                                const std::vector<ccl::event>& deps,
+                                bool skip_validation) {
     ccl_coll_param param{};
 
     param.ctype = ccl_coll_allreduce;
@@ -724,6 +792,8 @@ ccl_request* ccl_allreduce_impl(const void* send_buf,
     param.reduction = reduction;
     param.stream = stream;
     param.comm = comm;
+    param.skip_validation = skip_validation;
+    copy_deps(deps, param.deps);
 
     auto req = ccl_coll_create(param, attr);
     LOG_DEBUG("coll ", ccl_coll_type_to_str(param.ctype), " created, req ", req, " count ", count);
@@ -736,7 +806,9 @@ ccl_request* ccl_alltoall_impl(const void* send_buf,
                                ccl::datatype dtype,
                                const ccl_coll_attr& attr,
                                ccl_comm* comm,
-                               const ccl_stream* stream) {
+                               const ccl_stream* stream,
+                               const std::vector<ccl::event>& deps,
+                               bool skip_validation) {
     ccl_coll_param param{};
 
     param.ctype = ccl_coll_alltoall;
@@ -746,6 +818,8 @@ ccl_request* ccl_alltoall_impl(const void* send_buf,
     param.dtype = ccl::global_data::get().dtypes->get(dtype);
     param.stream = stream;
     param.comm = comm;
+    param.skip_validation = skip_validation;
+    copy_deps(deps, param.deps);
 
     auto req = ccl_coll_create(param, attr);
     LOG_DEBUG("coll ", ccl_coll_type_to_str(param.ctype), " created, req ", req, " count ", count);
@@ -759,7 +833,9 @@ ccl_request* ccl_alltoallv_impl(const void* send_buf,
                                 ccl::datatype dtype,
                                 const ccl_coll_attr& attr,
                                 ccl_comm* comm,
-                                const ccl_stream* stream) {
+                                const ccl_stream* stream,
+                                const std::vector<ccl::event>& deps,
+                                bool skip_validation) {
     ccl_coll_param param{};
 
     param.ctype = ccl_coll_alltoallv;
@@ -770,6 +846,8 @@ ccl_request* ccl_alltoallv_impl(const void* send_buf,
     param.dtype = ccl::global_data::get().dtypes->get(dtype);
     param.stream = stream;
     param.comm = comm;
+    param.skip_validation = skip_validation;
+    copy_deps(deps, param.deps);
 
     auto req = ccl_coll_create(param, attr);
     LOG_DEBUG("coll ", ccl_coll_type_to_str(param.ctype), " created, req ", req);
@@ -784,7 +862,8 @@ ccl_request* ccl_allreduce_gpu_impl(const void* send_buf,
                                     ccl::reduction reduction,
                                     const ccl_coll_attr& attr,
                                     ccl_comm* comm,
-                                    const ccl_stream* stream) {
+                                    const ccl_stream* stream,
+                                    const std::vector<ccl::event>& deps) {
     ccl_coll_param param{};
 
     param.ctype = ccl_coll_allreduce;
@@ -795,6 +874,7 @@ ccl_request* ccl_allreduce_gpu_impl(const void* send_buf,
     param.reduction = reduction;
     param.stream = stream;
     param.comm = comm;
+    copy_deps(deps, param.deps);
 
     auto req = ccl_gpu_coll_create(param, attr);
     LOG_DEBUG(
@@ -802,13 +882,18 @@ ccl_request* ccl_allreduce_gpu_impl(const void* send_buf,
     return req;
 }
 
-void ccl_barrier_impl(ccl_comm* comm, const ccl_stream* stream) {
+void ccl_barrier_impl(ccl_comm* comm,
+                      const ccl_stream* stream,
+                      const std::vector<ccl::event>& deps,
+                      bool skip_validation) {
     ccl_coll_param param{};
 
     param.ctype = ccl_coll_barrier;
     param.dtype = ccl_datatype_int8;
     param.stream = stream;
     param.comm = comm;
+    param.skip_validation = skip_validation;
+    copy_deps(deps, param.deps);
 
     ccl_coll_attr attr{};
     attr.synchronous = 1;
@@ -829,16 +914,20 @@ ccl_request* ccl_broadcast_impl(void* buf,
                                 int root,
                                 const ccl_coll_attr& attr,
                                 ccl_comm* comm,
-                                const ccl_stream* stream) {
+                                const ccl_stream* stream,
+                                const std::vector<ccl::event>& deps,
+                                bool skip_validation) {
     ccl_coll_param param{};
 
     param.ctype = ccl_coll_bcast;
-    param.buf = buf;
+    param.send_buf = param.recv_buf = buf;
     param.count = count;
     param.dtype = ccl::global_data::get().dtypes->get(dtype);
     param.root = root;
     param.stream = stream;
     param.comm = comm;
+    param.skip_validation = skip_validation;
+    copy_deps(deps, param.deps);
 
     auto req = ccl_coll_create(param, attr);
     LOG_DEBUG("coll ", ccl_coll_type_to_str(param.ctype), " created, req ", req);
@@ -853,7 +942,9 @@ ccl_request* ccl_reduce_impl(const void* send_buf,
                              int root,
                              const ccl_coll_attr& attr,
                              ccl_comm* comm,
-                             const ccl_stream* stream) {
+                             const ccl_stream* stream,
+                             const std::vector<ccl::event>& deps,
+                             bool skip_validation) {
     ccl_coll_param param{};
 
     param.ctype = ccl_coll_reduce;
@@ -865,6 +956,8 @@ ccl_request* ccl_reduce_impl(const void* send_buf,
     param.root = root;
     param.stream = stream;
     param.comm = comm;
+    param.skip_validation = skip_validation;
+    copy_deps(deps, param.deps);
 
     auto req = ccl_coll_create(param, attr);
     LOG_DEBUG("coll ", ccl_coll_type_to_str(param.ctype), " created, req ", req);
@@ -878,7 +971,9 @@ ccl_request* ccl_reduce_scatter_impl(const void* send_buf,
                                      ccl::reduction reduction,
                                      const ccl_coll_attr& attr,
                                      ccl_comm* comm,
-                                     const ccl_stream* stream) {
+                                     const ccl_stream* stream,
+                                     const std::vector<ccl::event>& deps,
+                                     bool skip_validation) {
     ccl_coll_param param{};
 
     param.ctype = ccl_coll_reduce_scatter;
@@ -889,6 +984,8 @@ ccl_request* ccl_reduce_scatter_impl(const void* send_buf,
     param.reduction = reduction;
     param.stream = stream;
     param.comm = comm;
+    param.skip_validation = skip_validation;
+    copy_deps(deps, param.deps);
 
     auto req = ccl_coll_create(param, attr);
     LOG_DEBUG("coll ", ccl_coll_type_to_str(param.ctype), " created, req ", req);
@@ -908,7 +1005,9 @@ ccl_request* ccl_sparse_allreduce_impl(const void* send_ind_buf,
                                        ccl::reduction reduction,
                                        const ccl_coll_attr& attr,
                                        ccl_comm* comm,
-                                       const ccl_stream* stream) {
+                                       const ccl_stream* stream,
+                                       const std::vector<ccl::event>& deps,
+                                       bool skip_validation) {
     ccl_coll_param param{};
 
     param.ctype = ccl_coll_sparse_allreduce;
@@ -925,6 +1024,8 @@ ccl_request* ccl_sparse_allreduce_impl(const void* send_ind_buf,
     param.reduction = reduction;
     param.stream = stream;
     param.comm = comm;
+    param.skip_validation = skip_validation;
+    copy_deps(deps, param.deps);
 
     ccl_coll_attr internal_attr(attr);
     internal_attr.to_cache = 0; /* skip to_cache flag, unsupported yet */
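
The subtle rule in ccl_coll_validate_and_adjust is the zero-count case: when
the per-rank counts of an alltoallv sum to zero, the corresponding pointer may
be anything and must not be validated or dereferenced. A standalone sketch of
just that rule, using only the standard library:

    #include <cstddef>
    #include <numeric>
    #include <vector>

    // Collect a buffer for USM validation only when its total element count
    // is non-zero; a zero total means buf may be an arbitrary value.
    static void maybe_collect(std::vector<void*>& bufs,
                              void* buf,
                              const size_t* counts,
                              int comm_size) {
        size_t total = std::accumulate(counts, counts + comm_size, size_t{ 0 });
        if (total > 0)
            bufs.push_back(buf);
    }
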
diff --git a/src/coll/coll.hpp b/src/coll/coll.hpp
index 69a8cc7ff..76601f01a 100644
--- a/src/coll/coll.hpp
+++ b/src/coll/coll.hpp
@@ -108,7 +108,9 @@ ccl_request* ccl_allgatherv_impl(const void* send_buf,
                                  ccl::datatype dtype,
                                  const ccl_coll_attr& attr,
                                  ccl_comm* comm,
-                                 const ccl_stream* stream);
+                                 const ccl_stream* stream,
+                                 const std::vector<ccl::event>& deps,
+                                 bool skip_validation = false);
 
 ccl_request* ccl_allreduce_impl(const void* send_buf,
                                 void* recv_buf,
@@ -117,7 +119,9 @@ ccl_request* ccl_allreduce_impl(const void* send_buf,
                                 ccl::reduction reduction,
                                 const ccl_coll_attr& attr,
                                 ccl_comm* comm,
-                                const ccl_stream* stream);
+                                const ccl_stream* stream,
+                                const std::vector<ccl::event>& deps,
+                                bool skip_validation = false);
 template <class gpu_device_type>
 ccl_request* ccl_allreduce_gpu_impl(const void* send_buf,
                                     void* recv_buf,
@@ -126,7 +130,8 @@ ccl_request* ccl_allreduce_gpu_impl(const void* send_buf,
                                     ccl::reduction reduction,
                                     const ccl_coll_attr& attr,
                                     ccl_comm* comm,
-                                    const ccl_stream* stream);
+                                    const ccl_stream* stream,
+                                    const std::vector<ccl::event>& deps);
 
 ccl_request* ccl_alltoall_impl(const void* send_buf,
                                void* recv_buf,
@@ -134,7 +139,9 @@ ccl_request* ccl_alltoall_impl(const void* send_buf,
                                ccl::datatype dtype,
                                const ccl_coll_attr& attr,
                                ccl_comm* comm,
-                               const ccl_stream* stream);
+                               const ccl_stream* stream,
+                               const std::vector<ccl::event>& deps,
+                               bool skip_validation = false);
 
 ccl_request* ccl_alltoallv_impl(const void* send_buf,
                                 const size_t* send_counts,
@@ -143,9 +150,14 @@ ccl_request* ccl_alltoallv_impl(const void* send_buf,
                                 ccl::datatype dtype,
                                 const ccl_coll_attr& attr,
                                 ccl_comm* comm,
-                                const ccl_stream* stream);
+                                const ccl_stream* stream,
+                                const std::vector<ccl::event>& deps,
+                                bool skip_validation = false);
 
-void ccl_barrier_impl(ccl_comm* comm, const ccl_stream* stream);
+void ccl_barrier_impl(ccl_comm* comm,
+                      const ccl_stream* stream,
+                      const std::vector<ccl::event>& deps,
+                      bool skip_validation = false);
 
 ccl_request* ccl_broadcast_impl(void* buf,
                                 size_t count,
@@ -153,7 +165,9 @@ ccl_request* ccl_broadcast_impl(void* buf,
                                 int root,
                                 const ccl_coll_attr& attr,
                                 ccl_comm* comm,
-                                const ccl_stream* stream);
+                                const ccl_stream* stream,
+                                const std::vector<ccl::event>& deps,
+                                bool skip_validation = false);
 
 ccl_request* ccl_reduce_impl(const void* send_buf,
                              void* recv_buf,
@@ -163,7 +177,9 @@ ccl_request* ccl_reduce_impl(const void* send_buf,
                              int root,
                              const ccl_coll_attr& attr,
                              ccl_comm* comm,
-                             const ccl_stream* stream);
+                             const ccl_stream* stream,
+                             const std::vector<ccl::event>& deps,
+                             bool skip_validation = false);
 
 ccl_request* ccl_reduce_scatter_impl(const void* send_buf,
                                      void* recv_buf,
@@ -172,7 +188,9 @@ ccl_request* ccl_reduce_scatter_impl(const void* send_buf,
                                      ccl::reduction reduction,
                                      const ccl_coll_attr& attr,
                                      ccl_comm* comm,
-                                     const ccl_stream* stream);
+                                     const ccl_stream* stream,
+                                     const std::vector<ccl::event>& deps,
+                                     bool skip_validation = false);
 
 ccl_request* ccl_sparse_allreduce_impl(const void* send_ind_buf,
                                        size_t send_ind_count,
@@ -187,4 +205,6 @@ ccl_request* ccl_sparse_allreduce_impl(const void* send_ind_buf,
                                        ccl::reduction reduction,
                                        const ccl_coll_attr& attr,
                                        ccl_comm* comm,
-                                       const ccl_stream* stream);
+                                       const ccl_stream* stream,
+                                       const std::vector<ccl::event>& deps,
+                                       bool skip_validation = false);
diff --git a/src/coll/coll_param.cpp b/src/coll/coll_param.cpp
new file mode 100644
index 000000000..a050cdfd1
--- /dev/null
+++ b/src/coll/coll_param.cpp
@@ -0,0 +1,68 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/coll_param.hpp"
+
+bool operator==(const coll_param_gpu& lhs, const coll_param_gpu& rhs) {
+    CCL_ASSERT((lhs.is_reduction() && rhs.is_reduction()) ||
+               (!lhs.is_reduction() && !rhs.is_reduction()));
+
+    bool res =
+        lhs.get_coll_type() == rhs.get_coll_type() && lhs.get_datatype() == rhs.get_datatype();
+
+    if (lhs.is_reduction()) {
+        res = res && (lhs.get_reduction() == rhs.get_reduction());
+    }
+
+    return res;
+}
+
+void copy_deps(const std::vector<ccl::event>& in, std::vector<ccl::event>& out) {
+#ifdef CCL_ENABLE_SYCL
+    out.clear();
+    for (size_t idx = 0; idx < in.size(); idx++) {
+        try {
+            auto sycl_event = in[idx].get_native();
+            out.push_back(ccl::create_event(sycl_event));
+        }
+        catch (ccl::exception&) {
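+            // the event has no native handle (e.g. a host event); skip this dependency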
+        }
+    }
+#else /* CCL_ENABLE_SYCL */
+    CCL_THROW_IF_NOT(in.size() == 0, "host deps are not supported yet");
+#endif /* CCL_ENABLE_SYCL */
+}
+
+ccl_coll_param::ccl_coll_param(const ccl_coll_param& other) {
+    ctype = other.ctype;
+    send_buf = other.send_buf;
+    recv_buf = other.recv_buf;
+    count = other.count;
+    send_count = other.send_count;
+    send_counts = other.send_counts;
+    recv_counts = other.recv_counts;
+    dtype = other.dtype;
+    reduction = other.reduction;
+    root = other.root;
+    stream = other.stream;
+    copy_deps(other.deps, deps);
+    comm = other.comm;
+    sparse_param = other.sparse_param;
+
+#ifdef CCL_ENABLE_SYCL
+    device_send_buf = other.device_send_buf;
+    device_recv_buf = other.device_recv_buf;
+#endif /* CCL_ENABLE_SYCL */
+}
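
ccl::event is move-only, so the copy constructor above cannot simply copy
other.deps; copy_deps rebuilds each dependency as a fresh ccl::event over the
same native sycl::event handle, dropping events that have no native backing. A
usage sketch under CCL_ENABLE_SYCL, where make_deps() stands in for any
hypothetical producer of events:

    std::vector<ccl::event> src_deps = make_deps();
    std::vector<ccl::event> dst_deps;
    copy_deps(src_deps, dst_deps);
    // dst_deps now holds independent ccl::event objects referring to the same
    // underlying sycl::event handles as src_deps
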
diff --git a/src/coll/coll_param.hpp b/src/coll/coll_param.hpp
index 34b857bd5..927e0ca3f 100644
--- a/src/coll/coll_param.hpp
+++ b/src/coll/coll_param.hpp
@@ -17,14 +17,7 @@
 
 #include "coll/algorithms/algorithms_enum.hpp"
 #include "common/datatype/datatype.hpp"
-
-#include "oneapi/ccl/type_traits.hpp"
-#include "oneapi/ccl/stream_attr_ids.hpp"
-#include "oneapi/ccl/stream_attr_ids_traits.hpp"
-#include "oneapi/ccl/stream.hpp"
-#include "oneapi/ccl/coll_attr_ids.hpp"
-#include "oneapi/ccl/coll_attr_ids_traits.hpp"
-#include "oneapi/ccl/coll_attr.hpp"
+#include "oneapi/ccl.hpp"
 
 class ccl_comm;
 
@@ -44,10 +37,10 @@ using ccl_sycl_buffer_one_dim_types = std::tuple<ccl_sycl_typed_buffer_t<int8_t>
                                                  ccl_sycl_typed_buffer_t<uint32_t>,
                                                  ccl_sycl_typed_buffer_t<int64_t>,
                                                  ccl_sycl_typed_buffer_t<uint64_t>,
-                                                 ccl_sycl_typed_buffer_t<float>, //unsupported
+                                                 ccl_sycl_typed_buffer_t<uint16_t>,
                                                  ccl_sycl_typed_buffer_t<float>,
                                                  ccl_sycl_typed_buffer_t<double>,
-                                                 ccl_sycl_typed_buffer_t<float>>; //unsupported
+                                                 ccl_sycl_typed_buffer_t<uint16_t>>;
 #endif /* CCL_ENABLE_SYCL */
 
 #define CCL_INVALID_PROC_IDX (-1)
@@ -99,9 +92,10 @@ struct ccl_coll_sparse_param {
     ccl_datatype itype;
 };
 
+void copy_deps(const std::vector<ccl::event>& in, std::vector<ccl::event>& out);
+
 struct ccl_coll_param {
     ccl_coll_type ctype;
-    void* buf;
     const void* send_buf;
     void* recv_buf;
     size_t count;
@@ -112,16 +106,61 @@ struct ccl_coll_param {
     ccl::reduction reduction;
     int root;
     const ccl_stream* stream;
+    std::vector<ccl::event> deps;
     ccl_comm* comm;
     ccl_coll_sparse_param sparse_param;
+    bool skip_validation = false;
 
 #ifdef CCL_ENABLE_SYCL
-    ccl_sycl_buffer_t* sycl_send_buf;
-    ccl_sycl_buffer_t* sycl_recv_buf;
-    ccl_sycl_buffer_t* sycl_buf;
+    ccl_sycl_buffer_t* device_send_buf;
+    ccl_sycl_buffer_t* device_recv_buf;
 #endif /* CCL_ENABLE_SYCL */
+
+    ccl_coll_param() {}
+    ccl_coll_param(const ccl_coll_param& other);
 };
 
+class coll_param_gpu {
+    ccl_coll_type ctype;
+    ccl::datatype dtype;
+    ccl::reduction red;
+
+public:
+    coll_param_gpu(ccl_coll_type ctype, ccl::datatype dtype, ccl::reduction red)
+            : ctype{ ctype },
+              dtype{ dtype },
+              red{ red } {}
+
+    coll_param_gpu(ccl_coll_type ctype, ccl::datatype dtype)
+            : ctype{ ctype },
+              dtype{ dtype },
+              red{ (ccl::reduction)-1 } {
+        assert(!is_reduction() && "This constructor is invalid for reduction types");
+    }
+
+    ccl_coll_type get_coll_type() const {
+        return ctype;
+    }
+
+    ccl::datatype get_datatype() const {
+        return dtype;
+    }
+
+    bool is_reduction() const {
+        return ccl_coll_type_is_reduction(get_coll_type());
+    }
+
+    ccl::reduction get_reduction() const {
+        if (!is_reduction()) {
+            throw ccl::exception(
+                "get_ruduction(): is not supported for non-reduction collective type, i.e. bcast");
+        }
+        return red;
+    }
+};
+
+bool operator==(const coll_param_gpu& lhs, const coll_param_gpu& rhs);
+
 /*
     explicitly split coll_param and coll_param_copy
     to separate coll_param structure which is used for interaction between different modules
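
operator== for coll_param_gpu inspects the reduction only when the collective
actually carries one, so the sentinel stored by the two-argument constructor
never leaks into comparisons. A sketch, assuming the internal header and
<cassert> are available:

    #include <cassert>
    #include "coll/coll_param.hpp"

    void compare_params() {
        // non-reduction collective: the reduction field is never inspected
        coll_param_gpu a(ccl_coll_bcast, ccl::datatype::float32);
        coll_param_gpu b(ccl_coll_bcast, ccl::datatype::float32);
        assert(a == b);

        // reduction collective: the reduction participates in equality
        coll_param_gpu r1(ccl_coll_allreduce, ccl::datatype::float32, ccl::reduction::sum);
        coll_param_gpu r2(ccl_coll_allreduce, ccl::datatype::float32, ccl::reduction::max);
        assert(!(r1 == r2));
    }
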
diff --git a/src/coll/selection/selector_allreduce.cpp b/src/coll/selection/selector_allreduce.cpp
index ae8c8ce74..101adaf5f 100644
--- a/src/coll/selection/selector_allreduce.cpp
+++ b/src/coll/selection/selector_allreduce.cpp
@@ -61,7 +61,7 @@ bool ccl_algorithm_selector_helper<ccl_coll_allreduce_algo>::can_use(
 
     if (algo == ccl_coll_allreduce_rabenseifner && (int)param.count < param.comm->pof2())
         can_use = false;
-    else if (algo == ccl_coll_allreduce_ring_rma && !atl_wrapper::attr.enable_rma)
+    else if (algo == ccl_coll_allreduce_ring_rma && !atl_wrapper::attr.out.enable_rma)
         can_use = false;
     else if (algo == ccl_coll_allreduce_starlike && !(param.count / param.comm->size()))
         can_use = false;
diff --git a/src/common/comm/comm_interface.hpp b/src/common/comm/comm_interface.hpp
index e0c7eaf67..02d30b0e8 100644
--- a/src/common/comm/comm_interface.hpp
+++ b/src/common/comm/comm_interface.hpp
@@ -109,6 +109,8 @@ struct gpu_comm_attr;
     COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, uint64_t); \
     COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, float); \
     COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, double); \
+    COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, ccl::bfloat16); \
+    COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, ccl::float16); \
     COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(COMM, int32_t, float); \
     COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(COMM, int32_t, ccl::bfloat16); \
     COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(COMM, int64_t, float); \
diff --git a/src/common/comm/compiler_comm_interface_dispatcher.cpp b/src/common/comm/compiler_comm_interface_dispatcher.cpp
index cd688160b..6734322bf 100644
--- a/src/common/comm/compiler_comm_interface_dispatcher.cpp
+++ b/src/common/comm/compiler_comm_interface_dispatcher.cpp
@@ -140,7 +140,7 @@ communicator_interface_dispatcher::create_communicator_from_unified_device(
     // Use process class if not specified otherwise
     // TODO: implement a proper dispatching for other types
     if (preferred_topology_group == ccl::group_split_type::undetermined) {
-        preferred_topology_group = ccl::group_split_type::process;
+        preferred_topology_group = ccl::group_split_type::cluster;
     }
 
     // read comm split attributes
diff --git a/src/common/comm/host_communicator/host_communicator.cpp b/src/common/comm/host_communicator/host_communicator.cpp
index ad135c487..0f7c5cd9d 100644
--- a/src/common/comm/host_communicator/host_communicator.cpp
+++ b/src/common/comm/host_communicator/host_communicator.cpp
@@ -176,7 +176,7 @@ ccl::event host_communicator::barrier_impl(const ccl::stream::impl_value_t& op_s
                                            const ccl::vector_class<ccl::event>& deps) {
     // TODO: what exactly do we need to do with 'attr' here?
 
-    ccl_barrier_impl(comm_impl.get(), op_stream.get());
+    ccl_barrier_impl(comm_impl.get(), op_stream.get(), deps);
 
     // TODO: what exactly do we need to return here? ccl_barrier_impl() is a void func
     ccl_request* req = nullptr;
@@ -192,8 +192,15 @@ ccl::event host_communicator::allgatherv_impl(const void* send_buf,
                                               const ccl::stream::impl_value_t& stream,
                                               const ccl::allgatherv_attr& attr,
                                               const ccl::vector_class<ccl::event>& deps) {
-    ccl_request* req = ccl_allgatherv_impl(
-        send_buf, send_count, recv_buf, recv_counts.data(), dtype, attr, comm_impl.get(), nullptr);
+    ccl_request* req = ccl_allgatherv_impl(send_buf,
+                                           send_count,
+                                           recv_buf,
+                                           recv_counts.data(),
+                                           dtype,
+                                           attr,
+                                           comm_impl.get(),
+                                           nullptr,
+                                           deps);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
@@ -206,9 +213,20 @@ ccl::event host_communicator::allgatherv_impl(const void* send_buf,
                                               const ccl::stream::impl_value_t& stream,
                                               const ccl::allgatherv_attr& attr,
                                               const ccl::vector_class<ccl::event>& deps) {
-    // TODO not implemented
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
+    ccl_coll_attr internal_attr(attr);
+    internal_attr.vector_buf = 1;
+
+    ccl_request* req = ccl_allgatherv_impl(reinterpret_cast<const void*>(send_buf),
+                                           send_count,
+                                           (void*)(recv_bufs.data()),
+                                           recv_counts.data(),
+                                           dtype,
+                                           internal_attr,
+                                           comm_impl.get(),
+                                           nullptr,
+                                           deps);
+
+    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
 
 /* allreduce */
@@ -221,7 +239,7 @@ ccl::event host_communicator::allreduce_impl(const void* send_buf,
                                              const ccl::allreduce_attr& attr,
                                              const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_allreduce_impl(
-        send_buf, recv_buf, count, dtype, reduction, attr, comm_impl.get(), nullptr);
+        send_buf, recv_buf, count, dtype, reduction, attr, comm_impl.get(), nullptr, deps);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
@@ -235,7 +253,7 @@ ccl::event host_communicator::alltoall_impl(const void* send_buf,
                                             const ccl::alltoall_attr& attr,
                                             const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req =
-        ccl_alltoall_impl(send_buf, recv_buf, count, dtype, attr, comm_impl.get(), nullptr);
+        ccl_alltoall_impl(send_buf, recv_buf, count, dtype, attr, comm_impl.get(), nullptr, deps);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
@@ -268,7 +286,8 @@ ccl::event host_communicator::alltoallv_impl(const void* send_buf,
                                           dtype,
                                           attr,
                                           comm_impl.get(),
-                                          nullptr);
+                                          nullptr,
+                                          deps);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
@@ -294,7 +313,8 @@ ccl::event host_communicator::broadcast_impl(void* buf,
                                              const ccl::stream::impl_value_t& stream,
                                              const ccl::broadcast_attr& attr,
                                              const ccl::vector_class<ccl::event>& deps) {
-    ccl_request* req = ccl_broadcast_impl(buf, count, dtype, root, attr, comm_impl.get(), nullptr);
+    ccl_request* req =
+        ccl_broadcast_impl(buf, count, dtype, root, attr, comm_impl.get(), nullptr, deps);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
@@ -310,7 +330,7 @@ ccl::event host_communicator::reduce_impl(const void* send_buf,
                                           const ccl::reduce_attr& attr,
                                           const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_reduce_impl(
-        send_buf, recv_buf, count, dtype, reduction, root, attr, comm_impl.get(), nullptr);
+        send_buf, recv_buf, count, dtype, reduction, root, attr, comm_impl.get(), nullptr, deps);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
@@ -325,7 +345,7 @@ ccl::event host_communicator::reduce_scatter_impl(const void* send_buf,
                                                   const ccl::reduce_scatter_attr& attr,
                                                   const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_reduce_scatter_impl(
-        send_buf, recv_buf, recv_count, dtype, reduction, attr, comm_impl.get(), nullptr);
+        send_buf, recv_buf, recv_count, dtype, reduction, attr, comm_impl.get(), nullptr, deps);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
@@ -358,7 +378,8 @@ ccl::event host_communicator::sparse_allreduce_impl(const void* send_ind_buf,
                                                  reduction,
                                                  attr,
                                                  comm_impl.get(),
-                                                 nullptr);
+                                                 nullptr,
+                                                 deps);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
diff --git a/src/common/comm/host_communicator/host_communicator_impl.hpp b/src/common/comm/host_communicator/host_communicator_impl.hpp
index d71640c00..a958a117a 100644
--- a/src/common/comm/host_communicator/host_communicator_impl.hpp
+++ b/src/common/comm/host_communicator/host_communicator_impl.hpp
@@ -42,7 +42,8 @@ ccl::event host_communicator::allgatherv_impl(const buffer_type* send_buf,
                                            ccl::native_type_info<buffer_type>::dtype,
                                            attr,
                                            comm_impl.get(),
-                                           nullptr);
+                                           nullptr,
+                                           deps);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
@@ -50,7 +51,7 @@ ccl::event host_communicator::allgatherv_impl(const buffer_type* send_buf,
 template <class buffer_type>
 ccl::event host_communicator::allgatherv_impl(const buffer_type* send_buf,
                                               size_t send_count,
-                                              ccl::vector_class<buffer_type*>& recv_buf,
+                                              ccl::vector_class<buffer_type*>& recv_bufs,
                                               const ccl::vector_class<size_t>& recv_counts,
                                               const ccl::stream::impl_value_t& stream,
                                               const ccl::allgatherv_attr& attr,
@@ -60,12 +61,13 @@ ccl::event host_communicator::allgatherv_impl(const buffer_type* send_buf,
 
     ccl_request* req = ccl_allgatherv_impl(reinterpret_cast<const void*>(send_buf),
                                            send_count,
-                                           (void*)(recv_buf.data()),
+                                           (void*)(recv_bufs.data()),
                                            recv_counts.data(),
                                            ccl::native_type_info<buffer_type>::dtype,
                                            internal_attr,
                                            comm_impl.get(),
-                                           nullptr);
+                                           nullptr,
+                                           deps);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
@@ -113,7 +115,8 @@ ccl::event host_communicator::allreduce_impl(const buffer_type* send_buf,
                                           reduction,
                                           attr,
                                           comm_impl.get(),
-                                          nullptr);
+                                          nullptr,
+                                          deps);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
@@ -145,7 +148,8 @@ ccl::event host_communicator::alltoall_impl(const buffer_type* send_buf,
                                          ccl::native_type_info<buffer_type>::dtype,
                                          attr,
                                          comm_impl.get(),
-                                         nullptr);
+                                         nullptr,
+                                         deps);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
@@ -203,7 +207,8 @@ ccl::event host_communicator::alltoallv_impl(const buffer_type* send_buf,
                                           ccl::native_type_info<buffer_type>::dtype,
                                           attr,
                                           comm_impl.get(),
-                                          nullptr);
+                                          nullptr,
+                                          deps);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
@@ -261,7 +266,8 @@ ccl::event host_communicator::broadcast_impl(buffer_type* buf,
                                           root,
                                           attr,
                                           comm_impl.get(),
-                                          nullptr);
+                                          nullptr,
+                                          deps);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
@@ -296,7 +302,8 @@ ccl::event host_communicator::reduce_impl(const buffer_type* send_buf,
                                        root,
                                        attr,
                                        comm_impl.get(),
-                                       nullptr);
+                                       nullptr,
+                                       deps);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
@@ -331,7 +338,8 @@ ccl::event host_communicator::reduce_scatter_impl(const buffer_type* send_buf,
                                                reduction,
                                                attr,
                                                comm_impl.get(),
-                                               nullptr);
+                                               nullptr,
+                                               deps);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
@@ -376,7 +384,8 @@ ccl::event host_communicator::sparse_allreduce_impl(const index_buffer_type* sen
                                                  reduction,
                                                  attr,
                                                  comm_impl.get(),
-                                                 nullptr);
+                                                 nullptr,
+                                                 deps);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
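
Both the typed and the previously unimplemented void* paths for vector-buffer
allgatherv now funnel into ccl_allgatherv_impl with vector_buf = 1, passing
the array of per-rank destination pointers as the receive buffer. At the
public API level the calling convention looks roughly like the sketch below,
where recv_pool and offsets are hypothetical application-side storage:

    int comm_size = comm.size();
    ccl::vector_class<float*> recv_bufs(comm_size);
    for (int r = 0; r < comm_size; ++r)
        recv_bufs[r] = recv_pool + offsets[r]; // one destination slot per rank

    ccl::allgatherv(send_buf, send_count, recv_bufs, recv_counts, comm);
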
diff --git a/src/common/comm/l0/communicator/base_communicator.hpp b/src/common/comm/l0/communicator/base_communicator.hpp
index 8d03784c9..ef315d4a1 100644
--- a/src/common/comm/l0/communicator/base_communicator.hpp
+++ b/src/common/comm/l0/communicator/base_communicator.hpp
@@ -19,7 +19,6 @@
 #include "common/comm/comm_interface.hpp"
 //TODO #include "sched/gpu_sched.hpp"
 #include "common/comm/l0/comm_context_id.hpp"
-#include "common/comm/l0/modules/kernel_params.hpp"
 
 struct base_communicator : public ccl::communicator_interface {
     //TODO using group_comm_storage = native::specific_indexed_device_storage;
diff --git a/src/common/comm/l0/communicator/device_group/device_a2a_communicator.cpp b/src/common/comm/l0/communicator/device_group/device_a2a_communicator.cpp
index 286ef7802..42b8afe38 100644
--- a/src/common/comm/l0/communicator/device_group/device_a2a_communicator.cpp
+++ b/src/common/comm/l0/communicator/device_group/device_a2a_communicator.cpp
@@ -86,8 +86,75 @@ ccl::event device_group_a2a_communicator::allreduce_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::allreduce_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
+    using namespace native;
+
+    static constexpr ccl::group_split_type group_id = base_t::topology_type();
+    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
+    if (!is_ready()) {
+        throw ccl::exception(std::string(
+            "Device communicator for group_id: " + ::to_string(group_id) +
+            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
+    }
+    int comm_rank = rank();
+    LOG_DEBUG("communicator for device idx: ", get_device_path(), ", rank idx: ", comm_rank);
+
+    //TODO make const!
+    ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf),
+                                 count * ccl::get_datatype_size(dtype),
+                                 0,
+                                 ccl_buffer_type::INDIRECT);
+    ccl_buffer recv_entry_buffer(
+        &recv_buf, count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT);
+
+    using community_t = typename device_community_container<class_id>::element_type;
+    community_t community = device_community_impl.get_topology();
+
+    const auto& in_process_gpu_storage = community->get_devices<ccl_gpu_comm>();
+    const auto& virtual_process_gpu_storage = community->get_devices<ccl_virtual_gpu_comm>();
+    ;
+
+    device_group_scheduler::schedule_ptr schedule;
+
+    // source for the collective operation is a real or a virtual GPU
+    auto real_device_it = in_process_gpu_storage.find(comm_rank);
+    if (real_device_it != in_process_gpu_storage.end()) {
+        LOG_DEBUG("Invoke: ", real_device_it->second->to_string());
+
+        /* TODO
+
+        using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, ccl_gpu_comm, group_id>;
+
+        schedule =
+            ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(*device_community_impl,
+                                                                                       real_device_it->second,send_entry_buffer,
+                                                                                       recv_entry_buffer,
+                                                                                       count,
+                                                                                       reduction);
+        */
+    }
+    else {
+        auto virtual_device_it = virtual_process_gpu_storage.find(comm_rank);
+        if (virtual_device_it != virtual_process_gpu_storage.end()) {
+            LOG_DEBUG("Invoke: ", virtual_device_it->second->to_string());
+            /* TODO
+
+        using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, ccl_virtual_gpu_comm, group_id>;
+
+        schedule =
+            ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(*device_community_impl,
+                                                                                       virtual_device_it->second,send_entry_buffer,
+                                                                                       recv_entry_buffer,
+                                                                                       count,
+                                                                                       reduction);
+        */
+        }
+    }
+
+    // if the schedule was not created, the returned event wraps a null schedule
+    if (schedule) {
+        LOG_DEBUG("Device group finalized");
+    }
+    return std::unique_ptr<ccl::event_impl>(new ccl::gpu_event_impl(std::move(schedule)));
 }
 
 /* alltoall */
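
Note: the untyped allreduce path above wraps the caller's pointers in ccl_buffer objects with ccl_buffer_type::INDIRECT, sized as count * ccl::get_datatype_size(dtype). A minimal model of what an indirect buffer implies, using a hypothetical stand-in type (not the actual ccl_buffer implementation): the buffer captures the address of the pointer variable, so the real device pointer is only read when the scheduled entry executes.

    #include <cstddef>

    // Hypothetical stand-in for illustration; the real type is ccl_buffer.
    struct indirect_buffer {
        void** slot;  // address of the caller's pointer variable
        size_t bytes; // count * datatype size
        void* resolve() const {
            return slot ? *slot : nullptr; // dereferenced at execution time
        }
    };
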
diff --git a/src/common/comm/l0/communicator/device_group/device_a2a_communicator_impl.hpp b/src/common/comm/l0/communicator/device_group/device_a2a_communicator_impl.hpp
index c93cd6472..f509c5d4e 100644
--- a/src/common/comm/l0/communicator/device_group/device_a2a_communicator_impl.hpp
+++ b/src/common/comm/l0/communicator/device_group/device_a2a_communicator_impl.hpp
@@ -87,75 +87,14 @@ ccl::event device_group_a2a_communicator::allreduce_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::allreduce_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    using namespace native;
-
-    static constexpr ccl::group_split_type group_id = base_t::topology_type();
-    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
-    if (!is_ready()) {
-        throw ccl::exception(std::string(
-            "Device communicator for group_id: " + ::to_string(group_id) +
-            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
-    }
-    int comm_rank = rank();
-    LOG_DEBUG("communicator for device idx: ", get_device_path(), ", rank idx: ", comm_rank);
-
-    //TODO make const!
-    ccl_buffer send_entry_buffer(const_cast<buffer_type**>(&send_buf),
-                                 count * sizeof(buffer_type),
-                                 0,
-                                 ccl_buffer_type::INDIRECT);
-    ccl_buffer recv_entry_buffer(
-        &recv_buf, count * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT);
-
-    using community_t = typename device_community_container<class_id>::element_type;
-    community_t community = device_community_impl.get_topology();
-
-    const auto& in_process_gpu_storage = community->get_devices<ccl_gpu_comm>();
-    const auto& virtual_process_gpu_storage = community->get_devices<ccl_virtual_gpu_comm>();
-    ;
-
-    device_group_scheduler::schedule_ptr schedule;
-
-    //source for collective operation is real gpu or virtual gpu
-    auto real_device_it = in_process_gpu_storage.find(comm_rank);
-    if (real_device_it != in_process_gpu_storage.end()) {
-        LOG_DEBUG("Invoke: ", real_device_it->second->to_string());
-
-        /* TODO
-
-        using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, ccl_gpu_comm, group_id>;
-
-        schedule =
-            ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(*device_community_impl,
-                                                                                       real_device_it->second,send_entry_buffer,
-                                                                                       recv_entry_buffer,
-                                                                                       count,
-                                                                                       reduction);
-        */
-    }
-    else {
-        auto virtual_device_it = virtual_process_gpu_storage.find(comm_rank);
-        if (virtual_device_it != virtual_process_gpu_storage.end()) {
-            LOG_DEBUG("Invoke: ", virtual_device_it->second->to_string());
-            /* TODO
-
-        using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, ccl_virtual_gpu_comm, group_id>;
-
-        schedule =
-            ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(*device_community_impl,
-                                                                                       virtual_device_it->second,send_entry_buffer,
-                                                                                       recv_entry_buffer,
-                                                                                       count,
-                                                                                       reduction);
-        */
-        }
-    }
-
-    //if sched is not ready - send NULL
-    if (schedule) {
-        LOG_DEBUG("Device group finalized");
-    }
-    return std::unique_ptr<ccl::event_impl>(new ccl::gpu_event_impl(std::move(schedule)));
+    return allreduce_impl(static_cast<const void*>(send_buf),
+                          static_cast<void*>(recv_buf),
+                          count,
+                          ccl::native_type_info<buffer_type>::dtype,
+                          reduction,
+                          stream,
+                          attr,
+                          deps);
 }
 
 template <class buffer_type>
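
Note: every typed overload in this patch now collapses its buffer_type template argument into a runtime ccl::datatype and forwards to the single void*-based implementation, as above. A minimal sketch of the pattern, with hypothetical stand-ins for the datatype enum, the trait, and the untyped entry point:

    #include <cstddef>
    #include <cstdint>

    enum class datatype { int32, float32 }; // stand-in for ccl::datatype

    template <class T> struct type_info;    // stand-in for ccl::native_type_info
    template <> struct type_info<int32_t> { static constexpr datatype dtype = datatype::int32; };
    template <> struct type_info<float>   { static constexpr datatype dtype = datatype::float32; };

    void allreduce_untyped(const void*, void*, size_t, datatype) { /* one implementation */ }

    template <class buffer_type>
    void allreduce_typed(const buffer_type* send, buffer_type* recv, size_t count) {
        // Thin shim: erase the type, keep a runtime tag.
        allreduce_untyped(send, recv, count, type_info<buffer_type>::dtype);
    }
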
diff --git a/src/common/comm/l0/communicator/device_group/device_communicator_utils.hpp b/src/common/comm/l0/communicator/device_group/device_communicator_utils.hpp
index 668efce1d..93ae547cc 100644
--- a/src/common/comm/l0/communicator/device_group/device_communicator_utils.hpp
+++ b/src/common/comm/l0/communicator/device_group/device_communicator_utils.hpp
@@ -17,10 +17,9 @@
 #include "common/comm/l0/devices/devices_declaration.hpp"
 #include "common/comm/l0/device_community.hpp"
 
-template <class kernel_params,
-          ccl::group_split_type group_id,
+template <ccl::group_split_type group_id,
           ccl::device_topology_type class_id,
-          template <class, class, ccl::group_split_type>
+          template <class, ccl::group_split_type>
           class algorithm>
 struct communication_device_expander {
     template <class device_t, class... Args>
@@ -31,7 +30,7 @@ struct communication_device_expander {
         if (comm_device) {
             LOG_DEBUG("Invoke: ", comm_device->to_string());
 
-            using gpu_entry = algorithm<kernel_params, device_t, group_id>;
+            using gpu_entry = algorithm<device_t, group_id>;
 
             schedule = ctx->scheduler_impl
                            ->submit_entry<gpu_entry, ccl_sched_add_back, group_id, class_id>(
@@ -42,10 +41,9 @@ struct communication_device_expander {
     std::unique_ptr<ccl_gpu_sched> schedule;
 };
 
-template <class kernel_params,
-          ccl::group_split_type group_id,
+template <ccl::group_split_type group_id,
           ccl::device_topology_type class_id,
-          template <class, class, ccl::group_split_type>
+          template <class, ccl::group_split_type>
           class algorithm,
           class... Args>
 std::unique_ptr<ccl::event_impl> do_collective_op(
@@ -55,7 +53,7 @@ std::unique_ptr<ccl::event_impl> do_collective_op(
     typename native::device_community_container<class_id>::element_type community,
     native::ccl_driver_context_ptr native_context,
     Args&&... args) {
-    communication_device_expander<kernel_params, group_id, class_id, algorithm> expander;
+    communication_device_expander<group_id, class_id, algorithm> expander;
     ccl_tuple_for_each_args(communication_device,
                             expander,
                             ctx,
@@ -68,66 +66,3 @@ std::unique_ptr<ccl::event_impl> do_collective_op(
     return std::unique_ptr<ccl::event_impl>(
         new ccl::gpu_shared_event_impl(std::move(expander.schedule)));
 }
-
-template <class buffer_type,
-          ccl::group_split_type group_id,
-          ccl::device_topology_type class_id,
-          template <class, class, ccl::group_split_type>
-          class algorithm,
-          class... Args>
-std::unique_ptr<ccl::event_impl> do_collective_op_reductions(
-    ccl::reduction reduction,
-    native::device_variant_t<native::ccl_gpu_comm, native::ccl_virtual_gpu_comm>&
-        communication_device,
-    std::shared_ptr<native::device_group_context>& ctx,
-    typename native::device_community_container<class_id>::element_type community,
-    native::ccl_driver_context_ptr native_context,
-    Args&&... args) {
-    switch (reduction) {
-        case ccl::reduction::sum:
-            return do_collective_op<
-                kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::sum>,
-                group_id,
-                class_id,
-                algorithm>(
-                communication_device, ctx, community, native_context, std::forward<Args>(args)...);
-            break;
-        case ccl::reduction::prod:
-            return do_collective_op<
-                kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::prod>,
-                group_id,
-                class_id,
-                algorithm>(
-                communication_device, ctx, community, native_context, std::forward<Args>(args)...);
-            break;
-        case ccl::reduction::min:
-            return do_collective_op<
-                kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::min>,
-                group_id,
-                class_id,
-                algorithm>(
-                communication_device, ctx, community, native_context, std::forward<Args>(args)...);
-            break;
-        case ccl::reduction::max:
-            return do_collective_op<
-                kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::max>,
-                group_id,
-                class_id,
-                algorithm>(
-                communication_device, ctx, community, native_context, std::forward<Args>(args)...);
-            break;
-        // TODO: make support of custom reduction in *.cl
-        // case ccl::reduction::custom:
-        //     return do_collective_op<kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::custom>,
-        //                            group_id, class_id, algorithm>(
-        //                                                      communication_device,
-        //                                                      ctx,
-        //                                                      community,
-        //                                                      native_context,
-        //                                                      std::forward<Args>(args)...);
-        //     break;
-        default:
-            throw std::runtime_error(std::string(__PRETTY_FUNCTION__) +
-                                     "Obtained reduction by user is incorrect!");
-    }
-}
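
Note: the deleted do_collective_op_reductions helper existed only to turn the runtime ccl::reduction value back into a template parameter, which instantiated a separate entry for every (buffer_type, reduction) pair. With coll_param_gpu carrying the operation at runtime, that fan-out is gone. A minimal sketch of the two dispatch styles, with hypothetical run_entry helpers:

    #include <stdexcept>

    enum class reduction { sum, prod, min, max };

    template <reduction op>
    void run_entry_static() { /* one template instantiation per op */ }

    void run_entry_dynamic(reduction /*op*/) { /* single instantiation; op read at runtime */ }

    // Old style, as in the removed helper: compile-time fan-out.
    void dispatch_old(reduction op) {
        switch (op) {
            case reduction::sum: run_entry_static<reduction::sum>(); break;
            case reduction::prod: run_entry_static<reduction::prod>(); break;
            case reduction::min: run_entry_static<reduction::min>(); break;
            case reduction::max: run_entry_static<reduction::max>(); break;
            default: throw std::runtime_error("unknown reduction");
        }
    }

    // New style: the value travels inside the runtime parameter block.
    void dispatch_new(reduction op) {
        run_entry_dynamic(op);
    }
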
diff --git a/src/common/comm/l0/communicator/device_group/device_ring_communicator.cpp b/src/common/comm/l0/communicator/device_group/device_ring_communicator.cpp
index 4cff85f0a..95ed26cfe 100644
--- a/src/common/comm/l0/communicator/device_group/device_ring_communicator.cpp
+++ b/src/common/comm/l0/communicator/device_group/device_ring_communicator.cpp
@@ -87,8 +87,50 @@ ccl::event device_group_ring_communicator::allreduce_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::allreduce_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
+    using namespace native;
+
+    static constexpr ccl::group_split_type group_id = base_t::topology_type();
+    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
+
+    if (!is_ready()) {
+        throw ccl::exception(std::string(
+            "Device communicator for group_id: " + ::to_string(group_id) +
+            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
+    }
+
+    size_t ring_index = 0;
+
+    int comm_rank = rank();
+    LOG_DEBUG("communicator for device idx: ",
+              get_device_path(),
+              ", rank idx: , ring_index: ",
+              comm_rank,
+              ring_index);
+
+    //TODO make const!
+    ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf),
+                                 count * ccl::get_datatype_size(dtype),
+                                 0,
+                                 ccl_buffer_type::INDIRECT);
+    ccl_buffer recv_entry_buffer(
+        &recv_buf, count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT);
+
+    using community_t = typename device_community_container<class_id>::element_type;
+    community_t community = device_community_impl.get_topology(ring_index);
+
+    const coll_param_gpu params(ccl_coll_allreduce, dtype, reduction);
+
+    return do_collective_op<group_id, class_id, native::l0_allreduce_typed_entry>(
+        communication_device,
+        ctx,
+        community,
+        this->get_native_context(),
+        send_entry_buffer,
+        recv_entry_buffer,
+        count,
+        params,
+        stream);
 }
 
 /* alltoall */
diff --git a/src/common/comm/l0/communicator/device_group/device_ring_communicator_impl.hpp b/src/common/comm/l0/communicator/device_group/device_ring_communicator_impl.hpp
index ac8116162..b6dfaea7a 100644
--- a/src/common/comm/l0/communicator/device_group/device_ring_communicator_impl.hpp
+++ b/src/common/comm/l0/communicator/device_group/device_ring_communicator_impl.hpp
@@ -88,50 +88,14 @@ ccl::event device_group_ring_communicator::allreduce_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::allreduce_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    using namespace native;
-
-    static constexpr ccl::group_split_type group_id = base_t::topology_type();
-    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
-
-    if (!is_ready()) {
-        throw ccl::exception(std::string(
-            "Device communicator for group_id: " + ::to_string(group_id) +
-            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
-    }
-
-    size_t ring_index = 0;
-
-    int comm_rank = rank();
-    LOG_DEBUG("communicator for device idx: ",
-              get_device_path(),
-              ", rank idx: , ring_index: ",
-              comm_rank,
-              ring_index);
-
-    //TODO make const!
-    ccl_buffer send_entry_buffer(const_cast<buffer_type**>(&send_buf),
-                                 count * sizeof(buffer_type),
-                                 0,
-                                 ccl_buffer_type::INDIRECT);
-    ccl_buffer recv_entry_buffer(
-        &recv_buf, count * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT);
-
-    using community_t = typename device_community_container<class_id>::element_type;
-    community_t community = device_community_impl.get_topology(ring_index);
-
-    return do_collective_op_reductions<buffer_type,
-                                       group_id,
-                                       class_id,
-                                       native::l0_allreduce_typed_entry>(reduction,
-                                                                         communication_device,
-                                                                         ctx,
-                                                                         community,
-                                                                         this->get_native_context(),
-                                                                         send_entry_buffer,
-                                                                         recv_entry_buffer,
-                                                                         count,
-                                                                         reduction,
-                                                                         stream);
+    return allreduce_impl(static_cast<const void*>(send_buf),
+                          static_cast<void*>(recv_buf),
+                          count,
+                          ccl::native_type_info<buffer_type>::dtype,
+                          reduction,
+                          stream,
+                          attr,
+                          deps);
 }
 
 template <class buffer_type>
diff --git a/src/common/comm/l0/communicator/process_group/process_a2a_communicator.cpp b/src/common/comm/l0/communicator/process_group/process_a2a_communicator.cpp
index 43bd9fcc0..4f37872e9 100644
--- a/src/common/comm/l0/communicator/process_group/process_a2a_communicator.cpp
+++ b/src/common/comm/l0/communicator/process_group/process_a2a_communicator.cpp
@@ -79,8 +79,127 @@ ccl::event process_a2a_communicator::allreduce_impl(const void* send_buf,
                                                     const ccl::stream::impl_value_t& stream,
                                                     const ccl::allreduce_attr& attr,
                                                     const ccl::vector_class<ccl::event>& deps) {
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
+    using namespace native;
+
+    static constexpr ccl::group_split_type group_id = base_t::topology_type();
+    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
+
+    if (!is_ready()) {
+        throw ccl::exception(std::string(
+            "Device communicator for group_id: " + ::to_string(group_id) +
+            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
+    }
+
+    int comm_rank = rank();
+    LOG_DEBUG("communicator for device idx: ", get_device_path(), ", rank idx: ", comm_rank);
+
+    //TODO make const!
+    ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf),
+                                 count * ccl::get_datatype_size(dtype),
+                                 0,
+                                 ccl_buffer_type::INDIRECT);
+    ccl_buffer recv_entry_buffer(
+        &recv_buf, count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT);
+
+    using community_t = typename device_community_container<class_id>::element_type;
+    community_t community = device_community_impl.get_topology();
+
+    const auto& in_process_gpu_storage = community->get_devices<ccl_gpu_comm>();
+    const auto& virtual_process_gpu_storage = community->get_devices<ccl_virtual_gpu_comm>();
+
+    auto& ipc_gpu_storage = community->get_devices<ccl_ipc_gpu_comm>();
+    (void)ipc_gpu_storage;
+    auto& in_process_ipc_source_real_gpu_storage =
+        community->get_devices<ccl_ipc_source_gpu_comm<ccl_gpu_comm>>();
+    auto& in_process_ipc_source_virtual_gpu_storage =
+        community->get_devices<ccl_ipc_source_gpu_comm<ccl_virtual_gpu_comm>>();
+
+    allied_process_group_scheduler::thread_schedule_ptr schedule;
+    // the source for the collective operation is an IPC source, a real GPU, or a virtual GPU
+    auto ipc_src_real_it = in_process_ipc_source_real_gpu_storage.find(comm_rank);
+    if (ipc_src_real_it != in_process_ipc_source_real_gpu_storage.end()) {
+        LOG_DEBUG("Invoke: ", ipc_src_real_it->second->to_string());
+        /*
+        using gpu_allreduce_entry = l0_allreduce_typed_entry<ccl_ipc_source_gpu_comm<ccl_gpu_comm>,
+                                                             group_id>;
+
+        schedule =
+                ctx->scheduler_impl->submit_entry_ipc<gpu_allreduce_entry, ccl_sched_add_back>(process_id,
+                                                                                               thread_id,
+                                                                                               *device_community_impl,
+                                                                                               ipc_src_real_it->second,
+                                                                                               send_entry_buffer,
+                                                                                               recv_entry_buffer,
+                                                                                               count,
+                                                                                               dtype,
+                                                                                               reduction);
+        */
+    }
+    else {
+        auto ipc_src_virt_it = in_process_ipc_source_virtual_gpu_storage.find(comm_rank);
+        if (ipc_src_virt_it != in_process_ipc_source_virtual_gpu_storage.end()) {
+            LOG_DEBUG("Invoke: ", ipc_src_virt_it->second->to_string());
+            /*
+            using gpu_allreduce_entry = l0_allreduce_typed_entry<ccl_ipc_source_gpu_comm<ccl_virtual_gpu_comm>,
+                                                                group_id>;
+
+            schedule =
+                    ctx->scheduler_impl->submit_entry_ipc<gpu_allreduce_entry, ccl_sched_add_back>(process_id,
+                                                                                                thread_id,
+                                                                                            *device_community_impl,
+                                                                                            ipc_src_virt_it->second,
+                                                                                            send_entry_buffer,
+                                                                                            recv_entry_buffer,
+                                                                                            count,
+                                                                                            dtype,
+                                                                                            reduction);
+            */
+        }
+        else {
+            auto real_device_it = in_process_gpu_storage.find(comm_rank);
+            if (real_device_it != in_process_gpu_storage.end()) {
+                LOG_DEBUG("Invoke: ", real_device_it->second->to_string());
+                /*
+                using gpu_allreduce_entry = l0_allreduce_typed_entry<ccl_gpu_comm, group_id>;
+
+                schedule =
+                        ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(process_id,
+                                                                                                thread_id,
+                                                                                                *device_community_impl,
+                                                                                                real_device_it->second,send_entry_buffer,
+                                                                                                recv_entry_buffer,
+                                                                                                count,
+                                                                                                dtype,
+                                                                                                reduction);
+                */
+            }
+            else {
+                auto virtual_device_it = virtual_process_gpu_storage.find(comm_rank);
+                if (virtual_device_it != virtual_process_gpu_storage.end()) {
+                    LOG_DEBUG("Invoke: ", virtual_device_it->second->to_string());
+                    /*
+                    using gpu_allreduce_entry = l0_allreduce_typed_entry<ccl_virtual_gpu_comm, group_id>;
+
+                    schedule =
+                        ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(process_id,
+                                                                                                thread_id,
+                                                                                                *device_community_impl,
+                                                                                                virtual_device_it->second,send_entry_buffer,
+                                                                                                recv_entry_buffer,
+                                                                                                count,
+                                                                                                dtype,
+                                                                                                reduction);
+                    */
+                }
+            }
+        }
+    }
+
+    // if the schedule was not created, the returned event wraps a null schedule
+    if (schedule) {
+        LOG_DEBUG("Device group finalized");
+    }
+    return std::unique_ptr<ccl::event_impl>(new ccl::gpu_shared_event_impl(std::move(schedule)));
 }
 
 /* alltoall */
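
Note: the if/else cascade above resolves which device instance owns comm_rank by probing the community storages in priority order: IPC-source over real GPU, IPC-source over virtual GPU, plain real GPU, then plain virtual GPU. A compressed sketch of the same resolution logic, with device_t standing in for the various ccl_*_comm wrappers:

    #include <map>

    struct device_t; // opaque stand-in for the ccl_*_comm device wrappers
    using storage_t = std::map<int, device_t*>;

    // Probe in priority order; nullptr means no local device owns the rank,
    // so the schedule stays empty and the event wraps a null schedule.
    device_t* resolve_rank(int rank,
                           const storage_t& ipc_src_real,
                           const storage_t& ipc_src_virtual,
                           const storage_t& real,
                           const storage_t& virt) {
        for (const storage_t* s : { &ipc_src_real, &ipc_src_virtual, &real, &virt }) {
            auto it = s->find(rank);
            if (it != s->end())
                return it->second;
        }
        return nullptr;
    }
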
diff --git a/src/common/comm/l0/communicator/process_group/process_a2a_communicator_impl.hpp b/src/common/comm/l0/communicator/process_group/process_a2a_communicator_impl.hpp
index 72dc9a11c..3f06af5e6 100644
--- a/src/common/comm/l0/communicator/process_group/process_a2a_communicator_impl.hpp
+++ b/src/common/comm/l0/communicator/process_group/process_a2a_communicator_impl.hpp
@@ -83,126 +83,14 @@ ccl::event process_a2a_communicator::allreduce_impl(const buffer_type* send_buf,
                                                     const ccl::stream::impl_value_t& stream,
                                                     const ccl::allreduce_attr& attr,
                                                     const ccl::vector_class<ccl::event>& deps) {
-    using namespace native;
-
-    static constexpr ccl::group_split_type group_id = base_t::topology_type();
-    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
-
-    if (!is_ready()) {
-        throw ccl::exception(std::string(
-            "Device communicator for group_id: " + ::to_string(group_id) +
-            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
-    }
-
-    int comm_rank = rank();
-    LOG_DEBUG("communicator for device idx: ", get_device_path(), ", rank idx: ", comm_rank);
-
-    //TODO make const!
-    ccl_buffer send_entry_buffer(const_cast<buffer_type**>(&send_buf),
-                                 count * sizeof(buffer_type),
-                                 0,
-                                 ccl_buffer_type::INDIRECT);
-    ccl_buffer recv_entry_buffer(
-        &recv_buf, count * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT);
-
-    using community_t = typename device_community_container<class_id>::element_type;
-    community_t community = device_community_impl.get_topology();
-
-    const auto& in_process_gpu_storage = community->get_devices<ccl_gpu_comm>();
-    const auto& virtual_process_gpu_storage = community->get_devices<ccl_virtual_gpu_comm>();
-
-    auto& ipc_gpu_storage = community->get_devices<ccl_ipc_gpu_comm>();
-    (void)ipc_gpu_storage;
-    auto& in_process_ipc_source_real_gpu_storage =
-        community->get_devices<ccl_ipc_source_gpu_comm<ccl_gpu_comm>>();
-    auto& in_process_ipc_source_virtual_gpu_storage =
-        community->get_devices<ccl_ipc_source_gpu_comm<ccl_virtual_gpu_comm>>();
-
-    allied_process_group_scheduler::thread_schedule_ptr schedule;
-    //source for collective operation is ipc sources, real gpu or virtual gpu
-    auto ipc_src_real_it = in_process_ipc_source_real_gpu_storage.find(comm_rank);
-    if (ipc_src_real_it != in_process_ipc_source_real_gpu_storage.end()) {
-        LOG_DEBUG("Invoke: ", ipc_src_real_it->second->to_string());
-        /*
-        using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type,
-                                                             ccl_ipc_source_gpu_comm<ccl_gpu_comm>,
-                                                             group_id>;
-
-        schedule =
-                ctx->scheduler_impl->submit_entry_ipc<gpu_allreduce_entry, ccl_sched_add_back>(process_id,
-                                                                                               thread_id,
-                                                                                               *device_community_impl,
-                                                                                               ipc_src_real_it->second,
-                                                                                               send_entry_buffer,
-                                                                                               recv_entry_buffer,
-                                                                                               count,
-                                                                                               reduction);
-*/
-    }
-    else {
-        auto ipc_src_virt_it = in_process_ipc_source_virtual_gpu_storage.find(comm_rank);
-        if (ipc_src_virt_it != in_process_ipc_source_virtual_gpu_storage.end()) {
-            LOG_DEBUG("Invoke: ", ipc_src_virt_it->second->to_string());
-            /*
-        using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type,
-                                                             ccl_ipc_source_gpu_comm<ccl_virtual_gpu_comm>,
-                                                             group_id>;
-
-        schedule =
-                ctx->scheduler_impl->submit_entry_ipc<gpu_allreduce_entry, ccl_sched_add_back>(process_id,
-                                                                                            thread_id,
-                                                                                           *device_community_impl,
-                                                                                           ipc_src_virt_it->second,
-                                                                                           send_entry_buffer,
-                                                                                           recv_entry_buffer,
-                                                                                           count,
-                                                                                           reduction);
-*/
-        }
-        else {
-            auto real_device_it = in_process_gpu_storage.find(comm_rank);
-            if (real_device_it != in_process_gpu_storage.end()) {
-                LOG_DEBUG("Invoke: ", real_device_it->second->to_string());
-                /*
-        using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, ccl_gpu_comm, group_id>;
-
-        schedule =
-                ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(process_id,
-                                                                                           thread_id,
-                                                                                           *device_community_impl,
-                                                                                           real_device_it->second,send_entry_buffer,
-                                                                                           recv_entry_buffer,
-                                                                                           count,
-                                                                                           reduction);
-*/
-            }
-            else {
-                auto virtual_device_it = virtual_process_gpu_storage.find(comm_rank);
-                if (virtual_device_it != virtual_process_gpu_storage.end()) {
-                    LOG_DEBUG("Invoke: ", virtual_device_it->second->to_string());
-                    /*
-        using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, ccl_virtual_gpu_comm, group_id>;
-
-
-        schedule =
-            ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(process_id,
-                                                                                       thread_id,
-                                                                                       *device_community_impl,
-                                                                                       virtual_device_it->second,send_entry_buffer,
-                                                                                       recv_entry_buffer,
-                                                                                       count,
-                                                                                       reduction);
-    */
-                }
-            }
-        }
-    }
-
-    //if sched is not ready - send NULL
-    if (schedule) {
-        LOG_DEBUG("Device group finalized");
-    }
-    return std::unique_ptr<ccl::event_impl>(new ccl::gpu_shared_event_impl(std::move(schedule)));
+    return allreduce_impl(static_cast<const void*>(send_buf),
+                          static_cast<void*>(recv_buf),
+                          count,
+                          ccl::native_type_info<buffer_type>::dtype,
+                          reduction,
+                          stream,
+                          attr,
+                          deps);
 }
 
 template <class buffer_type>
diff --git a/src/common/comm/l0/communicator/process_group/process_communicator_utils.hpp b/src/common/comm/l0/communicator/process_group/process_communicator_utils.hpp
index ac49c3984..0984c5911 100644
--- a/src/common/comm/l0/communicator/process_group/process_communicator_utils.hpp
+++ b/src/common/comm/l0/communicator/process_group/process_communicator_utils.hpp
@@ -17,10 +17,9 @@
 #include "common/comm/l0/devices/devices_declaration.hpp"
 #include "common/comm/l0/device_community.hpp"
 
-template <class kernel_params,
-          ccl::group_split_type group_id,
+template <ccl::group_split_type group_id,
           ccl::device_topology_type class_id,
-          template <class, class, ccl::group_split_type>
+          template <class, ccl::group_split_type>
           class algorithm>
 struct communication_process_device_expander {
     template <class device_t, class... Args>
@@ -33,7 +32,7 @@ struct communication_process_device_expander {
         if (comm_device) {
             LOG_DEBUG("Invoke: ", comm_device->to_string());
 
-            using gpu_entry = algorithm<kernel_params, device_t, group_id>;
+            using gpu_entry = algorithm<device_t, group_id>;
 
             schedule = ctx->scheduler_impl
                            ->submit_entry<gpu_entry, ccl_sched_add_back, group_id, class_id>(
@@ -48,13 +47,14 @@ struct communication_process_device_expander {
     std::shared_ptr<ccl_gpu_sched> schedule;
 };
 
-template <class kernel_params,
-          ccl::group_split_type group_id,
+template <ccl::group_split_type group_id,
           ccl::device_topology_type class_id,
-          template <class, class, ccl::group_split_type>
+          template <class, ccl::group_split_type>
           class algorithm,
           class... Args>
 std::unique_ptr<ccl::event_impl> do_collective_op(
+    // TODO: can we avoid using device_variant here? It instantiates the entry
+    // for each device type, which makes compilation slow
     native::device_variant_t<native::ccl_gpu_comm,
                              native::ccl_virtual_gpu_comm,
                              native::ccl_ipc_source_gpu_comm<native::ccl_gpu_comm>,
@@ -70,7 +70,7 @@ std::unique_ptr<ccl::event_impl> do_collective_op(
     size_t thread_id,
     native::ccl_driver_context_ptr native_context,
     Args&&... args) {
-    communication_process_device_expander<kernel_params, group_id, class_id, algorithm> expander;
+    communication_process_device_expander<group_id, class_id, algorithm> expander;
     ccl_tuple_for_each_args(communication_device,
                             expander,
                             ctx,
@@ -85,97 +85,3 @@ std::unique_ptr<ccl::event_impl> do_collective_op(
     return std::unique_ptr<ccl::event_impl>(
         new ccl::gpu_shared_event_impl(std::move(expander.schedule)));
 }
-
-template <class buffer_type,
-          ccl::group_split_type group_id,
-          ccl::device_topology_type class_id,
-          template <class, class, ccl::group_split_type>
-          class algorithm,
-          class... Args>
-std::unique_ptr<ccl::event_impl> do_collective_op_reductions(
-    ccl::reduction reduction,
-    native::device_variant_t<native::ccl_gpu_comm,
-                             native::ccl_virtual_gpu_comm,
-                             native::ccl_ipc_source_gpu_comm<native::ccl_gpu_comm>,
-                             native::ccl_ipc_source_gpu_comm<native::ccl_virtual_gpu_comm>,
-                             native::ccl_numa_proxy<native::ccl_gpu_comm>,
-                             native::ccl_numa_proxy<native::ccl_virtual_gpu_comm>,
-                             native::ccl_scaleout_proxy<native::ccl_gpu_comm>,
-                             native::ccl_scaleout_proxy<native::ccl_virtual_gpu_comm>>&
-        communication_device,
-    std::shared_ptr<native::process_group_context>& ctx,
-    typename native::device_community_container<class_id>::element_type community,
-    size_t process_id,
-    size_t thread_id,
-    native::ccl_driver_context_ptr native_context,
-    Args&&... args) {
-    switch (reduction) {
-        case ccl::reduction::sum:
-            return do_collective_op<
-                kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::sum>,
-                group_id,
-                class_id,
-                algorithm>(communication_device,
-                           ctx,
-                           community,
-                           process_id,
-                           thread_id,
-                           native_context,
-                           std::forward<Args>(args)...);
-            break;
-        case ccl::reduction::prod:
-            return do_collective_op<
-                kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::prod>,
-                group_id,
-                class_id,
-                algorithm>(communication_device,
-                           ctx,
-                           community,
-                           process_id,
-                           thread_id,
-                           native_context,
-                           std::forward<Args>(args)...);
-            break;
-        case ccl::reduction::min:
-            return do_collective_op<
-                kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::min>,
-                group_id,
-                class_id,
-                algorithm>(communication_device,
-                           ctx,
-                           community,
-                           process_id,
-                           thread_id,
-                           native_context,
-                           std::forward<Args>(args)...);
-            break;
-        case ccl::reduction::max:
-            return do_collective_op<
-                kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::max>,
-                group_id,
-                class_id,
-                algorithm>(communication_device,
-                           ctx,
-                           community,
-                           process_id,
-                           thread_id,
-                           native_context,
-                           std::forward<Args>(args)...);
-            break;
-        // TODO: make support of custom reduction in *.cl
-        // case ccl::reduction::custom:
-        //     return do_collective_op<kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::custom>,
-        //                            group_id, class_id, algorithm>(
-        //                                                      communication_device,
-        //                                                      ctx,
-        //                                                      community,
-        //                                                      process_id,
-        //                                                      thread_id,
-        //                                                      native_context,
-        //                                                      std::forward<Args>(args)...);
-        //     break;
-        default:
-            throw std::runtime_error(std::string(__PRETTY_FUNCTION__) +
-                                     "Obtained reduction by user is incorrect!");
-    }
-}
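
Note on the device_variant TODO above: ccl_tuple_for_each_args applies the expander's templated operator() to every alternative, so algorithm<device_t, group_id> is instantiated once per device type even though only one alternative is non-null at runtime. A minimal model of that expansion, with hypothetical names:

    #include <tuple>

    template <class device_t>
    struct entry {
        static void run(device_t&) { /* kernel submission */ }
    };

    struct expander {
        template <class device_t>
        void operator()(device_t* dev) const {
            if (dev)
                entry<device_t>::run(*dev); // stamped out for every device_t
        }
    };

    // Visits all alternatives; each distinct element of device_ts costs a
    // separate entry<device_t> instantiation at compile time.
    template <class... device_ts>
    void for_each_alternative(const std::tuple<device_ts*...>& devices) {
        std::apply([](auto*... d) { (expander{}(d), ...); }, devices);
    }
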
diff --git a/src/common/comm/l0/communicator/process_group/process_ring_communicator.cpp b/src/common/comm/l0/communicator/process_group/process_ring_communicator.cpp
index 5f96958cd..4c79f883e 100644
--- a/src/common/comm/l0/communicator/process_group/process_ring_communicator.cpp
+++ b/src/common/comm/l0/communicator/process_group/process_ring_communicator.cpp
@@ -62,8 +62,52 @@ ccl::event process_ring_communicator::allgatherv_impl(const void* send_buf,
                                                       const ccl::stream::impl_value_t& stream,
                                                       const ccl::allgatherv_attr& attr,
                                                       const ccl::vector_class<ccl::event>& deps) {
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
+    using namespace native;
+
+    static constexpr ccl::group_split_type group_id = base_t::topology_type();
+    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
+
+    if (!is_ready()) {
+        throw ccl::exception(std::string(
+            "Device communicator for group_id: " + ::to_string(group_id) +
+            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
+    }
+
+    int comm_rank = rank();
+    size_t ring_index = 0;
+    LOG_DEBUG("communicator for device idx: ",
+              get_device_path(),
+              ", rank idx: ",
+              comm_rank,
+              ", ring_index :",
+              ring_index);
+
+    //TODO make const!
+    ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf),
+                                 send_count * ccl::get_datatype_size(dtype),
+                                 0,
+                                 ccl_buffer_type::INDIRECT);
+    ccl_buffer recv_entry_buffer(
+        &recv_buf, send_count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT);
+
+    using community_t = typename device_community_container<class_id>::element_type;
+    community_t community = device_community_impl.get_topology(ring_index);
+
+    const coll_param_gpu params(ccl_coll_allgatherv, dtype);
+
+    return do_collective_op<group_id, class_id, l0_allgatherv_typed_entry>(
+        communication_device,
+        ctx,
+        community,
+        process_id,
+        thread_id,
+        this->get_native_context(),
+        send_entry_buffer,
+        send_count,
+        recv_entry_buffer,
+        recv_counts.data(),
+        params,
+        stream);
 }
 ccl::event process_ring_communicator::allgatherv_impl(const void* send_buf,
                                                       size_t send_count,
@@ -87,8 +131,52 @@ ccl::event process_ring_communicator::allreduce_impl(const void* send_buf,
                                                      const ccl::stream::impl_value_t& stream,
                                                      const ccl::allreduce_attr& attr,
                                                      const ccl::vector_class<ccl::event>& deps) {
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
+    using namespace native;
+
+    static constexpr ccl::group_split_type group_id = base_t::topology_type();
+    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
+
+    if (!is_ready()) {
+        throw ccl::exception(std::string(
+            "Device communicator for group_id: " + ::to_string(group_id) +
+            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
+    }
+
+    int comm_rank = rank();
+    size_t ring_index = 0;
+    LOG_DEBUG("communicator for device idx: ",
+              get_device_path(),
+              ", rank idx: ",
+              comm_rank,
+              ", ring_index: ",
+              ring_index);
+
+    //TODO make const!
+    ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf),
+                                 count * ccl::get_datatype_size(dtype),
+                                 0,
+                                 ccl_buffer_type::INDIRECT);
+    ccl_buffer recv_entry_buffer(
+        &recv_buf, count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT);
+
+    using community_t = typename device_community_container<class_id>::element_type;
+    community_t community = device_community_impl.get_topology(ring_index);
+
+    // TODO: dtype could be derived from the buffer_type template argument; no need for a separate parameter
+    const coll_param_gpu params(ccl_coll_allreduce, dtype, reduction);
+
+    return do_collective_op<group_id, class_id, l0_allreduce_typed_entry>(
+        communication_device,
+        ctx,
+        community,
+        process_id,
+        thread_id,
+        this->get_native_context(),
+        send_entry_buffer,
+        recv_entry_buffer,
+        count,
+        params,
+        stream);
 }
 
 /* alltoall */
@@ -122,8 +210,55 @@ ccl::event process_ring_communicator::alltoallv_impl(const void* send_buf,
                                                      const ccl::stream::impl_value_t& stream,
                                                      const ccl::alltoallv_attr& attr,
                                                      const ccl::vector_class<ccl::event>& deps) {
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
+    using namespace native;
+    static constexpr ccl::group_split_type group_id = base_t::topology_type();
+    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
+
+    if (!is_ready()) {
+        throw ccl::exception(std::string(
+            "Device communicator for group_id: " + ::to_string(group_id) +
+            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
+    }
+
+    int comm_rank = rank();
+    size_t ring_index = 0;
+    LOG_DEBUG("communicator for device idx: ",
+              get_device_path(),
+              ", rank idx: ",
+              comm_rank,
+              ", ring_index :",
+              ring_index);
+    size_t total_send_counts = std::accumulate(std::begin(send_counts), std::end(send_counts), size_t{0});
+    //TODO make const!
+    ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf),
+                                 total_send_counts * ccl::get_datatype_size(dtype),
+                                 0,
+                                 ccl_buffer_type::INDIRECT);
+
+    size_t total_recv_counts = std::accumulate(std::begin(recv_counts), std::end(recv_counts), size_t{0});
+    ccl_buffer recv_entry_buffer(
+        &recv_buf, total_recv_counts * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT);
+
+    using community_t = typename device_community_container<class_id>::element_type;
+    community_t community = device_community_impl.get_topology(ring_index);
+
+    const coll_param_gpu params(ccl_coll_alltoallv, dtype);
+
+    return do_collective_op<group_id, class_id, l0_alltoallv_typed_entry>(
+        communication_device,
+        ctx,
+        community,
+        process_id,
+        thread_id,
+        this->get_native_context(),
+        send_entry_buffer,
+        send_counts.data(),
+        total_send_counts,
+        recv_entry_buffer,
+        recv_counts.data(),
+        total_recv_counts,
+        params,
+        stream);
 }
 ccl::event process_ring_communicator::alltoallv_impl(const ccl::vector_class<void*>& send_buf,
                                                      const ccl::vector_class<size_t>& send_counts,
@@ -146,8 +281,46 @@ ccl::event process_ring_communicator::broadcast_impl(void* buf,
                                                      const ccl::stream::impl_value_t& stream,
                                                      const ccl::broadcast_attr& attr,
                                                      const ccl::vector_class<ccl::event>& deps) {
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
+    using namespace native;
+
+    static constexpr ccl::group_split_type group_id = base_t::topology_type();
+    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
+
+    if (!is_ready()) {
+        throw ccl::exception(std::string(
+            "Device communicator for group_id: " + ::to_string(group_id) +
+            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
+    }
+
+    int comm_rank = rank();
+    size_t ring_index = 0;
+    LOG_DEBUG("communicator for device idx: ",
+              get_device_path(),
+              ", rank idx: ",
+              comm_rank,
+              ", ring_index :",
+              ring_index);
+
+    //TODO make const!
+    ccl_buffer entry_buffer(
+        &buf, count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT);
+
+    using community_t = typename device_community_container<class_id>::element_type;
+    community_t community = device_community_impl.get_topology(ring_index);
+
+    const coll_param_gpu params(ccl_coll_bcast, dtype);
+
+    return do_collective_op<group_id, class_id, l0_bcast_typed_entry>(communication_device,
+                                                                      ctx,
+                                                                      community,
+                                                                      process_id,
+                                                                      thread_id,
+                                                                      this->get_native_context(),
+                                                                      entry_buffer,
+                                                                      count,
+                                                                      root,
+                                                                      params,
+                                                                      stream);
 }
 
 /* reduce */
@@ -160,8 +333,52 @@ ccl::event process_ring_communicator::reduce_impl(const void* send_buf,
                                                   const ccl::stream::impl_value_t& stream,
                                                   const ccl::reduce_attr& attr,
                                                   const ccl::vector_class<ccl::event>& deps) {
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
+    using namespace native;
+
+    static constexpr ccl::group_split_type group_id = base_t::topology_type();
+    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
+
+    if (!is_ready()) {
+        throw ccl::exception(std::string(
+            "Device communicator for group_id: " + ::to_string(group_id) +
+            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
+    }
+
+    int comm_rank = rank();
+    size_t ring_index = 0;
+    LOG_DEBUG("communicator for device idx: ",
+              get_device_path(),
+              ", rank idx: ",
+              comm_rank,
+              ", ring_index :",
+              ring_index);
+
+    //TODO make const!
+    ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf),
+                                 count * ccl::get_datatype_size(dtype),
+                                 0,
+                                 ccl_buffer_type::INDIRECT);
+    ccl_buffer recv_entry_buffer(
+        &recv_buf, count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT);
+
+    using community_t = typename device_community_container<class_id>::element_type;
+    community_t community = device_community_impl.get_topology(ring_index);
+
+    const coll_param_gpu params(ccl_coll_reduce, dtype, reduction);
+
+    return do_collective_op<group_id, class_id, l0_reduce_typed_entry>(communication_device,
+                                                                       ctx,
+                                                                       community,
+                                                                       process_id,
+                                                                       thread_id,
+                                                                       this->get_native_context(),
+                                                                       send_entry_buffer,
+                                                                       recv_entry_buffer,
+                                                                       count,
+                                                                       reduction,
+                                                                       root,
+                                                                       params,
+                                                                       stream);
 }
 
 /* reduce_scatter */
@@ -174,8 +391,51 @@ ccl::event process_ring_communicator::reduce_scatter_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::reduce_scatter_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
+    using namespace native;
+
+    static constexpr ccl::group_split_type group_id = base_t::topology_type();
+    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
+
+    if (!is_ready()) {
+        throw ccl::exception(std::string(
+            "Device communicator for group_id: " + ::to_string(group_id) +
+            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
+    }
+
+    int comm_rank = rank();
+    size_t ring_index = 0;
+    LOG_DEBUG("communicator for device idx: ",
+              get_device_path(),
+              ", rank idx: ",
+              comm_rank,
+              ", ring_index :",
+              ring_index);
+
+    //TODO make const!
+    ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf),
+                                 recv_count * ccl::get_datatype_size(dtype),
+                                 0,
+                                 ccl_buffer_type::INDIRECT);
+    ccl_buffer recv_entry_buffer(
+        &recv_buf, recv_count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT);
+
+    using community_t = typename device_community_container<class_id>::element_type;
+    community_t community = device_community_impl.get_topology(ring_index);
+
+    const coll_param_gpu params(ccl_coll_reduce_scatter, dtype, reduction);
+
+    return do_collective_op<group_id, class_id, l0_reduce_scatter_typed_entry>(
+        communication_device,
+        ctx,
+        community,
+        process_id,
+        thread_id,
+        this->get_native_context(),
+        send_entry_buffer,
+        recv_entry_buffer,
+        recv_count,
+        params,
+        stream);
 }
 
 /* sparse_allreduce */
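
Note: all of the ring collectives above now build the same runtime descriptor; the two-argument coll_param_gpu constructor covers operations without a reduction, and the three-argument form adds one. A self-contained sketch of the shape of that parameter block, with hypothetical stand-in names:

    enum coll_type { coll_allgatherv, coll_allreduce, coll_bcast, coll_reduce, coll_reduce_scatter };
    enum class datatype { float32 };
    enum class reduction { sum, prod, min, max };

    // Stand-in mirroring the two constructor forms used in the hunks above.
    struct param_gpu {
        param_gpu(coll_type t, datatype d)
            : type(t), dt(d), has_reduction(false), op(reduction::sum) {}
        param_gpu(coll_type t, datatype d, reduction r)
            : type(t), dt(d), has_reduction(true), op(r) {}
        coll_type type;
        datatype dt;
        bool has_reduction;
        reduction op;
    };

    // Usage mirrors the patch: param_gpu(coll_bcast, dt) for broadcast,
    // param_gpu(coll_allreduce, dt, reduction::sum) for allreduce.
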
diff --git a/src/common/comm/l0/communicator/process_group/process_ring_communicator.hpp b/src/common/comm/l0/communicator/process_group/process_ring_communicator.hpp
index ecb1c7377..efbe5c801 100644
--- a/src/common/comm/l0/communicator/process_group/process_ring_communicator.hpp
+++ b/src/common/comm/l0/communicator/process_group/process_ring_communicator.hpp
@@ -15,6 +15,7 @@
 */
 #pragma once
 #include "common/comm/l0/communicator/typed_base_communicator.hpp"
+#include "common/comm/usm_visitor/usm_visitors.hpp"
 
 namespace native {
 struct process_group_context;
diff --git a/src/common/comm/l0/communicator/process_group/process_ring_communicator_impl.hpp b/src/common/comm/l0/communicator/process_group/process_ring_communicator_impl.hpp
index b9a59d522..889cb32a3 100644
--- a/src/common/comm/l0/communicator/process_group/process_ring_communicator_impl.hpp
+++ b/src/common/comm/l0/communicator/process_group/process_ring_communicator_impl.hpp
@@ -33,8 +33,14 @@ ccl::event process_ring_communicator::allgatherv_impl(const buffer_type* send_bu
                                                       const ccl::stream::impl_value_t& stream,
                                                       const ccl::allgatherv_attr& attr,
                                                       const ccl::vector_class<ccl::event>& deps) {
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
+    return allgatherv_impl(static_cast<const void*>(send_buf),
+                           send_count,
+                           static_cast<void*>(recv_buf),
+                           recv_counts,
+                           ccl::native_type_info<buffer_type>::dtype,
+                           stream,
+                           attr,
+                           deps);
 }
 
 template <class buffer_type>
@@ -84,166 +90,14 @@ ccl::event process_ring_communicator::allreduce_impl(const buffer_type* send_buf
                                                      const ccl::stream::impl_value_t& stream,
                                                      const ccl::allreduce_attr& attr,
                                                      const ccl::vector_class<ccl::event>& deps) {
-    using namespace native;
-
-    static constexpr ccl::group_split_type group_id = base_t::topology_type();
-    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
-
-    if (!is_ready()) {
-        throw ccl::exception(std::string(
-            "Device communicator for group_id: " + ::to_string(group_id) +
-            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
-    }
-
-    int comm_rank = rank();
-    size_t ring_index = 0;
-    LOG_DEBUG("communicator for device idx: ",
-              get_device_path(),
-              ", rank idx: ",
-              comm_rank,
-              ", ring_index: ",
-              ring_index);
-
-    //TODO make const!
-    ccl_buffer send_entry_buffer(const_cast<buffer_type**>(&send_buf),
-                                 count * sizeof(buffer_type),
-                                 0,
-                                 ccl_buffer_type::INDIRECT);
-    ccl_buffer recv_entry_buffer(
-        &recv_buf, count * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT);
-
-    using community_t = typename device_community_container<class_id>::element_type;
-    community_t community = device_community_impl.get_topology(ring_index);
-
-    return do_collective_op_reductions<buffer_type, group_id, class_id, l0_allreduce_typed_entry>(
-        reduction,
-        communication_device,
-        ctx,
-        community,
-        process_id,
-        thread_id,
-        this->get_native_context(),
-        send_entry_buffer,
-        recv_entry_buffer,
-        count,
-        reduction,
-        stream);
-
-    /*
-
-    const auto& in_process_gpu_storage = community->get_devices<ccl_gpu_comm>();
-    const auto& virtual_process_gpu_storage = community->get_devices<ccl_virtual_gpu_comm>();
-
-    auto& ipc_gpu_storage = community->get_devices<ccl_ipc_gpu_comm>();
-    (void)ipc_gpu_storage;
-    auto& in_process_ipc_source_real_gpu_storage =
-        community->get_devices<ccl_ipc_source_gpu_comm<ccl_gpu_comm>>();
-    auto& in_process_ipc_source_virtual_gpu_storage =
-        community->get_devices<ccl_ipc_source_gpu_comm<ccl_virtual_gpu_comm>>();
-
-    allied_process_group_scheduler::thread_schedule_ptr schedule;
-    //source for collective operation is ipc sources, real gpu or virtual gpu
-    auto ipc_src_real_it = in_process_ipc_source_real_gpu_storage.find(comm_rank);
-    if (ipc_src_real_it != in_process_ipc_source_real_gpu_storage.end()) {
-        LOG_DEBUG("Invoke: ", ipc_src_real_it->second->to_string());
-
-        using gpu_allreduce_entry =
-            l0_allreduce_typed_entry<buffer_type, ccl_ipc_source_gpu_comm<ccl_gpu_comm>, group_id>;
-
-        schedule =
-            ctx->scheduler_impl
-                ->submit_entry_ipc<gpu_allreduce_entry, ccl_sched_add_back, group_id, class_id>(
-                    process_id,
-                    thread_id,
-                    *community,
-                    ipc_src_real_it->second,
-                    this->get_native_context(),
-                    send_entry_buffer,
-                    recv_entry_buffer,
-                    count,
-                    reduction,
-                    stream);
-    }
-    else {
-        auto ipc_src_virt_it = in_process_ipc_source_virtual_gpu_storage.find(comm_rank);
-        if (ipc_src_virt_it != in_process_ipc_source_virtual_gpu_storage.end()) {
-            LOG_DEBUG("Invoke: ", ipc_src_virt_it->second->to_string());
-
-            using gpu_allreduce_entry =
-                l0_allreduce_typed_entry<buffer_type,
-                                         ccl_ipc_source_gpu_comm<ccl_virtual_gpu_comm>,
-                                         group_id>;
-
-            schedule =
-                ctx->scheduler_impl
-                    ->submit_entry_ipc<gpu_allreduce_entry, ccl_sched_add_back, group_id, class_id>(
-                        process_id,
-                        thread_id,
-                        *community,
-                        ipc_src_virt_it->second,
-                        this->get_native_context(),
-                        send_entry_buffer,
-                        recv_entry_buffer,
-                        count,
-                        reduction,
-                        stream);
-        }
-        else {
-            auto real_device_it = in_process_gpu_storage.find(comm_rank);
-            if (real_device_it != in_process_gpu_storage.end()) {
-                LOG_DEBUG("Invoke: ", real_device_it->second->to_string());
-
-                using gpu_allreduce_entry =
-                    l0_allreduce_typed_entry<buffer_type, ccl_gpu_comm, group_id>;
-
-                schedule =
-                    ctx->scheduler_impl->submit_entry_ipc<gpu_allreduce_entry,
-                                                          ccl_sched_add_back,
-                                                          group_id,
-                                                          class_id>(process_id,
-                                                                    thread_id,
-                                                                    *community,
-                                                                    real_device_it->second,
-                                                                    this->get_native_context(),
-                                                                    send_entry_buffer,
-                                                                    recv_entry_buffer,
-                                                                    count,
-                                                                    reduction,
-                                                                    stream);
-            }
-            else {
-                auto virtual_device_it = virtual_process_gpu_storage.find(comm_rank);
-                if (virtual_device_it != virtual_process_gpu_storage.end()) {
-                    LOG_DEBUG("Invoke: ", virtual_device_it->second->to_string());
-                    using gpu_allreduce_entry =
-                        l0_allreduce_typed_entry<buffer_type, ccl_virtual_gpu_comm, group_id>;
-
-                    schedule =
-                        ctx->scheduler_impl->submit_entry_ipc<gpu_allreduce_entry,
-                                                              ccl_sched_add_back,
-                                                              group_id,
-                                                              class_id>(process_id,
-                                                                        thread_id,
-                                                                        *community,
-                                                                        virtual_device_it->second,
-                                                                        this->get_native_context(),
-                                                                        send_entry_buffer,
-                                                                        recv_entry_buffer,
-                                                                        count,
-                                                                        reduction,
-                                                                        stream);
-                }
-            }
-        }
-    }
-
-    //if sched is not ready - send NULL
-    if (schedule) {
-        LOG_DEBUG("Device group finalized");
-    }
-    return std::unique_ptr<ccl::event_impl>(
-        new ccl::gpu_shared_event_impl(std::move(schedule)));
-    */
+    return allreduce_impl(static_cast<const void*>(send_buf),
+                          static_cast<void*>(recv_buf),
+                          count,
+                          ccl::native_type_info<buffer_type>::dtype,
+                          reduction,
+                          stream,
+                          attr,
+                          deps);
 }
 
 template <class buffer_type>
@@ -313,9 +167,16 @@ ccl::event process_ring_communicator::alltoallv_impl(const buffer_type* send_buf
                                                      const ccl::stream::impl_value_t& stream,
                                                      const ccl::alltoallv_attr& attr,
                                                      const ccl::vector_class<ccl::event>& deps) {
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
+    return alltoallv_impl(static_cast<const void*>(send_buf),
+                          send_counts,
+                          static_cast<void*>(recv_buf),
+                          recv_counts,
+                          ccl::native_type_info<buffer_type>::dtype,
+                          stream,
+                          attr,
+                          deps);
 }
+
 template <class buffer_type>
 ccl::event process_ring_communicator::alltoallv_impl(
     const ccl::vector_class<buffer_type*>& send_buf,
@@ -363,8 +224,13 @@ ccl::event process_ring_communicator::broadcast_impl(buffer_type* buf,
                                                      const ccl::stream::impl_value_t& stream,
                                                      const ccl::broadcast_attr& attr,
                                                      const ccl::vector_class<ccl::event>& deps) {
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
+    return broadcast_impl(static_cast<void*>(buf),
+                          count,
+                          ccl::native_type_info<buffer_type>::dtype,
+                          root,
+                          stream,
+                          attr,
+                          deps);
 }
 
 template <class buffer_type>
@@ -388,8 +254,15 @@ ccl::event process_ring_communicator::reduce_impl(const buffer_type* send_buf,
                                                   const ccl::stream::impl_value_t& stream,
                                                   const ccl::reduce_attr& attr,
                                                   const ccl::vector_class<ccl::event>& deps) {
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
+    return reduce_impl(static_cast<const void*>(send_buf),
+                       static_cast<void*>(recv_buf),
+                       count,
+                       ccl::native_type_info<buffer_type>::dtype,
+                       reduction,
+                       root,
+                       stream,
+                       attr,
+                       deps);
 }
 
 template <class buffer_type>
@@ -414,8 +287,14 @@ ccl::event process_ring_communicator::reduce_scatter_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::reduce_scatter_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
+    return reduce_scatter_impl(static_cast<const void*>(send_buf),
+                               static_cast<void*>(recv_buf),
+                               recv_count,
+                               ccl::native_type_info<buffer_type>::dtype,
+                               reduction,
+                               stream,
+                               attr,
+                               deps);
 }
 template <class buffer_type>
 ccl::event process_ring_communicator::reduce_scatter_impl(
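
From here on, every typed overload in this impl header collapses into a thin shim over the untyped void* overload, with ccl::native_type_info<buffer_type>::dtype supplying the runtime datatype tag. A self-contained sketch of the forwarding pattern (datatype, native_type_info and allreduce_impl here are local stand-ins mirroring the real names):

    #include <cstddef>
    #include <iostream>

    enum class datatype { int32, float32 };

    // trait mapping a C++ type to a runtime tag, in the spirit of
    // ccl::native_type_info<T>::dtype
    template <class T> struct native_type_info;
    template <> struct native_type_info<int>   { static constexpr datatype dtype = datatype::int32; };
    template <> struct native_type_info<float> { static constexpr datatype dtype = datatype::float32; };

    // the one real implementation works on untyped buffers plus a dtype tag
    void allreduce_impl(const void* send, void* recv, size_t count, datatype dt) {
        std::cout << "allreduce count=" << count
                  << " dtype=" << static_cast<int>(dt) << '\n';
    }

    // the typed overload only forwards
    template <class buffer_type>
    void allreduce_impl(const buffer_type* send, buffer_type* recv, size_t count) {
        allreduce_impl(static_cast<const void*>(send), static_cast<void*>(recv),
                       count, native_type_info<buffer_type>::dtype);
    }

    int main() {
        float a[4] = { 1, 2, 3, 4 }, b[4] = {};
        allreduce_impl(a, b, 4); // dtype resolved from the template parameter
    }

Keeping only a forwarding call in the header also lets the scheduling machinery move into a single translation unit, which is what the TODO added to thread_ring_communicator_impl.hpp later in this patch points at.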
diff --git a/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator.cpp b/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator.cpp
index 416d23ecc..52b714ecd 100644
--- a/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator.cpp
+++ b/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator.cpp
@@ -97,8 +97,78 @@ ccl::event thread_device_group_a2a_communicator::allreduce_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::allreduce_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
+    using namespace native;
+
+    static constexpr ccl::group_split_type group_id = base_t::topology_type();
+    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
+
+    if (!is_ready()) {
+        throw ccl::exception(std::string(
+            "Device communicator for group_id: " + ::to_string(group_id) +
+            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
+    }
+
+    int comm_rank = rank();
+    LOG_DEBUG("communicator for device idx: ", get_device_path(), ", rank idx: ", comm_rank);
+
+    //TODO make const!
+    ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf),
+                                 count * ccl::get_datatype_size(dtype),
+                                 0,
+                                 ccl_buffer_type::INDIRECT);
+    ccl_buffer recv_entry_buffer(
+        &recv_buf, count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT);
+
+    using community_t = typename device_community_container<class_id>::element_type;
+    community_t community = device_community_impl.get_topology();
+
+    const auto& in_process_gpu_storage = community->get_devices<ccl_gpu_comm>();
+    const auto& virtual_process_gpu_storage = community->get_devices<ccl_virtual_gpu_comm>();
+
+    auto& ipc_gpu_storage = community->get_devices<ccl_ipc_gpu_comm>();
+    (void)ipc_gpu_storage;
+
+    thread_group_scheduler::thread_schedule_ptr schedule;
+    // source for the collective operation is a real or virtual gpu
+    auto real_device_it = in_process_gpu_storage.find(comm_rank);
+    if (real_device_it != in_process_gpu_storage.end()) {
+        LOG_DEBUG("Invoke: ", real_device_it->second->to_string());
+        /*
+        using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, ccl_gpu_comm, group_id>;
+
+        schedule =
+                ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(thread_id,
+                                                                                           *ctx->get_thread_topology<thread_device_group_ring_communicator::topology_class()>(thread_id),
+                                                                                           real_device_it->second,send_entry_buffer,
+                                                                                           recv_entry_buffer,
+                                                                                           count,
+                                                                                           reduction);
+        */
+    }
+    else {
+        auto virtual_device_it = virtual_process_gpu_storage.find(comm_rank);
+        if (virtual_device_it != virtual_process_gpu_storage.end()) {
+            LOG_DEBUG("Invoke: ", virtual_device_it->second->to_string());
+            /*
+        using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, ccl_virtual_gpu_comm, group_id>;
+
+
+        schedule =
+            ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(thread_id,
+                                                                                       *ctx->get_thread_topology<thread_device_group_ring_communicator::topology_class()>(thread_id),
+                                                                                       virtual_device_it->second, send_entry_buffer,
+                                                                                       recv_entry_buffer,
+                                                                                       count,
+                                                                                       reduction);
+        */
+        }
+    }
+
+    // if the schedule is not ready, a null schedule is wrapped into the event
+    if (schedule) {
+        LOG_DEBUG("Device group finalized");
+    }
+    return std::unique_ptr<ccl::event_impl>(new ccl::gpu_shared_event_impl(std::move(schedule)));
 }
 
 /* alltoall */
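
The allreduce body moved into this translation unit keeps the rank-to-device resolution chain even though the submit_entry calls themselves are still commented out: the source device for the collective is whichever storage first contains this rank, with real GPUs taking precedence over virtual ones. A small sketch of that ordered-fallback lookup, with std::map standing in for the device storages:

    #include <iostream>
    #include <map>
    #include <string>

    int main() {
        std::map<int, std::string> real_gpus{ { 0, "real:0" } };
        std::map<int, std::string> virtual_gpus{ { 1, "virtual:1" } };
        const int rank = 1;

        // first match wins: real storage, then virtual storage
        if (auto it = real_gpus.find(rank); it != real_gpus.end())
            std::cout << "invoke on " << it->second << '\n';
        else if (auto it = virtual_gpus.find(rank); it != virtual_gpus.end())
            std::cout << "invoke on " << it->second << '\n';
        else
            std::cout << "rank " << rank << " has no device in this group\n";
    }

In the process-group variant removed earlier in this patch the chain is longer (ipc-real, ipc-virtual, real, virtual), but the shape is the same.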
diff --git a/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator_impl.hpp b/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator_impl.hpp
index ba6f63ef5..54bf28168 100644
--- a/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator_impl.hpp
+++ b/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator_impl.hpp
@@ -100,78 +100,14 @@ ccl::event thread_device_group_a2a_communicator::allreduce_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::allreduce_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    using namespace native;
-
-    static constexpr ccl::group_split_type group_id = base_t::topology_type();
-    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
-
-    if (!is_ready()) {
-        throw ccl::exception(std::string(
-            "Device communicator for group_id: " + ::to_string(group_id) +
-            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
-    }
-
-    int comm_rank = rank();
-    LOG_DEBUG("communicator for device idx: ", get_device_path(), ", rank idx: ", comm_rank);
-
-    //TODO make const!
-    ccl_buffer send_entry_buffer(const_cast<buffer_type**>(&send_buf),
-                                 count * sizeof(buffer_type),
-                                 0,
-                                 ccl_buffer_type::INDIRECT);
-    ccl_buffer recv_entry_buffer(
-        &recv_buf, count * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT);
-
-    using community_t = typename device_community_container<class_id>::element_type;
-    community_t community = device_community_impl.get_topology();
-
-    const auto& in_process_gpu_storage = community->get_devices<ccl_gpu_comm>();
-    const auto& virtual_process_gpu_storage = community->get_devices<ccl_virtual_gpu_comm>();
-
-    auto& ipc_gpu_storage = community->get_devices<ccl_ipc_gpu_comm>();
-    (void)ipc_gpu_storage;
-
-    thread_group_scheduler::thread_schedule_ptr schedule;
-    //source for collective operation is real gpu or virtual gpu
-    auto real_device_it = in_process_gpu_storage.find(comm_rank);
-    if (real_device_it != in_process_gpu_storage.end()) {
-        LOG_DEBUG("Invoke: ", real_device_it->second->to_string());
-        /*
-        using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, ccl_gpu_comm, group_id>;
-
-        schedule =
-                ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(thread_id,
-                                                                                           *ctx->get_thread_topology<thread_device_group_ring_communicator::topology_class()>(thread_id),
-                                                                                           real_device_it->second,send_entry_buffer,
-                                                                                           recv_entry_buffer,
-                                                                                           count,
-                                                                                           reduction);
-        */
-    }
-    else {
-        auto virtual_device_it = virtual_process_gpu_storage.find(comm_rank);
-        if (virtual_device_it != virtual_process_gpu_storage.end()) {
-            LOG_DEBUG("Invoke: ", virtual_device_it->second->to_string());
-            /*
-        using gpu_allreduce_entry = l0_allreduce_typed_entry<buffer_type, ccl_virtual_gpu_comm, group_id>;
-
-
-        schedule =
-            ctx->scheduler_impl->submit_entry<gpu_allreduce_entry, ccl_sched_add_back>(thread_id,
-                                                                                       *ctx->get_thread_topology<thread_device_group_ring_communicator::topology_class()>(thread_id),
-                                                                                       virtual_device_it->second, send_entry_buffer,
-                                                                                       recv_entry_buffer,
-                                                                                       count,
-                                                                                       reduction);
-        */
-        }
-    }
-
-    //if sched is not ready - send NULL
-    if (schedule) {
-        LOG_DEBUG("Device group finalized");
-    }
-    return std::unique_ptr<ccl::event_impl>(new ccl::gpu_shared_event_impl(std::move(schedule)));
+    return allreduce_impl(static_cast<const void*>(send_buf),
+                          static_cast<void*>(recv_buf),
+                          count,
+                          ccl::native_type_info<buffer_type>::dtype,
+                          reduction,
+                          stream,
+                          attr,
+                          deps);
 }
 
 /* alltoall */
diff --git a/src/common/comm/l0/communicator/thread_group/thread_communicator_utils.hpp b/src/common/comm/l0/communicator/thread_group/thread_communicator_utils.hpp
index b7ebb7da7..1e8123435 100644
--- a/src/common/comm/l0/communicator/thread_group/thread_communicator_utils.hpp
+++ b/src/common/comm/l0/communicator/thread_group/thread_communicator_utils.hpp
@@ -17,10 +17,9 @@
 #include "common/comm/l0/devices/devices_declaration.hpp"
 #include "common/comm/l0/device_community.hpp"
 
-template <class kernel_params,
-          ccl::group_split_type group_id,
+template <ccl::group_split_type group_id,
           ccl::device_topology_type class_id,
-          template <class, class, ccl::group_split_type>
+          template <class, ccl::group_split_type>
           class algorithm>
 struct communication_thread_device_expander {
     template <class device_t, class... Args>
@@ -32,7 +31,7 @@ struct communication_thread_device_expander {
         if (comm_device) {
             LOG_DEBUG("Invoke: ", comm_device->to_string());
 
-            using gpu_entry = algorithm<kernel_params, device_t, group_id>;
+            using gpu_entry = algorithm<device_t, group_id>;
 
             schedule = ctx->scheduler_impl
                            ->submit_entry<gpu_entry, ccl_sched_add_back, group_id, class_id>(
@@ -43,10 +42,9 @@ struct communication_thread_device_expander {
     std::shared_ptr<ccl_gpu_sched> schedule;
 };
 
-template <class kernel_params,
-          ccl::group_split_type group_id,
+template <ccl::group_split_type group_id,
           ccl::device_topology_type class_id,
-          template <class, class, ccl::group_split_type>
+          template <class, ccl::group_split_type>
           class algorithm,
           class... Args>
 std::unique_ptr<ccl::event_impl> do_collective_op(
@@ -57,7 +55,7 @@ std::unique_ptr<ccl::event_impl> do_collective_op(
     size_t thread_id,
     native::ccl_driver_context_ptr native_context,
     Args&&... args) {
-    communication_thread_device_expander<kernel_params, group_id, class_id, algorithm> expander;
+    communication_thread_device_expander<group_id, class_id, algorithm> expander;
     ccl_tuple_for_each_args(communication_device,
                             expander,
                             ctx,
@@ -71,84 +69,3 @@ std::unique_ptr<ccl::event_impl> do_collective_op(
     return std::unique_ptr<ccl::event_impl>(
         new ccl::gpu_shared_event_impl(std::move(expander.schedule)));
 }
-
-template <class buffer_type,
-          ccl::group_split_type group_id,
-          ccl::device_topology_type class_id,
-          template <class, class, ccl::group_split_type>
-          class algorithm,
-          class... Args>
-std::unique_ptr<ccl::event_impl> do_collective_op_reductions(
-    ccl::reduction reduction,
-    native::device_variant_t<native::ccl_gpu_comm, native::ccl_virtual_gpu_comm>&
-        communication_device,
-    std::shared_ptr<native::thread_group_context>& ctx,
-    typename native::device_community_container<class_id>::element_type community,
-    size_t thread_id,
-    native::ccl_driver_context_ptr native_context,
-    Args&&... args) {
-    switch (reduction) {
-        case ccl::reduction::sum:
-            return do_collective_op<
-                kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::sum>,
-                group_id,
-                class_id,
-                algorithm>(communication_device,
-                           ctx,
-                           community,
-                           thread_id,
-                           native_context,
-                           std::forward<Args>(args)...);
-            break;
-        case ccl::reduction::prod:
-            return do_collective_op<
-                kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::prod>,
-                group_id,
-                class_id,
-                algorithm>(communication_device,
-                           ctx,
-                           community,
-                           thread_id,
-                           native_context,
-                           std::forward<Args>(args)...);
-            break;
-        case ccl::reduction::min:
-            return do_collective_op<
-                kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::min>,
-                group_id,
-                class_id,
-                algorithm>(communication_device,
-                           ctx,
-                           community,
-                           thread_id,
-                           native_context,
-                           std::forward<Args>(args)...);
-            break;
-        case ccl::reduction::max:
-            return do_collective_op<
-                kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::max>,
-                group_id,
-                class_id,
-                algorithm>(communication_device,
-                           ctx,
-                           community,
-                           thread_id,
-                           native_context,
-                           std::forward<Args>(args)...);
-            break;
-        // TODO: make support of custom reduction in *.cl
-        // case ccl::reduction::custom:
-        //     return do_collective_op<kernel_reduction_params_traits<buffer_type, ccl_coll_reduction::custom>,
-        //                            group_id, class_id, algorithm>(
-        //                                                      communication_device,
-        //                                                      ctx,
-        //                                                      community,
-        //                                                      thread_id,
-        //                                                      native_context,
-        //                                                      std::forward<Args>(args)...);
-        //     break;
-        default:
-            throw std::runtime_error(std::string(__PRETTY_FUNCTION__) +
-                                     "Obtained reduction by user is incorrect!");
-    }
-}
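
do_collective_op above leans on ccl_tuple_for_each_args: it walks the communication_device tuple of possible device handles and lets the expander submit an entry on whichever handle is actually populated. A compilable approximation of that visitation using std::apply and a fold expression (real_gpu, virtual_gpu and for_each_device are illustrative stand-ins):

    #include <iostream>
    #include <memory>
    #include <tuple>

    // stand-ins for ccl_gpu_comm / ccl_virtual_gpu_comm held in the variant tuple
    struct real_gpu    { const char* name() const { return "real";    } };
    struct virtual_gpu { const char* name() const { return "virtual"; } };

    // the expander visits every element; only a non-null device submits
    struct expander {
        template <class device_t>
        void operator()(const std::shared_ptr<device_t>& dev) {
            if (dev)
                std::cout << "submit entry on " << dev->name() << " device\n";
        }
    };

    template <class... devices_t>
    void for_each_device(const std::tuple<std::shared_ptr<devices_t>...>& devs,
                         expander& e) {
        std::apply([&](const auto&... d) { (e(d), ...); }, devs);
    }

    int main() {
        std::tuple<std::shared_ptr<real_gpu>, std::shared_ptr<virtual_gpu>> devs{
            nullptr, std::make_shared<virtual_gpu>()
        };
        expander e;
        for_each_device(devs, e); // only the virtual device handle is non-null
    }

The schedule produced by whichever device matched is then wrapped into a gpu_shared_event_impl, exactly as the function above does.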
diff --git a/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.cpp b/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.cpp
index a379ac937..a64625156 100644
--- a/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.cpp
+++ b/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.cpp
@@ -71,10 +71,51 @@ ccl::event thread_device_group_ring_communicator::allgatherv_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::allgatherv_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    ccl::event req;
-    allgather_visitor_t::visit(
-        req, dtype, send_buf, send_count, recv_buf, recv_counts, stream, attr, deps);
-    return req;
+    using namespace native;
+
+    static constexpr ccl::group_split_type group_id = base_t::topology_type();
+    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
+
+    if (!is_ready()) {
+        throw ccl::exception(std::string(
+            "Device communicator for group_id: " + ::to_string(group_id) +
+            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
+    }
+
+    int comm_rank = rank();
+    size_t ring_index = 0;
+    LOG_DEBUG("communicator for device idx: ",
+              get_device_path(),
+              ", rank idx: ",
+              comm_rank,
+              ", ring_index :",
+              ring_index);
+
+    //TODO make const!
+    ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf),
+                                 send_count * ccl::get_datatype_size(dtype),
+                                 0,
+                                 ccl_buffer_type::INDIRECT);
+    ccl_buffer recv_entry_buffer(
+        &recv_buf, send_count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT);
+
+    using community_t = typename device_community_container<class_id>::element_type;
+    community_t community = device_community_impl.get_topology(ring_index);
+
+    coll_param_gpu params(ccl_coll_allgatherv, dtype);
+
+    return do_collective_op<group_id, class_id, l0_allgatherv_typed_entry>(
+        communication_device,
+        ctx,
+        community,
+        thread_id,
+        this->get_native_context(),
+        send_entry_buffer,
+        send_count,
+        recv_entry_buffer,
+        recv_counts.data(),
+        params,
+        stream);
 }
 ccl::event thread_device_group_ring_communicator::allgatherv_impl(
     const void* send_buf,
@@ -100,10 +141,50 @@ ccl::event thread_device_group_ring_communicator::allreduce_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::allreduce_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    ccl::event req;
-    allreduce_visitor_t::visit(
-        req, dtype, send_buf, recv_buf, count, reduction, stream, attr, deps);
-    return req;
+    using namespace native;
+
+    static constexpr ccl::group_split_type group_id = base_t::topology_type();
+    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
+
+    if (!is_ready()) {
+        throw ccl::exception(std::string(
+            "Device communicator for group_id: " + ::to_string(group_id) +
+            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
+    }
+
+    int comm_rank = rank();
+    size_t ring_index = 0;
+    LOG_DEBUG("communicator for device idx: ",
+              get_device_path(),
+              ", rank idx: ",
+              comm_rank,
+              ", ring_index :",
+              ring_index);
+
+    //TODO make const!
+    ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf),
+                                 count * ccl::get_datatype_size(dtype),
+                                 0,
+                                 ccl_buffer_type::INDIRECT);
+    ccl_buffer recv_entry_buffer(
+        &recv_buf, count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT);
+
+    using community_t = typename device_community_container<class_id>::element_type;
+    community_t community = device_community_impl.get_topology(ring_index);
+
+    const coll_param_gpu params(ccl_coll_allreduce, dtype, reduction);
+
+    return do_collective_op<group_id, class_id, l0_allreduce_typed_entry>(
+        communication_device,
+        ctx,
+        community,
+        thread_id,
+        this->get_native_context(),
+        send_entry_buffer,
+        recv_entry_buffer,
+        count,
+        params,
+        stream);
 }
 
 /* alltoall */
@@ -115,9 +196,8 @@ ccl::event thread_device_group_ring_communicator::alltoall_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::alltoall_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    ccl::event req;
-    alltoall_visitor_t::visit(req, dtype, send_buf, recv_buf, count, stream, attr, deps);
-    return req;
+    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
+    return {};
 }
 ccl::event thread_device_group_ring_communicator::alltoall_impl(
     const ccl::vector_class<void*>& send_buf,
@@ -141,10 +221,55 @@ ccl::event thread_device_group_ring_communicator::alltoallv_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::alltoallv_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    ccl::event req;
-    alltoallv_visitor_t::visit(
-        req, dtype, send_buf, send_counts, recv_buf, recv_counts, stream, attr, deps);
-    return req;
+    using namespace native;
+    static constexpr ccl::group_split_type group_id = base_t::topology_type();
+    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
+
+    if (!is_ready()) {
+        throw ccl::exception(std::string(
+            "Device communicator for group_id: " + ::to_string(group_id) +
+            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
+    }
+
+    int comm_rank = rank();
+    size_t ring_index = 0;
+    LOG_DEBUG("communicator for device idx: ",
+              get_device_path(),
+              ", rank idx: ",
+              comm_rank,
+              ", ring_index :",
+              ring_index);
+
+    size_t total_send_counts = std::accumulate(std::begin(send_counts), std::end(send_counts), size_t{0});
+    //TODO make const!
+    ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf),
+                                 total_send_counts * ccl::get_datatype_size(dtype),
+                                 0,
+                                 ccl_buffer_type::INDIRECT);
+
+    size_t total_recv_counts = std::accumulate(std::begin(recv_counts), std::end(recv_counts), size_t{0});
+    ccl_buffer recv_entry_buffer(
+        &recv_buf, total_recv_counts * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT);
+
+    using community_t = typename device_community_container<class_id>::element_type;
+    community_t community = device_community_impl.get_topology(ring_index);
+
+    coll_param_gpu params(ccl_coll_alltoallv, dtype);
+
+    return do_collective_op<group_id, class_id, l0_alltoallv_typed_entry>(
+        communication_device,
+        ctx,
+        community,
+        thread_id,
+        this->get_native_context(),
+        send_entry_buffer,
+        send_counts.data(),
+        total_send_counts,
+        recv_entry_buffer,
+        recv_counts.data(),
+        total_recv_counts,
+        params,
+        stream);
 }
 ccl::event thread_device_group_ring_communicator::alltoallv_impl(
     const ccl::vector_class<void*>& send_buf,
@@ -169,9 +294,45 @@ ccl::event thread_device_group_ring_communicator::broadcast_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::broadcast_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    ccl::event req;
-    broadcast_visitor_t::visit(req, dtype, buf, count, root, stream, attr, deps);
-    return req;
+    using namespace native;
+
+    static constexpr ccl::group_split_type group_id = base_t::topology_type();
+    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
+
+    if (!is_ready()) {
+        throw ccl::exception(std::string(
+            "Device communicator for group_id: " + ::to_string(group_id) +
+            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
+    }
+
+    int comm_rank = rank();
+    size_t ring_index = 0;
+    LOG_DEBUG("communicator for device idx: ",
+              get_device_path(),
+              ", rank idx: ",
+              comm_rank,
+              ", ring_index :",
+              ring_index);
+
+    //TODO make const!
+    ccl_buffer entry_buffer(
+        &buf, count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT);
+
+    using community_t = typename device_community_container<class_id>::element_type;
+    community_t community = device_community_impl.get_topology(ring_index);
+
+    coll_param_gpu params(ccl_coll_bcast, dtype);
+
+    return do_collective_op<group_id, class_id, l0_bcast_typed_entry>(communication_device,
+                                                                      ctx,
+                                                                      community,
+                                                                      thread_id,
+                                                                      this->get_native_context(),
+                                                                      entry_buffer,
+                                                                      count,
+                                                                      root,
+                                                                      params,
+                                                                      stream);
 }
 
 /* reduce */
@@ -185,10 +346,51 @@ ccl::event thread_device_group_ring_communicator::reduce_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::reduce_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    ccl::event req;
-    reduce_visitor_t::visit(
-        req, dtype, send_buf, recv_buf, count, reduction, root, stream, attr, deps);
-    return req;
+    using namespace native;
+
+    static constexpr ccl::group_split_type group_id = base_t::topology_type();
+    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
+
+    if (!is_ready()) {
+        throw ccl::exception(std::string(
+            "Device communicator for group_id: " + ::to_string(group_id) +
+            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
+    }
+
+    int comm_rank = rank();
+    size_t ring_index = 0;
+    LOG_DEBUG("communicator for device idx: ",
+              get_device_path(),
+              ", rank idx: ",
+              comm_rank,
+              ", ring_index :",
+              ring_index);
+
+    //TODO make const!
+    ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf),
+                                 count * ccl::get_datatype_size(dtype),
+                                 0,
+                                 ccl_buffer_type::INDIRECT);
+    ccl_buffer recv_entry_buffer(
+        &recv_buf, count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT);
+
+    using community_t = typename device_community_container<class_id>::element_type;
+    community_t community = device_community_impl.get_topology(ring_index);
+
+    coll_param_gpu params(ccl_coll_reduce, dtype, reduction);
+
+    return do_collective_op<group_id, class_id, l0_reduce_typed_entry>(communication_device,
+                                                                       ctx,
+                                                                       community,
+                                                                       thread_id,
+                                                                       this->get_native_context(),
+                                                                       send_entry_buffer,
+                                                                       recv_entry_buffer,
+                                                                       count,
+                                                                       reduction,
+                                                                       root,
+                                                                       params,
+                                                                       stream);
 }
 
 /* reduce_scatter */
@@ -201,10 +403,50 @@ ccl::event thread_device_group_ring_communicator::reduce_scatter_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::reduce_scatter_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    ccl::event req;
-    reduce_scatter_visitor_t::visit(
-        req, dtype, send_buf, recv_buf, recv_count, reduction, stream, attr, deps);
-    return req;
+    using namespace native;
+
+    static constexpr ccl::group_split_type group_id = base_t::topology_type();
+    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
+
+    if (!is_ready()) {
+        throw ccl::exception(std::string(
+            "Device communicator for group_id: " + ::to_string(group_id) +
+            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
+    }
+
+    int comm_rank = rank();
+    size_t ring_index = 0;
+    LOG_DEBUG("communicator for device idx: ",
+              get_device_path(),
+              ", rank idx: ",
+              comm_rank,
+              ", ring_index :",
+              ring_index);
+
+    //TODO make const!
+    ccl_buffer send_entry_buffer(const_cast<void**>(&send_buf),
+                                 recv_count * ccl::get_datatype_size(dtype),
+                                 0,
+                                 ccl_buffer_type::INDIRECT);
+    ccl_buffer recv_entry_buffer(
+        &recv_buf, recv_count * ccl::get_datatype_size(dtype), 0, ccl_buffer_type::INDIRECT);
+
+    using community_t = typename device_community_container<class_id>::element_type;
+    community_t community = device_community_impl.get_topology(ring_index);
+
+    coll_param_gpu params(ccl_coll_reduce_scatter, dtype, reduction);
+
+    return do_collective_op<group_id, class_id, l0_reduce_scatter_typed_entry>(
+        communication_device,
+        ctx,
+        community,
+        thread_id,
+        this->get_native_context(),
+        send_entry_buffer,
+        recv_entry_buffer,
+        recv_count,
+        params,
+        stream);
 }
 
 /* sparse_allreduce */
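
The alltoallv path above totals send_counts and recv_counts with std::accumulate; the seed argument fixes the accumulation type, which is why it is written as size_t{0} rather than the int literal 0. A short demonstration of the difference:

    #include <cstddef>
    #include <iostream>
    #include <numeric>
    #include <vector>

    int main() {
        std::vector<size_t> counts{ 3'000'000'000ULL, 2'000'000'000ULL };

        // seeding with the int literal 0 makes std::accumulate carry the running
        // total in int, so every step narrows size_t back to int and large
        // totals are silently mangled:
        //   std::accumulate(counts.begin(), counts.end(), 0); // wrong seed type

        // seeding with size_t keeps the whole accumulation in size_t
        size_t total = std::accumulate(counts.begin(), counts.end(), size_t{0});
        std::cout << "total elements: " << total << '\n';
    }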
diff --git a/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.hpp b/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.hpp
index ac8cdc425..04b93440a 100644
--- a/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.hpp
+++ b/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.hpp
@@ -26,14 +26,7 @@ class thread_device_group_ring_communicator
         : public typed_base_communicator<thread_device_group_ring_communicator,
                                          ccl::group_split_type::process,
                                          ccl::device_topology_type::ring,
-                                         ccl::gpu_communicator_traits>,
-          public allgather_usm_visitor<thread_device_group_ring_communicator>,
-          public allreduce_usm_visitor<thread_device_group_ring_communicator>,
-          public alltoall_usm_visitor<thread_device_group_ring_communicator>,
-          public alltoallv_usm_visitor<thread_device_group_ring_communicator>,
-          public broadcast_usm_visitor<thread_device_group_ring_communicator>,
-          public reduce_usm_visitor<thread_device_group_ring_communicator>,
-          public reduce_scatter_usm_visitor<thread_device_group_ring_communicator> {
+                                         ccl::gpu_communicator_traits> {
 public:
     using base_t = typed_base_communicator<thread_device_group_ring_communicator,
                                            ccl::group_split_type::process,
@@ -46,15 +39,6 @@ class thread_device_group_ring_communicator
                                                              native::ccl_numa_proxy<native::ccl_gpu_comm>,
                                                              native::ccl_numa_proxy<native::ccl_virtual_gpu_comm>*/>;
 
-    using allgather_visitor_t = allgather_usm_visitor<thread_device_group_ring_communicator>;
-    using allreduce_visitor_t = allreduce_usm_visitor<thread_device_group_ring_communicator>;
-    using alltoall_visitor_t = alltoall_usm_visitor<thread_device_group_ring_communicator>;
-    using alltoallv_visitor_t = alltoallv_usm_visitor<thread_device_group_ring_communicator>;
-    using broadcast_visitor_t = broadcast_usm_visitor<thread_device_group_ring_communicator>;
-    using reduce_visitor_t = reduce_usm_visitor<thread_device_group_ring_communicator>;
-    using reduce_scatter_visitor_t =
-        reduce_scatter_usm_visitor<thread_device_group_ring_communicator>;
-
     thread_device_group_ring_communicator(ccl::unified_device_type&& device,
                                           ccl::unified_context_type&& ctx,
                                           size_t thread_idx,
diff --git a/src/common/comm/l0/communicator/thread_group/thread_ring_communicator_impl.hpp b/src/common/comm/l0/communicator/thread_group/thread_ring_communicator_impl.hpp
index 880ef5b23..c2b646d4f 100644
--- a/src/common/comm/l0/communicator/thread_group/thread_ring_communicator_impl.hpp
+++ b/src/common/comm/l0/communicator/thread_group/thread_ring_communicator_impl.hpp
@@ -21,6 +21,7 @@
 #include "common/comm/l0/devices/devices_declaration.hpp"
 #include "common/comm/l0/device_community.hpp"
 #include "common/comm/l0/context/thread_group_ctx.hpp"
+// TODO: try to move this include to the cpp file, since l0_entries are now referenced only from there
 #include "common/comm/l0/scheduler/thread_group_scheduler.hpp"
 #include "common/event/impls/gpu_event.hpp"
 #include "common/comm/l0/communicator/thread_group/thread_communicator_utils.hpp"
@@ -75,50 +76,14 @@ ccl::event thread_device_group_ring_communicator::allgatherv_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::allgatherv_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    using namespace native;
-
-    static constexpr ccl::group_split_type group_id = base_t::topology_type();
-    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
-
-    if (!is_ready()) {
-        throw ccl::exception(std::string(
-            "Device communicator for group_id: " + ::to_string(group_id) +
-            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
-    }
-
-    int comm_rank = rank();
-    size_t ring_index = 0;
-    LOG_DEBUG("communicator for device idx: ",
-              get_device_path(),
-              ", rank idx: ",
-              comm_rank,
-              ", ring_index :",
-              ring_index);
-
-    //TODO make const!
-    ccl_buffer send_entry_buffer(const_cast<buffer_type**>(&send_buf),
-                                 send_count * sizeof(buffer_type),
-                                 0,
-                                 ccl_buffer_type::INDIRECT);
-    ccl_buffer recv_entry_buffer(
-        &recv_buf, send_count * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT);
-
-    using community_t = typename device_community_container<class_id>::element_type;
-    community_t community = device_community_impl.get_topology(ring_index);
-
-    return do_collective_op<kernel_params_default<buffer_type>,
-                            group_id,
-                            class_id,
-                            l0_allgatherv_typed_entry>(communication_device,
-                                                       ctx,
-                                                       community,
-                                                       thread_id,
-                                                       this->get_native_context(),
-                                                       send_entry_buffer,
-                                                       send_count,
-                                                       recv_entry_buffer,
-                                                       recv_counts.data(),
-                                                       stream);
+    return allgatherv_impl(static_cast<const void*>(send_buf),
+                           send_count,
+                           static_cast<void*>(recv_buf),
+                           recv_counts,
+                           ccl::native_type_info<buffer_type>::dtype,
+                           stream,
+                           attr,
+                           deps);
 }
 
 /* allreduce */
@@ -131,49 +96,14 @@ ccl::event thread_device_group_ring_communicator::allreduce_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::allreduce_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    using namespace native;
-
-    static constexpr ccl::group_split_type group_id = base_t::topology_type();
-    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
-
-    if (!is_ready()) {
-        throw ccl::exception(std::string(
-            "Device communicator for group_id: " + ::to_string(group_id) +
-            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
-    }
-
-    int comm_rank = rank();
-    size_t ring_index = 0;
-    LOG_DEBUG("communicator for device idx: ",
-              get_device_path(),
-              ", rank idx: ",
-              comm_rank,
-              ", ring_index :",
-              ring_index);
-
-    //TODO make const!
-    ccl_buffer send_entry_buffer(const_cast<buffer_type**>(&send_buf),
-                                 count * sizeof(buffer_type),
-                                 0,
-                                 ccl_buffer_type::INDIRECT);
-    ccl_buffer recv_entry_buffer(
-        &recv_buf, count * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT);
-
-    using community_t = typename device_community_container<class_id>::element_type;
-    community_t community = device_community_impl.get_topology(ring_index);
-
-    return do_collective_op_reductions<buffer_type, group_id, class_id, l0_allreduce_typed_entry>(
-        reduction,
-        communication_device,
-        ctx,
-        community,
-        thread_id,
-        this->get_native_context(),
-        send_entry_buffer,
-        recv_entry_buffer,
-        count,
-        reduction,
-        stream);
+    return allreduce_impl(static_cast<const void*>(send_buf),
+                          static_cast<void*>(recv_buf),
+                          count,
+                          ccl::native_type_info<buffer_type>::dtype,
+                          reduction,
+                          stream,
+                          attr,
+                          deps);
 }
 
 template <class buffer_type>
@@ -285,49 +215,14 @@ ccl::event thread_device_group_ring_communicator::alltoallv_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::alltoallv_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    using namespace native;
-    static constexpr ccl::group_split_type group_id = base_t::topology_type();
-    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
-
-    if (!is_ready()) {
-        throw ccl::exception(std::string(
-            "Device communicator for group_id: " + ::to_string(group_id) +
-            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
-    }
-
-    int comm_rank = rank();
-    size_t ring_index = 0;
-    LOG_DEBUG("communicator for device idx: ",
-              get_device_path(),
-              ", rank idx: ",
-              comm_rank,
-              ", ring_index :",
-              ring_index);
-    size_t SIZE = 512;
-    //TODO make const!
-    ccl_buffer send_entry_buffer(const_cast<buffer_type**>(&send_buf),
-                                 SIZE * sizeof(buffer_type),
-                                 0,
-                                 ccl_buffer_type::INDIRECT);
-    ccl_buffer recv_entry_buffer(
-        &recv_buf, SIZE * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT);
-
-    using community_t = typename device_community_container<class_id>::element_type;
-    community_t community = device_community_impl.get_topology(ring_index);
-
-    return do_collective_op<kernel_params_default<buffer_type>,
-                            group_id,
-                            class_id,
-                            l0_alltoallv_typed_entry>(communication_device,
-                                                      ctx,
-                                                      community,
-                                                      thread_id,
-                                                      this->get_native_context(),
-                                                      send_entry_buffer,
-                                                      send_counts.data(),
-                                                      recv_entry_buffer,
-                                                      recv_counts.data(),
-                                                      stream);
+    return alltoallv_impl(static_cast<const void*>(send_buf),
+                          send_counts,
+                          static_cast<void*>(recv_buf),
+                          recv_counts,
+                          ccl::native_type_info<buffer_type>::dtype,
+                          stream,
+                          attr,
+                          deps);
 }
 
 /* bcast */
@@ -339,44 +234,13 @@ ccl::event thread_device_group_ring_communicator::broadcast_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::broadcast_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    using namespace native;
-
-    static constexpr ccl::group_split_type group_id = base_t::topology_type();
-    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
-
-    if (!is_ready()) {
-        throw ccl::exception(std::string(
-            "Device communicator for group_id: " + ::to_string(group_id) +
-            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
-    }
-
-    int comm_rank = rank();
-    size_t ring_index = 0;
-    LOG_DEBUG("communicator for device idx: ",
-              get_device_path(),
-              ", rank idx: ",
-              comm_rank,
-              ", ring_index :",
-              ring_index);
-
-    //TODO make const!
-    ccl_buffer entry_buffer(&buf, count * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT);
-
-    using community_t = typename device_community_container<class_id>::element_type;
-    community_t community = device_community_impl.get_topology(ring_index);
-
-    return do_collective_op<kernel_params_default<buffer_type>,
-                            group_id,
-                            class_id,
-                            l0_bcast_typed_entry>(communication_device,
-                                                  ctx,
-                                                  community,
-                                                  thread_id,
-                                                  this->get_native_context(),
-                                                  entry_buffer,
-                                                  count,
-                                                  root,
-                                                  stream);
+    return broadcast_impl(static_cast<void*>(buf),
+                          count,
+                          ccl::native_type_info<buffer_type>::dtype,
+                          root,
+                          stream,
+                          attr,
+                          deps);
 }
 
 template <class buffer_type>
@@ -402,50 +266,15 @@ ccl::event thread_device_group_ring_communicator::reduce_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::reduce_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    using namespace native;
-
-    static constexpr ccl::group_split_type group_id = base_t::topology_type();
-    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
-
-    if (!is_ready()) {
-        throw ccl::exception(std::string(
-            "Device communicator for group_id: " + ::to_string(group_id) +
-            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
-    }
-
-    int comm_rank = rank();
-    size_t ring_index = 0;
-    LOG_DEBUG("communicator for device idx: ",
-              get_device_path(),
-              ", rank idx: ",
-              comm_rank,
-              ", ring_index :",
-              ring_index);
-
-    //TODO make const!
-    ccl_buffer send_entry_buffer(const_cast<buffer_type**>(&send_buf),
-                                 count * sizeof(buffer_type),
-                                 0,
-                                 ccl_buffer_type::INDIRECT);
-    ccl_buffer recv_entry_buffer(
-        &recv_buf, count * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT);
-
-    using community_t = typename device_community_container<class_id>::element_type;
-    community_t community = device_community_impl.get_topology(ring_index);
-
-    return do_collective_op_reductions<buffer_type, group_id, class_id, l0_reduce_typed_entry>(
-        reduction,
-        communication_device,
-        ctx,
-        community,
-        thread_id,
-        this->get_native_context(),
-        send_entry_buffer,
-        recv_entry_buffer,
-        count,
-        reduction,
-        root,
-        stream);
+    return reduce_impl(static_cast<const void*>(send_buf),
+                       static_cast<void*>(recv_buf),
+                       count,
+                       ccl::native_type_info<buffer_type>::dtype,
+                       reduction,
+                       root,
+                       stream,
+                       attr,
+                       deps);
 }
 
 template <class buffer_type>
@@ -472,51 +301,14 @@ ccl::event thread_device_group_ring_communicator::reduce_scatter_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::reduce_scatter_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    using namespace native;
-
-    static constexpr ccl::group_split_type group_id = base_t::topology_type();
-    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
-
-    if (!is_ready()) {
-        throw ccl::exception(std::string(
-            "Device communicator for group_id: " + ::to_string(group_id) +
-            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
-    }
-
-    int comm_rank = rank();
-    size_t ring_index = 0;
-    LOG_DEBUG("communicator for device idx: ",
-              get_device_path(),
-              ", rank idx: ",
-              comm_rank,
-              ", ring_index :",
-              ring_index);
-
-    //TODO make const!
-    ccl_buffer send_entry_buffer(const_cast<buffer_type**>(&send_buf),
-                                 recv_count * sizeof(buffer_type),
-                                 0,
-                                 ccl_buffer_type::INDIRECT);
-    ccl_buffer recv_entry_buffer(
-        &recv_buf, recv_count * sizeof(buffer_type), 0, ccl_buffer_type::INDIRECT);
-
-    using community_t = typename device_community_container<class_id>::element_type;
-    community_t community = device_community_impl.get_topology(ring_index);
-
-    return do_collective_op_reductions<buffer_type,
-                                       group_id,
-                                       class_id,
-                                       l0_reduce_scatter_typed_entry>(reduction,
-                                                                      communication_device,
-                                                                      ctx,
-                                                                      community,
-                                                                      thread_id,
-                                                                      this->get_native_context(),
-                                                                      send_entry_buffer,
-                                                                      recv_entry_buffer,
-                                                                      recv_count,
-                                                                      reduction,
-                                                                      stream);
+    return reduce_scatter_impl(static_cast<const void*>(send_buf),
+                               static_cast<void*>(recv_buf),
+                               recv_count,
+                               ccl::native_type_info<buffer_type>::dtype,
+                               reduction,
+                               stream,
+                               attr,
+                               deps);
 }
 
 template <class buffer_type>
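The rewritten bodies above all share one pattern: each typed `*_impl` erases `buffer_type` and forwards to a single `void*`-based overload, passing the datatype along as a runtime `ccl::native_type_info<buffer_type>::dtype` value. A minimal sketch of that delegation (illustrative, simplified names — not the real oneCCL signatures):

```cpp
#include <cstddef>
#include <iostream>

// toy model of the delegation above: the typed overload erases buffer_type
// and forwards to one void*-based implementation (illustrative names only)
enum class dtype { int32, float32 };

template <class T>
struct native_type_info;
template <>
struct native_type_info<int> { static constexpr dtype value = dtype::int32; };
template <>
struct native_type_info<float> { static constexpr dtype value = dtype::float32; };

struct communicator {
    // the single type-erased implementation carries all the real logic
    void broadcast_impl(void* buf, std::size_t count, dtype d, int root) {
        std::cout << "bcast buf=" << buf << " count=" << count << " root=" << root
                  << " dtype=" << static_cast<int>(d) << '\n';
    }
    // thin typed wrapper, mirroring the rewritten bodies in the diff
    template <class buffer_type>
    void broadcast_impl(buffer_type* buf, std::size_t count, int root) {
        broadcast_impl(static_cast<void*>(buf), count,
                       native_type_info<buffer_type>::value, root);
    }
};

int main() {
    float data[8] = {};
    communicator c;
    c.broadcast_impl(data, 8, /*root*/ 0);
}
```

Only the thin wrapper stays a template; the heavy collective-entry construction behind the type-erased overload is compiled once instead of once per buffer type.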
diff --git a/src/common/comm/l0/context/base_ctx_actor.hpp b/src/common/comm/l0/context/base_ctx_actor.hpp
index f8c773fb7..d7425de3c 100644
--- a/src/common/comm/l0/context/base_ctx_actor.hpp
+++ b/src/common/comm/l0/context/base_ctx_actor.hpp
@@ -36,6 +36,7 @@ struct actor {
     actor(key_t actor_id, Function&& f, Args&&... args)
             : function(std::bind(std::forward<Function>(f),
                                  std::forward<Args>(args)...,
+                                 this,
                                  std::placeholders::_1)),
               stop(false),
               processing(&actor<message_type>::run, this),
@@ -61,6 +62,17 @@ struct actor {
         }
     }
 
+protected:
+    template <class Derived, class Function, class... Args>
+    actor(Derived* child, key_t actor_id, Function&& f, Args&&... args)
+            : function(std::bind(std::forward<Function>(f),
+                                 std::forward<Args>(args)...,
+                                 child,
+                                 std::placeholders::_1)),
+              stop(false),
+              processing(&actor<message_type>::run, this),
+              id(actor_id) {}
+
 private:
     core_t function;
     storage_t messages;
@@ -72,8 +84,8 @@ struct actor {
     key_t id;
 
     virtual void run() {
+        storage_t to_do_list;
         while (!stop.load()) {
-            storage_t to_do_list;
             {
                 std::unique_lock<std::mutex> lk(mutex);
                 condition.wait(lk, [this]() {
@@ -105,7 +117,7 @@ struct subscribed_actor : public actor<message_type> {
 
     template <class Function, class... Args>
     subscribed_actor(key_t actor_id, Function&& f, Args&&... args)
-            : base_t(actor_id, std::forward<Function>(f), std::forward<Args>(args)..., this) {}
+            : base_t(this, actor_id, std::forward<Function>(f), std::forward<Args>(args)...) {}
 
     virtual ~subscribed_actor() {}
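On the `actor` change just above: the bound callback now always receives a back-pointer to the actor right before the message argument. The public constructor binds the base `this`, while the new protected constructor lets a derived class such as `subscribed_actor` bind its own, correctly typed pointer instead of smuggling it through the argument pack. A self-contained sketch of the idea with toy names:

```cpp
#include <functional>
#include <iostream>

// toy actor: the bound callback always gets (user args..., Self*, message)
struct base {
    using core_t = std::function<void(int)>;

    template <class Fn, class... Args>
    base(Fn&& f, Args&&... args)
            : function(std::bind(std::forward<Fn>(f),
                                 std::forward<Args>(args)...,
                                 this, // base pointer bound just before the message slot
                                 std::placeholders::_1)) {}

    core_t function;

protected:
    // derived classes bind their own, correctly typed pointer instead
    template <class Derived, class Fn, class... Args>
    base(Derived* child, Fn&& f, Args&&... args)
            : function(std::bind(std::forward<Fn>(f),
                                 std::forward<Args>(args)...,
                                 child,
                                 std::placeholders::_1)) {}
};

struct derived : base {
    derived()
            : base(this,
                   [](const char* tag, derived*, int msg) {
                       std::cout << tag << ": " << msg << '\n';
                   },
                   "numa") {}
};

int main() {
    derived d;
    d.function(42); // prints "numa: 42"
}
```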
 
diff --git a/src/common/comm/l0/context/base_scaling_ctx.hpp b/src/common/comm/l0/context/base_scaling_ctx.hpp
index 97394a071..d855c8219 100644
--- a/src/common/comm/l0/context/base_scaling_ctx.hpp
+++ b/src/common/comm/l0/context/base_scaling_ctx.hpp
@@ -34,6 +34,9 @@ namespace observer {
 template <class device_t, class actor_t>
 using device_thread_map = std::map<device_t*, std::unique_ptr<actor_t>>;
 
+template <class actor_t, class... devices_types>
+using multiple_device_thread_map_t = std::tuple<device_thread_map<devices_types, actor_t>...>;
+
 template <class device_t>
 using proxy_observer_ptr = typename std::add_pointer<device_t>::type;
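`multiple_device_thread_map_t` is a pack expansion producing one actor map per device type; later in this patch it replaces the hand-written two-element tuple behind `numa_workers` in numa_ctx.hpp. A small illustration with hypothetical device and actor types:

```cpp
#include <map>
#include <memory>
#include <tuple>

template <class device_t, class actor_t>
using device_thread_map = std::map<device_t*, std::unique_ptr<actor_t>>;

template <class actor_t, class... devices_types>
using multiple_device_thread_map_t =
    std::tuple<device_thread_map<devices_types, actor_t>...>;

// hypothetical device/actor types for illustration
struct my_actor {};
struct real_gpu {};
struct virtual_gpu {};

// expands to:
//   std::tuple<std::map<real_gpu*,    std::unique_ptr<my_actor>>,
//              std::map<virtual_gpu*, std::unique_ptr<my_actor>>>
using workers_t = multiple_device_thread_map_t<my_actor, real_gpu, virtual_gpu>;

int main() {
    workers_t workers;
    real_gpu device;
    std::get<0>(workers).emplace(&device, std::make_unique<my_actor>());
    return std::get<0>(workers).size() == 1 ? 0 : 1;
}
```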
 
diff --git a/src/common/comm/l0/context/device_group_ctx.cpp b/src/common/comm/l0/context/device_group_ctx.cpp
index 9cf32b832..b6746911f 100644
--- a/src/common/comm/l0/context/device_group_ctx.cpp
+++ b/src/common/comm/l0/context/device_group_ctx.cpp
@@ -16,7 +16,7 @@
 #include <sstream>
 
 #include "common/comm/l0/devices/devices_declaration.hpp"
-#include "common/comm/l0/context/scaling_ctx/numa_ctx_impl.hpp"
+#include "common/comm/l0/context/scale/numa/numa_ctx_impl.hpp"
 #include "common/comm/l0/context/device_group_ctx.hpp"
 #include "common/comm/l0/context/device_storage.hpp"
 #include "common/comm/l0/topology/ring/device_group_ring_creator.hpp"
diff --git a/src/common/comm/l0/context/device_group_ctx.hpp b/src/common/comm/l0/context/device_group_ctx.hpp
index 9c814ee2c..f0fa9ddc0 100644
--- a/src/common/comm/l0/context/device_group_ctx.hpp
+++ b/src/common/comm/l0/context/device_group_ctx.hpp
@@ -22,7 +22,7 @@
 #include "oneapi/ccl/types.hpp"
 #include "supported_topologies.hpp"
 #include "common/comm/l0/gpu_comm_attr.hpp"
-#include "common/comm/l0/context/scaling_ctx/numa_ctx.hpp"
+#include "common/comm/l0/context/scale/numa/numa_ctx.hpp"
 #include "common/comm/l0/device_community_holder_impl.hpp"
 
 class device_group_router;
diff --git a/src/common/comm/l0/context/process_group_ctx.cpp b/src/common/comm/l0/context/process_group_ctx.cpp
index b178c1071..7c7d00c5d 100644
--- a/src/common/comm/l0/context/process_group_ctx.cpp
+++ b/src/common/comm/l0/context/process_group_ctx.cpp
@@ -35,10 +35,10 @@
 #include "common/comm/l0/scheduler/allied_process_group_scheduler.hpp"
 
 #include "common/comm/host_communicator/host_communicator.hpp"
-#include "common/comm/l0/context/scaling_ctx/numa_ctx_impl.hpp"
-#include "common/comm/l0/context/scaling_ctx/scale_up_ctx_impl.hpp"
-#include "common/comm/l0/context/scaling_ctx/scale_out_ctx_impl.hpp"
-#include "common/comm/l0/context/scaling_ctx/ipc_ctx_impl.hpp"
+#include "common/comm/l0/context/scale/numa/numa_ctx_impl.hpp"
+#include "common/comm/l0/context/scale/scale_up/scale_up_ctx_impl.hpp"
+#include "common/comm/l0/context/scale/scale_out/scale_out_ctx_impl.hpp"
+#include "common/comm/l0/context/scale/ipc/ipc_ctx_impl.hpp"
 
 namespace native {
 
@@ -151,19 +151,15 @@ bool process_group_context::sync_barrier(const ccl::device_indices_type& thread_
         detail::adjacency_matrix p2p_dependency_graph =
             ally_process_topology.build_p2p_capability_matrix(ss, node_mask);
         ss << "\nMatrix\n" << p2p_dependency_graph << std::endl;
-        /* TODO -S- enaled it later
-        if (!ally_process_topology.build_all(ss,
-                                             comm_addr,
-                                             thread_group_ctx->get_thread_group_device_indices(),
-                                             p2p_dependency_graph)) {
-            LOG_ERROR(ss.str(), "\nCannot build ipc ring! Abort. Build Log:\n", ss.str());
+
+        if (!ally_process_topology.build_all(
+                ss, thread_group_ctx->get_thread_group_device_indices(), p2p_dependency_graph)) {
+            LOG_ERROR(
+                "Cannot build cluster global ring! Abort. Build Log:\n", ss.str());
             abort();
         }
-*/
-        if (!ally_process_topology.build_all(
-                ss, thread_group_ctx->get_thread_group_device_indices(), p2p_dependency_graph))
 
-            LOG_DEBUG("Build IPC ring succesfully. Log:\n", ss.str());
+        LOG_DEBUG("Build cluster global ring successfully. Log:\n", ss.str());
     }
 
     {
@@ -183,6 +179,9 @@ bool process_group_context::sync_barrier(const ccl::device_indices_type& thread_
     LOG_INFO("initialize IPC context");
     get_ipc_ctx().initialize_ctx(ccl_communicator);
 
+    LOG_INFO("initialize SCALE-OUT context");
+    get_scaleout_ctx().initialize_ctx(ccl_communicator);
+
     // dump topology
     std::stringstream out;
     dump_process_topologies(out);
diff --git a/src/common/comm/l0/context/process_group_ctx.hpp b/src/common/comm/l0/context/process_group_ctx.hpp
index cbe563040..15791f7b2 100644
--- a/src/common/comm/l0/context/process_group_ctx.hpp
+++ b/src/common/comm/l0/context/process_group_ctx.hpp
@@ -15,12 +15,12 @@
 */
 #pragma once
 #include "common/comm/l0/context/thread_group_ctx.hpp"
-#include "common/comm/l0/context/scaling_ctx/ipc_ctx.hpp"
-#include "common/comm/l0/context/scaling_ctx/numa_ctx.hpp"
-#include "common/comm/l0/context/scaling_ctx/scale_up_ctx.hpp"
-#include "common/comm/l0/context/scaling_ctx/scale_out_ctx.hpp"
+#include "common/comm/l0/context/scale/ipc/ipc_ctx.hpp"
+#include "common/comm/l0/context/scale/numa/numa_ctx.hpp"
+#include "common/comm/l0/context/scale/scale_up/scale_up_ctx.hpp"
+#include "common/comm/l0/context/scale/scale_out/scale_out_ctx.hpp"
 
-#include "common/comm/l0/context/scaling_ctx/scaling_context_dispatcher.hpp"
+#include "common/comm/l0/context/scale/scaling_context_dispatcher.hpp"
 #include "common/comm/l0/topology/topology_declarations.hpp"
 namespace ccl {
 class host_communicator;
diff --git a/src/common/comm/l0/context/scale/base/base_session.cpp b/src/common/comm/l0/context/scale/base/base_session.cpp
new file mode 100644
index 000000000..7e26961bc
--- /dev/null
+++ b/src/common/comm/l0/context/scale/base/base_session.cpp
@@ -0,0 +1,101 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "oneapi/ccl/native_device_api/l0/base_impl.hpp"
+#include "oneapi/ccl/native_device_api/l0/primitives.hpp"
+#include "oneapi/ccl/native_device_api/l0/primitives_impl.hpp"
+
+#include "common/comm/l0/context/scale/base/base_session.hpp"
+
+namespace native {
+namespace observer {
+
+void context_descr::init_host_dev_fields() {
+    host_mem_producer = nullptr;
+    host_mem_producer_counter = nullptr;
+    host_consumed_bytes = 0;
+    host_expected_bytes = 0;
+
+    dev_mem_consumer = nullptr;
+    dev_mem_consumer_counter = nullptr;
+    device_produced_bytes = 0;
+}
+
+void context_descr::init(size_t staged_buffer_elem_count,
+                         size_t observer_domain_index,
+                         size_t observer_domain_count,
+                         std::shared_ptr<ccl_context>& context,
+                         ccl_device& device) {
+    // set all fields to 0
+    init_host_dev_fields();
+
+    /* HOST */
+    // create staged mem in host context (Host memory allocation descriptor)
+    ze_host_mem_alloc_desc_t host_descr = ccl_context::get_default_host_alloc_desc();
+    host_descr.flags = ZE_HOST_MEM_ALLOC_FLAG_BIAS_UNCACHED;
+
+    // host mem buf
+    host_mem_producer = context->template alloc_memory<uint8_t>(
+        staged_buffer_elem_count * ccl::get_datatype_size(kernel_params.get_datatype()),
+        /*TODO use page size*/ ccl::get_datatype_size(kernel_params.get_datatype()),
+        host_descr);
+
+    // create staged mem counter in host context (host mem buf counter)
+    host_mem_producer_counter = context->template alloc_memory<counter_t>(
+        1, /*TODO use page size*/ sizeof(counter_t), host_descr);
+
+    host_expected_bytes =
+        staged_buffer_elem_count * ccl::get_datatype_size(kernel_params.get_datatype());
+
+    /* DEVICE */
+    ze_device_mem_alloc_desc_t mem_descr = ccl_device::get_default_mem_alloc_desc();
+
+    // create total aggregated memory in device context
+    mem_descr.flags = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_UNCACHED;
+    dev_mem_consumer = device.template alloc_memory_ptr<uint8_t>(
+        (staged_buffer_elem_count * observer_domain_count) *
+            ccl::get_datatype_size(kernel_params.get_datatype()),
+        ccl::get_datatype_size(kernel_params.get_datatype()),
+        context,
+        mem_descr);
+
+    // create offset in device context
+    mem_descr.flags = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED;
+    producer_aggregated_memory_offset =
+        device.template alloc_memory_ptr<counter_t>(1, sizeof(counter_t), context, mem_descr);
+
+    // create aggregated counter in device context
+    dev_mem_consumer_counter =
+        device.template alloc_memory_ptr<counter_t>(1, sizeof(counter_t), context, mem_descr);
+
+    /* COUNTERS */
+    reset_counters(observer_domain_index, observer_domain_count);
+}
+
+void context_descr::reset_counters(size_t observer_domain_index, size_t observer_domain_count) {
+    counter_t filled_counter_value = 0;
+
+    host_mem_producer_counter->enqueue_write_sync(&filled_counter_value, 1);
+
+    filled_counter_value = observer_domain_index * host_mem_producer->count();
+
+    producer_aggregated_memory_offset->enqueue_write_sync(&filled_counter_value, 1);
+
+    filled_counter_value = 0;
+    dev_mem_consumer_counter->enqueue_write_sync(&filled_counter_value, 1);
+}
+
+} // namespace observer
+} // namespace native
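The sizes and offsets chosen by `init` and `reset_counters` encode a simple layout: the host staging buffer holds `staged_buffer_elem_count * dtype_size` bytes, the aggregated device buffer holds one such window per observer domain, and each domain's producer offset starts at `domain_index * window_bytes`. A tiny standalone model of that arithmetic (hypothetical form, with the real buffers replaced by plain sizes):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

using counter_t = std::uint64_t;

// mirrors the layout established by context_descr::init (hypothetical form)
struct staging_layout {
    std::size_t staged_bytes; // host producer window: elem_count * dtype_size
    std::size_t domain_count; // observer domains sharing the device buffer

    std::size_t device_buffer_bytes() const {
        // the aggregated device buffer holds one window per domain
        return staged_bytes * domain_count;
    }
    counter_t producer_offset(std::size_t domain_index) const {
        // each domain writes into its own disjoint window,
        // matching reset_counters: index * host_mem_producer->count()
        return static_cast<counter_t>(domain_index) * staged_bytes;
    }
};

int main() {
    staging_layout layout{ /*staged_bytes*/ 4096, /*domain_count*/ 4 };
    assert(layout.device_buffer_bytes() == 16384);
    assert(layout.producer_offset(0) == 0);
    assert(layout.producer_offset(3) == 12288);
}
```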
diff --git a/src/common/comm/l0/context/scale/base/base_session.hpp b/src/common/comm/l0/context/scale/base/base_session.hpp
new file mode 100644
index 000000000..fea9590b7
--- /dev/null
+++ b/src/common/comm/l0/context/scale/base/base_session.hpp
@@ -0,0 +1,164 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+#include <functional>
+#include <string>
+#include <vector>
+
+#include "oneapi/ccl.hpp"
+#include "oneapi/ccl/native_device_api/l0/device.hpp"
+#include "oneapi/ccl/native_device_api/l0/context.hpp"
+
+#include "coll/algorithms/algorithms_enum.hpp"
+#include "common/comm/l0/modules/supported_modules.hpp"
+#include "coll/coll_param.hpp"
+
+namespace native {
+namespace observer {
+using counter_t = uint64_t;
+
+struct producer_description {
+    size_t rank;
+    size_t comm_size;
+    counter_t staged_buffer_elem_count;
+
+    std::shared_ptr<ccl_context> context;
+    ccl_device& device;
+    ccl_device::device_cmd_list immediate_list; //TODO make persistent
+};
+
+struct context_descr {
+    context_descr(const coll_param_gpu& kernel_params) : kernel_params(kernel_params) {}
+
+    using host_mem_ptr_t = ccl_context::host_memory_ptr<uint8_t>;
+    using host_mem_ptr_cntr_t = ccl_context::host_memory_ptr<counter_t>;
+    using dev_mem_ptr_t = ccl_device::device_memory_ptr<uint8_t>;
+    using dev_mem_ptr_cntr_t = ccl_device::device_memory_ptr<counter_t>;
+
+    // produced by kernel
+    host_mem_ptr_t host_mem_producer;
+    host_mem_ptr_cntr_t host_mem_producer_counter;
+    size_t host_consumed_bytes;
+    size_t host_expected_bytes;
+
+    // consumed by kernel
+    dev_mem_ptr_t dev_mem_consumer;
+    dev_mem_ptr_cntr_t dev_mem_consumer_counter;
+    size_t device_produced_bytes;
+
+    // TODO: consider using 'recv_buff' from the collective entry
+    // to reduce the number of copy iterations
+    // TODO: rename
+    dev_mem_ptr_cntr_t producer_aggregated_memory_offset;
+
+    void init_host_dev_fields();
+
+    void init(size_t staged_buffer_elem_count,
+              size_t observer_domain_index,
+              size_t observer_domain_count,
+              std::shared_ptr<ccl_context>& context,
+              ccl_device& device);
+
+    void reset_counters(size_t observer_domain_index, size_t observer_domain_count);
+
+private:
+    // TODO: can we guarantee that this object is not destroyed before invoke_params and
+    // use const& here?
+    coll_param_gpu kernel_params;
+};
+
+template <ccl_coll_type coll_type>
+struct invoke_params {
+    static constexpr ccl_coll_type get_coll_type() {
+        return coll_type;
+    }
+
+    invoke_params(producer_description&& in_producer_params, const coll_param_gpu& kernel_params)
+            : in_params(std::move(in_producer_params)),
+              kernel_params(kernel_params),
+              out_params(kernel_params),
+              valid(false) {}
+
+    void set_out_params(const context_descr& src) {
+        out_params = src;
+        valid = true;
+    }
+
+    bool is_valid() const {
+        return valid;
+    }
+
+    const producer_description& get_producer_params() const {
+        return in_params;
+    }
+
+    producer_description& get_producer_params() {
+        return in_params;
+    }
+
+    const coll_param_gpu& get_kernel_params() const {
+        return kernel_params;
+    }
+
+    const context_descr& get_ctx_params() const {
+        if (!is_valid()) {
+            throw std::runtime_error("observer invocation params are not ready");
+        }
+        return out_params;
+    }
+
+private:
+    producer_description in_params;
+    // TODO: can we guarantee that this object is not destroyed before l0 entry and
+    // use const& here?
+    coll_param_gpu kernel_params;
+    context_descr out_params;
+    bool valid;
+};
+
+struct session_key {
+    using hash_core_t = size_t;
+
+    friend std::ostream& operator<<(std::ostream& out, const session_key& key) {
+        out << key.to_string();
+        return out;
+    }
+
+    template <class T>
+    session_key(const T* src) : hash(std::hash<const T*>{}(src)) {}
+
+    bool operator<(const session_key& other) const noexcept {
+        return hash < other.hash;
+    }
+
+    std::string to_string() const {
+        return std::to_string(hash);
+    }
+
+private:
+    hash_core_t hash;
+};
+
+struct session_notification {
+    session_notification(void* addr, size_t size_bytes)
+            : host_src_ptr(addr),
+              src_size_bytes(size_bytes) {}
+    void* host_src_ptr;
+    size_t src_size_bytes;
+};
+
+} // namespace observer
+} // namespace native
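`session_key` is just the hash of an object's address, so keys built from the same (or a recycled) address compare equal; this is exactly the collision that the session-clearing workaround in ipc_ctx.hpp later in this patch guards against. A condensed, self-contained version:

```cpp
#include <cstddef>
#include <functional>
#include <iostream>
#include <string>

// condensed copy of observer::session_key
struct session_key {
    template <class T>
    session_key(const T* src) : hash(std::hash<const T*>{}(src)) {}

    bool operator<(const session_key& other) const noexcept {
        return hash < other.hash;
    }
    std::string to_string() const {
        return std::to_string(hash);
    }

private:
    std::size_t hash;
};

int main() {
    int object = 0;
    session_key k1(&object), k2(&object);
    // keys built from the same address collide by construction,
    // which is how recycled allocations end up sharing a key
    bool equal = !(k1 < k2) && !(k2 < k1);
    std::cout << (equal ? "equal" : "distinct") << '\n';
}
```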
diff --git a/src/common/comm/l0/context/scale/base/base_session_table.hpp b/src/common/comm/l0/context/scale/base/base_session_table.hpp
new file mode 100644
index 000000000..574127381
--- /dev/null
+++ b/src/common/comm/l0/context/scale/base/base_session_table.hpp
@@ -0,0 +1,76 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+#include <atomic>
+#include <map>
+#include <memory>
+
+#include "common/comm/l0/context/scale/base/base_session.hpp"
+#include "common/comm/l0/modules/supported_modules.hpp"
+
+namespace native {
+namespace observer {
+
+// session owner, not thread-safe
+template <class session_interface>
+struct session_table {
+    using session_key_t = session_key;
+    using session_interface_t = session_interface;
+    using session_interface_ptr_t = std::shared_ptr<session_interface_t>;
+
+    template <template <ccl::device_topology_type, class...> class specific_session,
+              ccl::device_topology_type class_id,
+              class invoke_params_type>
+    session_interface_ptr_t create_session(const session_key_t& key,
+                                           invoke_params_type& params,
+                                           size_t observer_domain_index,
+                                           size_t observer_domain_count) {
+        using specific_session_impl = specific_session<class_id, invoke_params_type>;
+
+        static_assert(std::is_base_of<session_interface_t, specific_session_impl>::value,
+                      "Relationship IS-A `specific_session` for `session_interface_t` failed");
+
+        auto sess = std::make_shared<specific_session_impl>(params.get_producer_params(),
+                                                            params.get_kernel_params(),
+                                                            observer_domain_index,
+                                                            observer_domain_count,
+                                                            key);
+
+        params.set_out_params(sess->get_ctx_descr());
+        sessions.emplace(key, sess);
+
+        return sess;
+    }
+
+    size_t get_unique_tag() {
+        static std::atomic<size_t> tag_counter{ 1 };
+        return tag_counter.fetch_add(1);
+    }
+
+    std::string to_string() const {
+        std::stringstream ss;
+        ss << "sessions count: " << sessions.size() << std::endl;
+        for (const auto& val : sessions) {
+            ss << "[" << val.first << ", " << reinterpret_cast<void*>(val.second.get()) << "]\n"
+               << val.second->to_string() << std::endl;
+        }
+        return ss.str();
+    }
+
+    std::map<session_key_t, session_interface_ptr_t> sessions{};
+};
+} // namespace observer
+} //namespace native
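`create_session` receives the concrete session class as a template template parameter and statically checks the IS-A relationship before constructing it. A stripped-down sketch of the mechanism with toy types (not the real interfaces):

```cpp
#include <map>
#include <memory>
#include <type_traits>

enum class topology { ring, a2a };

struct session_iface {
    virtual ~session_iface() = default;
};

// toy concrete session, shaped like numa_session<class_id, invoke_params>
template <topology class_id, class invoke_params_t>
struct toy_session : session_iface {
    explicit toy_session(const invoke_params_t&) {}
};

template <class session_interface>
struct session_table {
    template <template <topology, class> class specific_session,
              topology class_id,
              class invoke_params_t>
    std::shared_ptr<session_interface> create_session(int key, const invoke_params_t& params) {
        using impl = specific_session<class_id, invoke_params_t>;
        static_assert(std::is_base_of<session_interface, impl>::value,
                      "specific_session must derive from session_interface");
        auto sess = std::make_shared<impl>(params);
        sessions.emplace(key, sess);
        return sess;
    }
    std::map<int, std::shared_ptr<session_interface>> sessions;
};

struct dummy_params {};

int main() {
    session_table<session_iface> table;
    dummy_params p;
    auto sess = table.create_session<toy_session, topology::ring>(/*key*/ 1, p);
    return (sess && table.sessions.size() == 1) ? 0 : 1;
}
```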
diff --git a/src/common/comm/l0/context/scaling_ctx/ipc_ctx.hpp b/src/common/comm/l0/context/scale/ipc/ipc_ctx.hpp
similarity index 90%
rename from src/common/comm/l0/context/scaling_ctx/ipc_ctx.hpp
rename to src/common/comm/l0/context/scale/ipc/ipc_ctx.hpp
index 3307840a9..fcee3dee6 100644
--- a/src/common/comm/l0/context/scaling_ctx/ipc_ctx.hpp
+++ b/src/common/comm/l0/context/scale/ipc/ipc_ctx.hpp
@@ -22,8 +22,8 @@
 #include <thread>
 #include <vector>
 #include "common/comm/l0/context/base_scaling_ctx.hpp"
-#include "common/comm/l0/context/scaling_ctx/ipc_session_key.hpp"
-#include "common/comm/l0/context/scaling_ctx/ipc_ctx_session.hpp"
+#include "common/comm/l0/context/scale/ipc/ipc_session_key.hpp"
+#include "common/comm/l0/context/scale/ipc/ipc_ctx_session.hpp"
 
 namespace ccl {
 class host_communicator;
@@ -163,14 +163,20 @@ class ipc_ctx : public observer::base_scaling_ctx<ipc_ctx<Impl, types...>,
             abort();
         }
 
+        // TODO: WA: destroy all previously created sessions
+        // (only one session is active at a time).
+        // Without this WA we hang in kernels when reusing sessions,
+        // because different sessions may accidentally share the same key.
+        // This works with the GPU cache enabled but is invalid without the cache.
+        table->sessions.clear();
+
         std::shared_ptr<session> sess;
-        LOG_DEBUG("session_key: ",
-                  session_key.to_string(),
-                  ", current sessions count: ",
-                  table->sessions.size());
         auto session_it = table->sessions.find(session_key);
         if (session_it == table->sessions.end()) {
-            //create new session
+            LOG_DEBUG("create new session session_key: ",
+                      session_key.to_string(),
+                      ", current sessions count: ",
+                      table->sessions.size());
             const auto& comm_addr =
                 observer_ptr->template get_comm_data<ccl::group_split_type::cluster,
                                                      ccl::device_topology_type::ring>();
@@ -184,6 +190,10 @@ class ipc_ctx : public observer::base_scaling_ctx<ipc_ctx<Impl, types...>,
         else {
             //renew existing
             sess = session_it->second;
+            LOG_DEBUG("session reuse: session_key: ",
+                      session_key.to_string(),
+                      ", current sessions count: ",
+                      table->sessions.size());
         }
 
         append_session_for_processing(session_key, sess);
diff --git a/src/common/comm/l0/context/scaling_ctx/ipc_ctx_impl.hpp b/src/common/comm/l0/context/scale/ipc/ipc_ctx_impl.hpp
similarity index 96%
rename from src/common/comm/l0/context/scaling_ctx/ipc_ctx_impl.hpp
rename to src/common/comm/l0/context/scale/ipc/ipc_ctx_impl.hpp
index 673b1b975..35fdafe5d 100644
--- a/src/common/comm/l0/context/scaling_ctx/ipc_ctx_impl.hpp
+++ b/src/common/comm/l0/context/scale/ipc/ipc_ctx_impl.hpp
@@ -14,10 +14,10 @@
  limitations under the License.
 */
 #pragma once
-#include "common/comm/l0/context/scaling_ctx/ipc_ctx.hpp"
+#include "common/comm/l0/context/scale/ipc/ipc_ctx.hpp"
 #include "common/utils/tuple.hpp"
 
-#include "common/comm/l0/context/scaling_ctx/ipc_ctx_session.hpp"
+#include "common/comm/l0/context/scale/ipc/ipc_ctx_session.hpp"
 #include "common/log/log.hpp"
 #include "common/comm/host_communicator/host_communicator.hpp"
 #include "common/comm/l0/devices/communication_structs/ipc_client.hpp"
@@ -238,16 +238,19 @@ void ipc_ctx<TEMPLATE_DEF_ARG>::listener(ccl_ipc_gpu_comm* listener_device) {
         {
             std::unique_lock<std::mutex> lk(delivery_mutex);
             delivery_condition.wait(lk, [this]() {
-                return !processing_queue.empty();
+                return !processing_queue.empty() || stop.load();
             });
 
             sessions_to_execute.splice(sessions_to_execute.end(), processing_queue);
         }
 
-        LOG_DEBUG("Sessions for processing: ", sessions_to_execute.size());
+        LOG_DEBUG("Sessions for processing: ",
+                  sessions_to_execute.size(),
+                  " stop flag status: ",
+                  stop.load());
         for (auto sess_it = sessions_to_execute.begin();
              sess_it != sessions_to_execute.end() and !stop.load();) {
-            shared_session_ptr sess = *sess_it;
+            shared_session_ptr_t sess = *sess_it;
 
             // try restore IPC handles
             LOG_DEBUG("process session: ", sess->to_string());
diff --git a/src/common/comm/l0/context/scaling_ctx/ipc_ctx_session.cpp b/src/common/comm/l0/context/scale/ipc/ipc_ctx_session.cpp
similarity index 97%
rename from src/common/comm/l0/context/scaling_ctx/ipc_ctx_session.cpp
rename to src/common/comm/l0/context/scale/ipc/ipc_ctx_session.cpp
index caf56e411..baa3b0539 100644
--- a/src/common/comm/l0/context/scaling_ctx/ipc_ctx_session.cpp
+++ b/src/common/comm/l0/context/scale/ipc/ipc_ctx_session.cpp
@@ -14,8 +14,8 @@
  limitations under the License.
 */
 #include <sstream>
-#include "common/comm/l0/context/scaling_ctx/ipc_ctx_session.hpp"
-#include "common/comm/l0/context/scaling_ctx/ipc_ctx_utils.hpp"
+#include "common/comm/l0/context/scale/ipc/ipc_ctx_session.hpp"
+#include "common/comm/l0/context/scale/ipc/ipc_ctx_utils.hpp"
 #include "common/log/log.hpp"
 #include "common/comm/host_communicator/host_communicator.hpp"
 
@@ -118,6 +118,7 @@ bool session::process(const ccl_ipc_gpu_comm* indexed_ipc_dst_devices,
             LOG_ERROR("Cannot recover IPC handle by index: ", num_handles, ", error:\n", ex.what());
             throw;
         }
+        num_handles++;
     }
 
     // handles received
diff --git a/src/common/comm/l0/context/scaling_ctx/ipc_ctx_session.hpp b/src/common/comm/l0/context/scale/ipc/ipc_ctx_session.hpp
similarity index 68%
rename from src/common/comm/l0/context/scaling_ctx/ipc_ctx_session.hpp
rename to src/common/comm/l0/context/scale/ipc/ipc_ctx_session.hpp
index cf0f98445..73e015f8b 100644
--- a/src/common/comm/l0/context/scaling_ctx/ipc_ctx_session.hpp
+++ b/src/common/comm/l0/context/scale/ipc/ipc_ctx_session.hpp
@@ -17,8 +17,9 @@
 #include <atomic>
 #include <map>
 #include <memory>
-#include "common/comm/l0/context/scaling_ctx/ipc_ctx_utils.hpp"
-#include "common/comm/l0/context/scaling_ctx/ipc_session_key.hpp"
+#include "coll/coll_param.hpp"
+#include "common/comm/l0/context/scale/ipc/ipc_ctx_utils.hpp"
+#include "common/comm/l0/context/scale/ipc/ipc_session_key.hpp"
 #include "common/comm/l0/modules/supported_modules.hpp"
 
 namespace ccl {
@@ -77,16 +78,18 @@ class session {
     std::atomic<bool> finished;
 };
 
-using shared_session_ptr = std::shared_ptr<session>;
+using shared_session_ptr_t = std::shared_ptr<session>;
 
 /* High level session
  * Contains collective communication data
  */
-template <ccl_coll_type coll_type, class kernel_params, ccl::device_topology_type class_id>
+template <ccl_coll_type coll_type, ccl::device_topology_type class_id>
 struct typed_ipc_session : public session {
     typed_ipc_session(origin_ipc_memory_container&& ipc_src_memory_handles,
-                      size_t source_ipc_device_rank)
-            : session(std::move(ipc_src_memory_handles), source_ipc_device_rank) {}
+                      size_t source_ipc_device_rank,
+                      const coll_param_gpu& kernel_params)
+            : session(std::move(ipc_src_memory_handles), source_ipc_device_rank),
+              kernel_params(kernel_params) {}
 
     void visit(const ccl_ipc_gpu_comm* source,
                native::supported_device_modules<ipc_dst_device_coll_module>& ipc_modules) override {
@@ -99,9 +102,8 @@ struct typed_ipc_session : public session {
         assert(module_ptr);
 
         // get appropriate kernel
-        auto& kernel = module_ptr->template get_class<typename module_t::main_class>()
-                           .template get<kernel_params>();
-        using kernel_t = typename std::decay<decltype(kernel)>::type;
+        auto& kernel =
+            module_ptr->template get_class<typename module_t::main_class>().get(kernel_params);
 
         // get recovered ipc handles
         auto data_it = data_to_recover.ipc_memory_storage.find(source);
@@ -110,23 +112,11 @@ struct typed_ipc_session : public session {
         }
 
         // bind data
-        const recovered_handles_storage::restored_ipc_memory_container& ipc_handles =
-            data_it->second;
-        typename kernel_t::tmp_recv_buf_arg_type tmp_recv_buf =
-            reinterpret_cast<typename kernel_t::tmp_recv_buf_arg_type>(
-                ipc_handles.at(0).get().pointer);
-        kernel.template set_arg<typename kernel_t::tmp_recv_buf_arg>(tmp_recv_buf);
-
-        typename kernel_t::income_data_flag_arg_type inc =
-            reinterpret_cast<typename kernel_t::income_data_flag_arg_type>(
-                ipc_handles.at(1).get().pointer);
-        kernel.template set_arg<typename kernel_t::income_data_flag_arg>(inc);
-
-        typename kernel_t::ready_to_recv_flag_arg_type ready =
-            reinterpret_cast<typename kernel_t::ready_to_recv_flag_arg_type>(
-                ipc_handles.at(2).get().pointer);
-        kernel.template set_arg<typename kernel_t::ready_to_recv_flag_arg>(ready);
+        const auto& ipc_handles = data_it->second;
+        kernel.bind_data(ipc_handles);
     }
+
+    coll_param_gpu kernel_params;
 };
 
 // session owner
@@ -140,11 +130,10 @@ struct session_table {
                                             const std::string& peer_addr,
                                             ipc_invoke_params_type&& params,
                                             size_t source_device_rank) {
-        using specific_session = typed_ipc_session<ipc_invoke_params_type::get_coll_type(),
-                                                   typename ipc_invoke_params_type::kernel_params_t,
-                                                   class_id>;
-        auto sess =
-            std::make_shared<specific_session>(std::move(params.handles), source_device_rank);
+        using specific_session =
+            typed_ipc_session<ipc_invoke_params_type::get_coll_type(), class_id>;
+        auto sess = std::make_shared<specific_session>(
+            std::move(params.handles), source_device_rank, params.get_kernel_params());
         sessions.emplace(key, sess);
 
         start_session(sess, client, peer_addr);
@@ -152,7 +141,7 @@ struct session_table {
     }
 
     std::string to_string() const;
-    std::map<session_key_t, shared_session_ptr> sessions{};
+    std::map<session_key_t, shared_session_ptr_t> sessions{};
 
     static size_t get_unique_tag();
 
@@ -162,5 +151,4 @@ struct session_table {
                        const std::string& peer_addr);
 };
 
-using shared_session_table_ptr = std::shared_ptr<session_table>;
 } // namespace native
diff --git a/src/common/comm/l0/context/scaling_ctx/ipc_ctx_utils.cpp b/src/common/comm/l0/context/scale/ipc/ipc_ctx_utils.cpp
similarity index 96%
rename from src/common/comm/l0/context/scaling_ctx/ipc_ctx_utils.cpp
rename to src/common/comm/l0/context/scale/ipc/ipc_ctx_utils.cpp
index 2699e6b21..0233e7faf 100644
--- a/src/common/comm/l0/context/scaling_ctx/ipc_ctx_utils.cpp
+++ b/src/common/comm/l0/context/scale/ipc/ipc_ctx_utils.cpp
@@ -14,7 +14,7 @@
  limitations under the License.
 */
 #include "common/log/log.hpp"
-#include "common/comm/l0/context/scaling_ctx/ipc_ctx_utils.hpp"
+#include "common/comm/l0/context/scale/ipc/ipc_ctx_utils.hpp"
 #include "common/comm/l0/devices/devices_declaration.hpp"
 
 namespace native {
diff --git a/src/common/comm/l0/context/scaling_ctx/ipc_ctx_utils.hpp b/src/common/comm/l0/context/scale/ipc/ipc_ctx_utils.hpp
similarity index 100%
rename from src/common/comm/l0/context/scaling_ctx/ipc_ctx_utils.hpp
rename to src/common/comm/l0/context/scale/ipc/ipc_ctx_utils.hpp
diff --git a/src/common/comm/l0/context/scaling_ctx/ipc_session_key.cpp b/src/common/comm/l0/context/scale/ipc/ipc_session_key.cpp
similarity index 92%
rename from src/common/comm/l0/context/scaling_ctx/ipc_session_key.cpp
rename to src/common/comm/l0/context/scale/ipc/ipc_session_key.cpp
index d05cd3dd0..6acabdfa8 100644
--- a/src/common/comm/l0/context/scaling_ctx/ipc_session_key.cpp
+++ b/src/common/comm/l0/context/scale/ipc/ipc_session_key.cpp
@@ -13,7 +13,7 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "common/comm/l0/context/scaling_ctx/ipc_session_key.hpp"
+#include "common/comm/l0/context/scale/ipc/ipc_session_key.hpp"
 
 namespace native {
 
diff --git a/src/common/comm/l0/context/scaling_ctx/ipc_session_key.hpp b/src/common/comm/l0/context/scale/ipc/ipc_session_key.hpp
similarity index 78%
rename from src/common/comm/l0/context/scaling_ctx/ipc_session_key.hpp
rename to src/common/comm/l0/context/scale/ipc/ipc_session_key.hpp
index 831558bcf..42e36548e 100644
--- a/src/common/comm/l0/context/scaling_ctx/ipc_session_key.hpp
+++ b/src/common/comm/l0/context/scale/ipc/ipc_session_key.hpp
@@ -20,21 +20,29 @@
 
 #include "oneapi/ccl/native_device_api/l0/device.hpp"
 #include "coll/algorithms/algorithms_enum.hpp"
+#include "coll/coll_param.hpp"
 
 namespace native {
 
-template <ccl_coll_type type, class kernel_params>
+template <ccl_coll_type type>
 struct ipc_invoke_params {
-    using kernel_params_t = kernel_params;
-
-    ipc_invoke_params(std::vector<ccl_device::device_ipc_memory_handle>&& h)
-            : handles(std::move(h)) {}
+    ipc_invoke_params(std::vector<ccl_device::device_ipc_memory_handle>&& h,
+                      const coll_param_gpu& params)
+            : handles(std::move(h)),
+              params{ params } {}
 
     static constexpr ccl_coll_type get_coll_type() {
         return type;
     }
 
+    const coll_param_gpu& get_kernel_params() const {
+        return params;
+    }
+
     std::vector<ccl_device::device_ipc_memory_handle> handles;
+    // TODO: can we guarantee that this object is not destroyed before l0 entry and
+    // use const& here?
+    coll_param_gpu params;
 };
 
 struct ipc_session_key {
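`ipc_invoke_params` shows the recurring theme of this patch: kernel parameters move from a compile-time template argument (`kernel_params_t`) to a runtime value (`coll_param_gpu`), so one session type can serve every datatype/reduction combination. A toy before/after sketch (hypothetical types):

```cpp
#include <utility>
#include <vector>

// before: parameters were a compile-time template argument,
// forcing one instantiation per datatype/reduction pair
template <class kernel_params_t>
struct invoke_params_static {
    std::vector<int> handles;
};

// after: parameters travel as a runtime value (like coll_param_gpu)
struct coll_params {
    int datatype_id;
};

struct invoke_params_dynamic {
    invoke_params_dynamic(std::vector<int>&& h, const coll_params& p)
            : handles(std::move(h)),
              params(p) {}
    const coll_params& get_kernel_params() const {
        return params;
    }
    std::vector<int> handles;
    coll_params params;
};

int main() {
    invoke_params_dynamic p({ 1, 2, 3 }, coll_params{ /*datatype_id*/ 7 });
    return p.get_kernel_params().datatype_id == 7 ? 0 : 1;
}
```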
diff --git a/src/common/comm/l0/context/scaling_ctx/numa_ctx.hpp b/src/common/comm/l0/context/scale/numa/numa_ctx.hpp
similarity index 74%
rename from src/common/comm/l0/context/scaling_ctx/numa_ctx.hpp
rename to src/common/comm/l0/context/scale/numa/numa_ctx.hpp
index 745f04a6c..7a1b30ae0 100644
--- a/src/common/comm/l0/context/scaling_ctx/numa_ctx.hpp
+++ b/src/common/comm/l0/context/scale/numa/numa_ctx.hpp
@@ -15,8 +15,9 @@
 */
 #pragma once
 #include "common/comm/l0/context/base_scaling_ctx.hpp"
-#include "common/comm/l0/context/scaling_ctx/observer_session_key.hpp"
-#include "common/comm/l0/context/scaling_ctx/observer_ctx_session.hpp"
+#include "common/comm/l0/context/scale/base/base_session.hpp"
+#include "common/comm/l0/context/scale/base/base_session_table.hpp"
+#include "common/comm/l0/context/scale/numa/numa_session.hpp"
 
 namespace native {
 
@@ -26,10 +27,12 @@ class ccl_virtual_gpu_comm;
 template <class device>
 class ccl_numa_proxy;
 
+#define NUMA_CTX_DEVICE_PROXY_TYPES(observer_type) \
+    observer_type<ccl_gpu_comm>, observer_type<ccl_virtual_gpu_comm>
+
 template <class Impl, ccl::device_topology_type... types>
 class numa_ctx : public observer::base_scaling_ctx<numa_ctx<Impl, types...>,
-                                                   ccl_numa_proxy<ccl_gpu_comm>,
-                                                   ccl_numa_proxy<ccl_virtual_gpu_comm>> {
+                                                   NUMA_CTX_DEVICE_PROXY_TYPES(ccl_numa_proxy)> {
 public:
     static_assert(sizeof...(types), "types must be not 0");
     using context_impl = Impl;
@@ -38,11 +41,14 @@ class numa_ctx : public observer::base_scaling_ctx<numa_ctx<Impl, types...>,
     using observer_t = ccl_numa_proxy<device_t>;
 
     using scaling_ctx_base_t = observer::base_scaling_ctx<numa_ctx<Impl, types...>,
-                                                          observer_t<ccl_gpu_comm>,
-                                                          observer_t<ccl_virtual_gpu_comm>>;
+                                                          NUMA_CTX_DEVICE_PROXY_TYPES(observer_t)>;
+
+    using session_t = observer::numa_session_iface; //TODO: numa_session
+    using session_ptr_t = std::shared_ptr<session_t>;
+    using base_session_table_t = observer::session_table<session_t>;
+    using base_session_table_ptr_t = std::shared_ptr<base_session_table_t>;
 
-    using numa_actor = observer::subscribed_actor<std::shared_ptr<observer::session>,
-                                                  observer::session_notification>;
+    using numa_actor = observer::subscribed_actor<session_ptr_t, observer::session_notification>;
 
     using observable_scale_up_topologies =
         typename scaling_ctx_base_t::template observable_topologies<types...>;
@@ -56,25 +62,25 @@ class numa_ctx : public observer::base_scaling_ctx<numa_ctx<Impl, types...>,
     // session data
     template <class NUMA_source_device_t, ccl_coll_type coll_type>
     struct device_session_data {
-        std::map<NUMA_source_device_t*, std::shared_ptr<observer::session_table>> source_sessions;
+        std::map<NUMA_source_device_t*, base_session_table_ptr_t> source_sessions;
     };
 
     //TODO make table PER thread!!!
-    template <ccl_coll_type coll_type>
-    using session_table_t =
-        std::tuple<device_session_data<observer_t<ccl_gpu_comm>, coll_type>,
-                   device_session_data<observer_t<ccl_virtual_gpu_comm>, coll_type>>;
+    template <ccl_coll_type coll_type, class... devices_types>
+    using session_table_t = std::tuple<device_session_data<devices_types, coll_type>...>;
 
     template <ccl_coll_type... coll_type>
-    using session_table_typed_storage_t = std::tuple<session_table_t<coll_type>...>;
+    using session_table_typed_storage_t =
+        std::tuple<session_table_t<coll_type, NUMA_CTX_DEVICE_PROXY_TYPES(observer_t)>...>;
 
     struct session_table_initializer {
         template <ccl_coll_type coll_type, class device_t>
-        void operator()(session_table_t<coll_type>& table, observer_t<device_t>* observer_ptr) {
+        void operator()(session_table_t<coll_type, NUMA_CTX_DEVICE_PROXY_TYPES(observer_t)>& table,
+                        observer_t<device_t>* observer_ptr) {
             auto& sessions_table =
                 ccl_tuple_get<device_session_data<observer_t<device_t>, coll_type>>(table);
             sessions_table.source_sessions.emplace(
-                observer_ptr, std::make_shared<observer::session_table>(observer::session_table{}));
+                observer_ptr, std::make_shared<base_session_table_t>(base_session_table_t{}));
         }
     };
 
@@ -112,6 +118,11 @@ class numa_ctx : public observer::base_scaling_ctx<numa_ctx<Impl, types...>,
         //Try to find existing session owner for coll type
         auto& sessions_table = ccl_tuple_get<device_session_data<observer_t<device_t>, coll_type>>(
             std::get<coll_type>(collective_sessions));
+
+        // In general, sessions_table.source_sessions.find(observer_ptr) is accessed from multiple threads,
+        // but writes happen only during the wire-up phase, when observers are inserted during topology construction.
+        // After that, access follows the "multiple readers - no writers" model,
+        // so the map can be read without mutex protection.
         auto session_table_it = sessions_table.source_sessions.find(observer_ptr);
         if (session_table_it == sessions_table.source_sessions.end()) {
             std::stringstream ss;
@@ -128,13 +139,13 @@ class numa_ctx : public observer::base_scaling_ctx<numa_ctx<Impl, types...>,
             abort();
         }
 
-        std::shared_ptr<observer::session_table> table = session_table_it->second;
+        base_session_table_ptr_t table = session_table_it->second;
         if (!table) {
             LOG_ERROR("session_key: ", sess_key.to_string(), ", session table is empty. Abort");
             abort();
         }
 
-        std::shared_ptr<observer::session> sess;
+        session_ptr_t sess;
         LOG_DEBUG("session_key: ",
                   sess_key.to_string(),
                   ", current sessions count: ",
@@ -142,7 +153,7 @@ class numa_ctx : public observer::base_scaling_ctx<numa_ctx<Impl, types...>,
         auto session_it = table->sessions.find(sess_key);
         if (session_it == table->sessions.end()) {
             //create new session
-            sess = table->create_session<class_id>(
+            sess = table->template create_session<observer::numa_session, class_id>(
                 sess_key, param, registered_index, registered_devices_count);
         }
         else {
@@ -175,10 +186,9 @@ class numa_ctx : public observer::base_scaling_ctx<numa_ctx<Impl, types...>,
     template <ccl::device_topology_type topology_type, class device_t>
     void register_observer_impl(size_t rank_addr, observer_t<device_t>* observer_ptr);
 
-    using devices_tuple_thread_map =
-        std::tuple<observer::device_thread_map<observer_t<ccl_gpu_comm>, numa_actor>,
-                   observer::device_thread_map<observer_t<ccl_virtual_gpu_comm>, numa_actor>>;
-    devices_tuple_thread_map numa_workers;
+    using specific_device_tuple_thread_map_t =
+        observer::multiple_device_thread_map_t<numa_actor, NUMA_CTX_DEVICE_PROXY_TYPES(observer_t)>;
+    specific_device_tuple_thread_map_t numa_workers;
 
     template <class device_t>
     void worker(observer_t<device_t>* device,
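`NUMA_CTX_DEVICE_PROXY_TYPES` names the proxy-type list once so that the base class, the session tables, and the worker maps cannot drift apart. A toy version of the trick:

```cpp
#include <tuple>

// toy stand-ins for the real device types
struct gpu {};
struct virtual_gpu {};

template <class device>
struct proxy {};

// the macro names the full proxy list once, so every variadic user
// (base class, session tables, thread maps) stays in sync
#define DEVICE_PROXY_TYPES(observer_type) observer_type<gpu>, observer_type<virtual_gpu>

using workers_t = std::tuple<DEVICE_PROXY_TYPES(proxy)>;
static_assert(std::tuple_size<workers_t>::value == 2, "two proxy types expected");

int main() {}
```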
diff --git a/src/common/comm/l0/context/scaling_ctx/numa_ctx_impl.hpp b/src/common/comm/l0/context/scale/numa/numa_ctx_impl.hpp
similarity index 94%
rename from src/common/comm/l0/context/scaling_ctx/numa_ctx_impl.hpp
rename to src/common/comm/l0/context/scale/numa/numa_ctx_impl.hpp
index d3c6fa68b..2e3d3021e 100644
--- a/src/common/comm/l0/context/scaling_ctx/numa_ctx_impl.hpp
+++ b/src/common/comm/l0/context/scale/numa/numa_ctx_impl.hpp
@@ -14,7 +14,7 @@
  limitations under the License.
 */
 #pragma once
-#include "common/comm/l0/context/scaling_ctx/numa_ctx.hpp"
+#include "common/comm/l0/context/scale/numa/numa_ctx.hpp"
 #include "common/utils/tuple.hpp"
 #include "common/log/log.hpp"
 
@@ -107,10 +107,10 @@ void numa_ctx<TEMPLATE_DEF_ARG>::worker(observer_t<device_t>* listener_device,
         size_t partial_chunk_size = 0;
 
         // get own device partial chunk data
-        if ((*sess_it)->produce_data(&partial_chunk, partial_chunk_size)) {
+        (*sess_it)->produce_data(&partial_chunk, partial_chunk_size);
+        if (partial_chunk_size > 0) {
             // notify other actor for data_ready
-            observer::detail::actor_publisher<std::shared_ptr<observer::session>,
-                                              observer::session_notification>
+            observer::detail::actor_publisher<session_ptr_t, observer::session_notification>
                 visitor;
             ccl_tuple_for_each_args(numa_workers,
                                     visitor,
@@ -130,8 +130,9 @@ void numa_ctx<TEMPLATE_DEF_ARG>::worker(observer_t<device_t>* listener_device,
                 actor_index % total_actors_count, (*sess_it)->get_send_tag(), messages);
 
             for (auto mess_it = messages.begin(); mess_it != messages.end(); ++mess_it) {
-                session_finished = (*sess_it)->consume_data(
+                (*sess_it)->consume_data(
                     0 /*TODO !!!! */, mess_it->host_src_ptr, mess_it->src_size_bytes);
+                session_finished = (*sess_it)->is_consumed();
                 assert(not(session_finished && std::next(mess_it, 1) != messages.end()) &&
                        "Session are filled too early");
             }
diff --git a/src/common/comm/l0/context/scale/numa/numa_session.hpp b/src/common/comm/l0/context/scale/numa/numa_session.hpp
new file mode 100644
index 000000000..d7f6f799a
--- /dev/null
+++ b/src/common/comm/l0/context/scale/numa/numa_session.hpp
@@ -0,0 +1,187 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "common/comm/l0/context/scale/base/base_session.hpp"
+
+namespace native {
+namespace observer {
+
+class numa_session_iface {
+public:
+    numa_session_iface(session_key key) : sess_key(key) {}
+    virtual ~numa_session_iface() = default;
+
+    size_t get_send_tag() const {
+        return send_tag;
+    }
+
+    const session_key& get_session_key() const {
+        return sess_key;
+    }
+
+    std::string to_string() const {
+        std::stringstream ss;
+        ss << "session key identifier: " << get_session_key();
+        return ss.str();
+    }
+
+    virtual void prepare(size_t observer_domain_index,
+                         size_t observer_domain_count,
+                         void* type_erased_param) = 0;
+
+    virtual void produce_data(void** out_chunk, size_t& out_chunk_size) = 0;
+    virtual void consume_data(size_t observer_domain_index,
+                              void* in_chunk,
+                              size_t in_chunk_size) = 0;
+    virtual bool is_consumed() noexcept = 0;
+    virtual bool is_produced() noexcept = 0;
+
+private:
+    size_t send_tag{};
+    session_key sess_key;
+};
+
+/* High level session
+ * Contains collective communication data
+ */
+template <ccl::device_topology_type class_id, class session_invoke_params>
+struct numa_session : public numa_session_iface {
+    using invoke_params_t = session_invoke_params;
+    using session_key_t = session_key;
+
+    numa_session(producer_description& in_param,
+                 const coll_param_gpu& kernel_params,
+                 size_t observer_domain_index,
+                 size_t observer_domain_count,
+                 const session_key_t& key)
+            : numa_session_iface(key),
+              kernel_params(kernel_params),
+              ctx_descr(kernel_params),
+              copy_immediate_list(std::move(in_param.immediate_list)) {
+        ctx_descr.init(in_param.staged_buffer_elem_count,
+                       observer_domain_index,
+                       observer_domain_count,
+                       in_param.context,
+                       in_param.device);
+    }
+
+    context_descr& get_ctx_descr() {
+        return ctx_descr;
+    }
+
+    const coll_param_gpu& get_kernel_params() const {
+        return kernel_params;
+    }
+
+    void prepare(size_t observer_domain_index,
+                 size_t observer_domain_count,
+                 void* type_erased_param) override {
+        auto* out_param = static_cast<invoke_params_t*>(type_erased_param);
+        ctx_descr.reset_counters(observer_domain_index, observer_domain_count);
+
+        out_param->set_out_params(ctx_descr);
+    }
+
+    void produce_data(void** out_chunk, size_t& out_chunk_size) override {
+        size_t old_consumed = get_ctx_descr().host_consumed_bytes;
+        uint64_t total_produced = *get_ctx_descr().host_mem_producer_counter->get();
+
+        size_t to_consume = total_produced - old_consumed;
+        if (to_consume) {
+            //fence
+            LOG_TRACE(to_string(),
+                      " - bytes produced: ",
+                      total_produced,
+                      ", previously bytes consumed: ",
+                      old_consumed);
+            std::atomic_thread_fence(std::memory_order::memory_order_seq_cst); // TODO: why?
+
+            // do not read data here!
+            *out_chunk =
+                static_cast<void*>(get_ctx_descr().host_mem_producer->get() + old_consumed);
+
+            // update host_consumed_bytes
+            get_ctx_descr().host_consumed_bytes += to_consume;
+        }
+
+        // TODO: set logging here
+        out_chunk_size = to_consume;
+    }
+
+    void consume_data(size_t observer_domain_index, void* in_chunk, size_t in_chunk_size) override {
+        /* TODO create event
+         * ze_event_handle_t mem_event {};
+         */
+
+        auto device_consumer_ready_bytes = get_ctx_descr().dev_mem_consumer_counter->get();
+        auto device_produced_bytes = get_ctx_descr().device_produced_bytes;
+
+        // TODO: set logging here
+
+        // copy buffer from host to device
+        ze_result_t res = zeCommandListAppendMemoryCopy(
+            copy_immediate_list.get(),
+            static_cast<void*>(get_ctx_descr().dev_mem_consumer->get() + device_produced_bytes),
+            in_chunk,
+            in_chunk_size,
+            /*mem_event*/ nullptr,
+            0,
+            nullptr);
+        if (res != ZE_RESULT_SUCCESS) {
+            throw std::runtime_error(
+                std::string(
+                    "cannot append copy NUMA host to device memory for partial result, error: ") +
+                native::to_string(res));
+        }
+        device_produced_bytes += in_chunk_size;
+        get_ctx_descr().device_produced_bytes = device_produced_bytes;
+
+        // TODO: set logging here
+        // copy size from host to device
+        res = zeCommandListAppendMemoryCopy(copy_immediate_list.get(),
+                                            device_consumer_ready_bytes,
+                                            &device_produced_bytes,
+                                            sizeof(device_produced_bytes),
+                                            nullptr,
+                                            0,
+                                            /*&mem_event*/ nullptr);
+        if (res != ZE_RESULT_SUCCESS) {
+            throw std::runtime_error(
+                std::string(
+                    "cannot append copy NUMA host to device memory for ready bytes, error: ") +
+                native::to_string(res));
+        }
+    }
+
+    bool is_consumed() noexcept override {
+        return (get_ctx_descr().device_produced_bytes *
+                ccl::get_datatype_size(get_kernel_params().get_datatype())) ==
+               get_ctx_descr().host_consumed_bytes;
+    }
+
+    bool is_produced() noexcept override {
+        return get_ctx_descr().host_expected_bytes == get_ctx_descr().host_consumed_bytes;
+    }
+
+private:
+    coll_param_gpu kernel_params;
+    context_descr ctx_descr;
+    ccl_device::device_cmd_list copy_immediate_list;
+};
+
+} // namespace observer
+} // namespace native
diff --git a/src/common/comm/l0/context/scale/scale_out/scale_out_ctx.hpp b/src/common/comm/l0/context/scale/scale_out/scale_out_ctx.hpp
new file mode 100644
index 000000000..7d1302c33
--- /dev/null
+++ b/src/common/comm/l0/context/scale/scale_out/scale_out_ctx.hpp
@@ -0,0 +1,214 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+#include "common/comm/l0/context/base_scaling_ctx.hpp"
+#include "common/comm/l0/context/scale/base/base_session.hpp"
+#include "common/comm/l0/context/scale/scale_out/scale_out_session.hpp"
+#include "common/comm/l0/context/scale/base/base_session_table.hpp"
+
+namespace ccl {
+class host_communicator;
+}
+
+namespace native {
+
+class ccl_gpu_comm;
+class ccl_virtual_gpu_comm;
+
+template <class device>
+class ccl_scaleout_proxy;
+
+template <class device>
+class ccl_gpu_scaleup_proxy;
+
+template <class device>
+class ccl_numa_proxy;
+
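+// expands observer_type<T> over every supported device wrapper combination:
+// real/virtual GPU, optionally wrapped by the NUMA and/or scale-up proxies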
+#define SCALE_OUT_CTX_DEVICE_PROXY_TYPES(observer_type) \
+    observer_type<ccl_gpu_comm>, observer_type<ccl_virtual_gpu_comm>, \
+        observer_type<ccl_numa_proxy<ccl_gpu_comm>>, \
+        observer_type<ccl_numa_proxy<ccl_virtual_gpu_comm>>, \
+        observer_type<ccl_gpu_scaleup_proxy<ccl_gpu_comm>>, \
+        observer_type<ccl_gpu_scaleup_proxy<ccl_virtual_gpu_comm>>, \
+        observer_type<ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_gpu_comm>>>, \
+        observer_type<ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_virtual_gpu_comm>>>
+
+template <class Impl, ccl::device_topology_type... types>
+class scale_out_ctx
+        : public observer::base_scaling_ctx<scale_out_ctx<Impl, types...>,
+                                            SCALE_OUT_CTX_DEVICE_PROXY_TYPES(ccl_scaleout_proxy)> {
+public:
+    using context_impl = Impl;
+
+    template <class device_t>
+    using observer_t = ccl_scaleout_proxy<device_t>;
+
+    using scaling_ctx_base_t =
+        observer::base_scaling_ctx<scale_out_ctx<Impl, types...>,
+                                   SCALE_OUT_CTX_DEVICE_PROXY_TYPES(observer_t)>;
+
+    using session_t = observer::scale_out_session_iface;
+    using session_ptr_t = std::shared_ptr<session_t>;
+    using spec_session_table_t = observer::session_table<session_t>;
+    using spec_session_table_ptr_t = std::shared_ptr<spec_session_table_t>;
+
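+    // each registered scaleout device is served by a dedicated actor that
+    // processes the queue of sessions assigned to it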
+    using scaleout_actor = observer::actor<session_ptr_t>;
+
+    using observable_scale_up_topologies =
+        typename scaling_ctx_base_t::template observable_topologies<types...>;
+    using indexed_observable_topologies =
+        typename scaling_ctx_base_t::template indexed_observable_topologies<types...>;
+
+    observable_scale_up_topologies observables;
+    indexed_observable_topologies indexed_observables;
+
+    // session data
+    template <class scaleout_source_device_t, ccl_coll_type coll_type>
+    struct device_session_data {
+        std::map<scaleout_source_device_t*, spec_session_table_ptr_t> source_sessions;
+    };
+
+    //TODO make table PER thread!!!
+    template <ccl_coll_type coll_type, class... devices_types>
+    using session_table_t = std::tuple<device_session_data<devices_types, coll_type>...>;
+
+    template <ccl_coll_type... coll_type>
+    using session_table_typed_storage_t =
+        std::tuple<session_table_t<coll_type, SCALE_OUT_CTX_DEVICE_PROXY_TYPES(observer_t)>...>;
+
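+    // registration-time functor: creates an empty session table for the given
+    // observer inside the per-collective-type storage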
+    struct session_table_initializer {
+        template <ccl_coll_type coll_type, class device_t>
+        void operator()(
+            session_table_t<coll_type, SCALE_OUT_CTX_DEVICE_PROXY_TYPES(observer_t)>& table,
+            observer_t<device_t>* observer_ptr) {
+            auto& sessions_table =
+                ccl_tuple_get<device_session_data<observer_t<device_t>, coll_type>>(table);
+            sessions_table.source_sessions.emplace(
+                observer_ptr, std::make_shared<spec_session_table_t>(spec_session_table_t{}));
+        }
+    };
+
+    session_table_typed_storage_t<CCL_COLL_LIST> collective_sessions;
+
+    void initialize_ctx(std::shared_ptr<ccl::host_communicator> communicator);
+
+    //observer subject interface implementations
+    template <class device_t, ccl::device_topology_type topology_type>
+    void attach_ctx_observer(size_t rank_addr,
+                             observer_t<device_t>* observer_ptr,
+                             std::integral_constant<ccl::device_topology_type, topology_type> val) {
+        register_observer_impl<topology_type>(rank_addr, observer_ptr);
+    }
+
+    template <class device_t, ccl::device_topology_type class_id, class invoke_params_t>
+    void invoke_ctx_observer(observer_t<device_t>* observer_ptr,
+                             std::integral_constant<ccl::device_topology_type, class_id> val,
+                             const observer::session_key& sess_key,
+                             invoke_params_t& param) {
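+        // flow: resolve the observer's registered index, find or create the
+        // session for this key, then hand it to the actor that owns this observer
+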
+        // sanity check: the proxy must already be registered
+        observer::container_t<observer_t<device_t>>& container =
+            scaling_ctx_base_t::template get_types_container<observer_t<device_t>, class_id>(
+                observables);
+
+        auto it = container.find(observer_ptr);
+        if (it == container.end()) {
+            throw std::runtime_error(std::string("ScaleOut Observer is not registered: ") +
+                                     observer_ptr->to_string() +
+                                     " total count: " + std::to_string(container.size()));
+        }
+        size_t registered_index = std::distance(container.begin(), it);
+
+        static constexpr ccl_coll_type coll_type = invoke_params_t::get_coll_type();
+        //try to find an existing session owner for this coll type
+        auto& sessions_table = ccl_tuple_get<device_session_data<observer_t<device_t>, coll_type>>(
+            std::get<coll_type>(collective_sessions));
+        auto session_table_it = sessions_table.source_sessions.find(observer_ptr);
+        if (session_table_it == sessions_table.source_sessions.end()) {
+            std::stringstream ss;
+            ss << "sessions count: " << sessions_table.source_sessions.size() << std::endl;
+            for (const auto& val : sessions_table.source_sessions) {
+                ss << val.first->to_string() << ", " << val.second->to_string() << std::endl;
+            }
+            LOG_ERROR("session_key: ",
+                      sess_key.to_string(),
+                      ", cannot find source session for device: ",
+                      observer_ptr->to_string(),
+                      ". Available keys: ",
+                      ss.str());
+            abort();
+        }
+
+        auto table = session_table_it->second;
+        if (!table) {
+            LOG_ERROR("session_key: ", sess_key.to_string(), ", session table is empty. Abort");
+            abort();
+        }
+
+        session_ptr_t sess;
+        LOG_DEBUG("session_key: ",
+                  sess_key.to_string(),
+                  ", current sessions count: ",
+                  table->sessions.size());
+        auto session_it = table->sessions.find(sess_key);
+        if (session_it == table->sessions.end()) {
+            //create new session
+            sess = table->template create_session<observer::scale_out_session, class_id>(
+                sess_key, param, registered_index, registered_devices_count);
+        }
+        else {
+            //renew existing
+            sess = session_it->second;
+            sess->prepare(
+                registered_index, registered_devices_count, reinterpret_cast<void*>(&param));
+
+            //param.reset_counters(registered_index, container.size());
+        }
+
+        // notify actor-owner
+        const auto& thread_map =
+            ccl_tuple_get<observer::device_thread_map<observer_t<device_t>, scaleout_actor>>(
+                scaleout_workers);
+        auto actor_it = thread_map.find(observer_ptr);
+        if (actor_it == thread_map.end()) {
+            LOG_ERROR("Unregistered observer: ",
+                      observer_ptr->to_string(),
+                      ", thread_map size: ",
+                      thread_map.size(),
+                      " . Abort");
+            abort();
+        }
+
+        actor_it->second->start_job(sess);
+    }
+
+private:
+    template <ccl::device_topology_type class_id, class device_t>
+    void register_observer_impl(size_t rank_addr, observer_t<device_t>* observer_ptr);
+
+    using specific_devices_tuple_thread_map =
+        observer::multiple_device_thread_map_t<scaleout_actor,
+                                               SCALE_OUT_CTX_DEVICE_PROXY_TYPES(observer_t)>;
+    specific_devices_tuple_thread_map scaleout_workers;
+
+    template <class device_t>
+    void worker(observer_t<device_t>* device,
+                scaleout_actor* actor_ptr,
+                typename scaleout_actor::storage_t& todo_list);
+    size_t registered_devices_count{};
+
+    std::shared_ptr<ccl::host_communicator> process_communicator;
+};
+} // namespace native
diff --git a/src/common/comm/l0/context/scale/scale_out/scale_out_ctx_impl.hpp b/src/common/comm/l0/context/scale/scale_out/scale_out_ctx_impl.hpp
new file mode 100644
index 000000000..39aed531f
--- /dev/null
+++ b/src/common/comm/l0/context/scale/scale_out/scale_out_ctx_impl.hpp
@@ -0,0 +1,134 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+#include "common/comm/l0/context/scale/scale_out/scale_out_ctx.hpp"
+#include "common/log/log.hpp"
+#include "common/comm/host_communicator/host_communicator.hpp"
+
+namespace native {
+
+#define TEMPLATE_DECL_ARG class Impl, ccl::device_topology_type... types
+#define TEMPLATE_DEF_ARG  Impl, types...
+
+template <TEMPLATE_DECL_ARG>
+void scale_out_ctx<TEMPLATE_DEF_ARG>::initialize_ctx(
+    std::shared_ptr<ccl::host_communicator> communicator) {
+    process_communicator = communicator;
+
+    LOG_INFO("SCALE-OUT context Initialized for mpi rank: (",
+             std::to_string(communicator->rank()),
+             "/",
+             std::to_string(communicator->size()),
+             ")");
+}
+
+// observer_ptr interface implementations
+template <TEMPLATE_DECL_ARG>
+template <ccl::device_topology_type class_id, class device_t>
+void scale_out_ctx<TEMPLATE_DEF_ARG>::register_observer_impl(size_t rank_addr,
+                                                             observer_t<device_t>* observer_ptr) {
+    LOG_INFO("scaleout device rank addr: ",
+             std::to_string(rank_addr),
+             ", device: ",
+             observer_ptr->to_string());
+    observer::container_t<observer_t<device_t>>& container =
+        scaling_ctx_base_t::template get_types_container<observer_t<device_t>, class_id>(
+            observables);
+    auto cont_it = container.find(observer_ptr);
+    if (cont_it == container.end()) {
+        container.insert(observer_ptr);
+        // remember total count
+        registered_devices_count++;
+
+        // prepare session tables
+        session_table_initializer init;
+        ccl_tuple_for_each_args(collective_sessions, init, observer_ptr);
+
+        if (rank_addr == std::numeric_limits<size_t>::max()) {
+            return; // nothing more to do
+        }
+    }
+
+    //reassign with index
+    assert(rank_addr != std::numeric_limits<size_t>::max() &&
+           "Re-registration of an observer requires an assigned rank address");
+
+    observer::indexed_container_t<observer_t<device_t>>& indexed_container =
+        scaling_ctx_base_t::template get_types_container<observer_t<device_t>, class_id>(
+            indexed_observables);
+
+    auto indexed_it = indexed_container.find(rank_addr);
+    if (indexed_it != indexed_container.end()) {
+        // collect troubleshooting info
+        std::stringstream ss;
+        for (const auto& indexed_dev : indexed_container) {
+            ss << "rank: " << indexed_dev.first << ", dev: " << indexed_dev.second->to_string()
+               << "\n";
+        }
+        throw std::runtime_error(std::string(__PRETTY_FUNCTION__) +
+                                 " - Cannot reassign rank: " + std::to_string(rank_addr) +
+                                 " for SCALEOUT device:\n" + observer_ptr->to_string() +
+                                 "\nbecause it is already registered:\n" + ss.str());
+    }
+
+    indexed_container.emplace(rank_addr, observer_ptr);
+
+    // start SCALEOUT worker
+    auto& thread_map =
+        ccl_tuple_get<observer::device_thread_map<observer_t<device_t>, scaleout_actor>>(
+            scaleout_workers);
+    {
+        std::unique_ptr<scaleout_actor> new_actor{ new scaleout_actor(
+            rank_addr, &scale_out_ctx<TEMPLATE_DEF_ARG>::worker<device_t>, this, observer_ptr) };
+        thread_map[observer_ptr] = std::move(new_actor);
+    }
+}
+
+template <TEMPLATE_DECL_ARG>
+template <class device_t>
+void scale_out_ctx<TEMPLATE_DEF_ARG>::worker(observer_t<device_t>* listener_device,
+                                             scaleout_actor* actor_ptr,
+                                             typename scaleout_actor::storage_t& todo_list) {
+    LOG_DEBUG("Start SCALEOUT context worker, Listener device: ",
+              listener_device->to_string(),
+              ",\nactor_id: ",
+              actor_ptr->get_id(),
+              ",\ntodo list size: ",
+              todo_list.size());
+
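+    // two phases: submit newly produced chunks to the host collective, then
+    // poll completion and retire sessions that are fully consumed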
+    // invoke CPU collective on data chunk
+    for (auto sess_it = todo_list.begin(); sess_it != todo_list.end();) {
+        session_ptr_t sess = *sess_it;
+
+        sess->produce_data(process_communicator);
+        ++sess_it;
+    }
+
+    // check CPU collective completion
+    for (auto sess_it = todo_list.begin(); sess_it != todo_list.end();) {
+        (*sess_it)->consume_data(0 /*TODO !!!! */, process_communicator);
+        if ((*sess_it)->is_consumed()) {
+            sess_it = todo_list.erase(sess_it);
+        }
+        else {
+            ++sess_it;
+        }
+    }
+}
+
+#undef TEMPLATE_DECL_ARG
+#undef TEMPLATE_DEF_ARG
+} // namespace native
diff --git a/src/common/comm/l0/context/scale/scale_out/scale_out_session.cpp b/src/common/comm/l0/context/scale/scale_out/scale_out_session.cpp
new file mode 100644
index 000000000..5f27b6845
--- /dev/null
+++ b/src/common/comm/l0/context/scale/scale_out/scale_out_session.cpp
@@ -0,0 +1,58 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include <sstream>
+
+#include "common/comm/l0/context/scale/scale_out/scale_out_session.hpp"
+#include "common/log/log.hpp"
+#include "common/comm/host_communicator/host_communicator.hpp"
+
+namespace native {
+namespace observer {
+
+std::string scale_out_session_iface::to_string() const {
+    std::stringstream ss;
+    ss << "sess: " << reinterpret_cast<const void*>(this);
+    return ss.str();
+}
+
+size_t scale_out_session_iface::get_send_tag() const {
+    return send_tag;
+}
+
+void ccl_worker_adapter::submit_coll_work(std::shared_ptr<ccl::host_communicator>& comm,
+                                          const session_notification& in,
+                                          session_notification_handle& out,
+                                          const coll_param_gpu& kernel_params) {
+    // only allreduce is supported for now
+    if (kernel_params.get_coll_type() == ccl_coll_allreduce) {
+        out.output_buffer.resize(in.src_size_bytes);
+        ccl::stream::impl_value_t empty_stream{};
+
+        // notice: not thread-safe
+        out.op_handle = comm->allreduce_impl(in.host_src_ptr,
+                                             out.output_buffer.data(),
+                                             in.src_size_bytes,
+                                             kernel_params.get_datatype(),
+                                             kernel_params.get_reduction(),
+                                             empty_stream,
+                                             ccl::default_allreduce_attr,
+                                             {});
+        out.op_handle_ready = true;
+    }
+}
+
+} // namespace observer
+} // namespace native
diff --git a/src/common/comm/l0/context/scale/scale_out/scale_out_session.hpp b/src/common/comm/l0/context/scale/scale_out/scale_out_session.hpp
new file mode 100644
index 000000000..43716108c
--- /dev/null
+++ b/src/common/comm/l0/context/scale/scale_out/scale_out_session.hpp
@@ -0,0 +1,171 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+
+#include "oneapi/ccl/event.hpp"
+#include "common/comm/l0/context/scale/base/base_session.hpp"
+#include "common/comm/l0/context/scale/numa/numa_session.hpp"
+
+namespace ccl {
+class host_communicator;
+}
+
+namespace native {
+namespace observer {
+
+class scale_out_session_iface {
+public:
+    scale_out_session_iface() = default;
+    virtual ~scale_out_session_iface() = default;
+
+    size_t get_send_tag() const;
+    std::string to_string() const;
+
+    virtual void prepare(size_t observer_domain_index,
+                         size_t observer_domain_count,
+                         void* type_erased_param) = 0;
+    virtual void produce_data(std::shared_ptr<ccl::host_communicator>& comm) = 0;
+    virtual void consume_data(size_t observer_domain_index,
+                              std::shared_ptr<ccl::host_communicator>& comm) = 0;
+    virtual bool is_consumed() noexcept = 0;
+    virtual bool is_produced() noexcept = 0;
+
+private:
+    size_t send_tag{};
+};
+
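+// holds the host staging buffer and the in-flight host-collective operation for
+// one produced chunk; op_handle_ready marks whether op_handle may be tested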
+struct session_notification_handle {
+    using notification_handle_t = ccl::event;
+    //using notification_handle_ptr_t = std::unique_ptr<notification_handle_t>;
+
+    //TODO use a custom allocator instead of std::vector
+    std::vector<uint8_t> output_buffer;
+    notification_handle_t op_handle;
+    //TODO
+    // the notification_handle_t interface does not distinguish between canceled
+    // and finished operations, so this flag denotes the extended state of op_handle.
+    // Use an event_impl pointer instead! Fix host_communicator to return event_impl!
+    bool op_handle_ready;
+};
+
+struct ccl_worker_adapter {
+    static void submit_coll_work(std::shared_ptr<ccl::host_communicator>& comm,
+                                 const session_notification& in,
+                                 session_notification_handle& out,
+                                 const coll_param_gpu& kernel_params);
+};
+
+template <ccl::device_topology_type class_id, class session_invoke_params>
+struct scale_out_session : public scale_out_session_iface {
+    using base_t = scale_out_session_iface;
+    using invoke_params_t = session_invoke_params;
+    using session_key_t = session_key;
+
+    scale_out_session(producer_description& in_param,
+                      const coll_param_gpu& in_kernel_params,
+                      size_t observer_domain_index,
+                      size_t observer_domain_count,
+                      const session_key_t& key)
+            : base_t(),
+              proxy_session(in_param,
+                            in_kernel_params,
+                            observer_domain_index,
+                            observer_domain_count,
+                            key) {
+        //TODO use `session_invoke_params` to size the `pending_notifications`
+        // reserve based on the chunk size
+        pending_notifications.reserve(16);
+    }
+
+    context_descr& get_ctx_descr() {
+        return proxy_session.get_ctx_descr();
+    }
+
+    void prepare(size_t observer_domain_index,
+                 size_t observer_domain_count,
+                 void* type_erased_param) override {
+        proxy_session.prepare(observer_domain_index, observer_domain_count, type_erased_param);
+
+        auto* out_param = static_cast<invoke_params_t*>(type_erased_param);
+
+        // allocate cpu gw staging slots
+        pending_notifications.clear();
+
+        (void)out_param;
+    }
+
+    void produce_data(std::shared_ptr<ccl::host_communicator>& comm) override {
+        void* partial_chunk = nullptr;
+        size_t partial_chunk_size = 0;
+
+        // get own device partial chunk data
+        proxy_session.produce_data(&partial_chunk, partial_chunk_size);
+        if (partial_chunk_size > 0) {
+            // notify the scaleout actors in other processes about my partial result
+            session_notification notif(partial_chunk, partial_chunk_size);
+            session_notification_handle handle;
+
+            ccl_worker_adapter::submit_coll_work(
+                comm, notif, handle, proxy_session.get_kernel_params());
+
+            pending_notifications.push_back(std::move(handle));
+        }
+    }
+
+    void consume_data(size_t observer_domain_index,
+                      std::shared_ptr<ccl::host_communicator>& comm) override {
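+        // poll pending host-collective handles in submission order; feed each
+        // completed result into the proxy session for the device-side copy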
+        for (auto it = pending_notifications.begin(); it != pending_notifications.end(); ++it) {
+            if (it->op_handle_ready) { // notice: not thread-safe
+
+                if (it->op_handle.test()) {
+                    proxy_session.consume_data(
+                        observer_domain_index,
+                        it->output_buffer.data(),
+                        it->output_buffer.size() *
+                            ccl::get_datatype_size(
+                                proxy_session.get_kernel_params().get_datatype()));
+
+                    // notice: not thread-safe
+                    it->op_handle_ready = false;
+                }
+                else {
+                    //TODO collectives on the CPU side are processed sequentially:
+                    // if this handle is not finished yet, skip the following handles
+                    break;
+                }
+            }
+        }
+    }
+
+    bool is_consumed() noexcept override {
+        return proxy_session.is_consumed();
+    }
+
+    bool is_produced() noexcept override {
+        return proxy_session.is_produced();
+    }
+
+private:
+    void notify_data();
+    numa_session<class_id, invoke_params_t> proxy_session;
+    std::vector<session_notification_handle> pending_notifications;
+};
+} // namespace observer
+} // namespace native
diff --git a/src/common/comm/l0/context/scaling_ctx/scale_up_ctx.hpp b/src/common/comm/l0/context/scale/scale_up/scale_up_ctx.hpp
similarity index 100%
rename from src/common/comm/l0/context/scaling_ctx/scale_up_ctx.hpp
rename to src/common/comm/l0/context/scale/scale_up/scale_up_ctx.hpp
diff --git a/src/common/comm/l0/context/scaling_ctx/scale_up_ctx_impl.hpp b/src/common/comm/l0/context/scale/scale_up/scale_up_ctx_impl.hpp
similarity index 97%
rename from src/common/comm/l0/context/scaling_ctx/scale_up_ctx_impl.hpp
rename to src/common/comm/l0/context/scale/scale_up/scale_up_ctx_impl.hpp
index ba0486216..096aa722e 100644
--- a/src/common/comm/l0/context/scaling_ctx/scale_up_ctx_impl.hpp
+++ b/src/common/comm/l0/context/scale/scale_up/scale_up_ctx_impl.hpp
@@ -14,7 +14,7 @@
  limitations under the License.
 */
 #pragma once
-#include "common/comm/l0/context/scaling_ctx/scale_up_ctx.hpp"
+#include "common/comm/l0/context/scale/scale_up/scale_up_ctx.hpp"
 
 namespace native {
 #define TEMPLATE_DECL_ARG class Impl, ccl::device_topology_type... types
diff --git a/src/common/comm/l0/context/scaling_ctx/scaling_context_dispatcher.hpp b/src/common/comm/l0/context/scale/scaling_context_dispatcher.hpp
similarity index 100%
rename from src/common/comm/l0/context/scaling_ctx/scaling_context_dispatcher.hpp
rename to src/common/comm/l0/context/scale/scaling_context_dispatcher.hpp
diff --git a/src/common/comm/l0/context/scaling_ctx/observer_ctx_session.cpp b/src/common/comm/l0/context/scaling_ctx/observer_ctx_session.cpp
deleted file mode 100644
index a3b035f7b..000000000
--- a/src/common/comm/l0/context/scaling_ctx/observer_ctx_session.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include <sstream>
-
-#include "common/comm/l0/context/scaling_ctx/observer_ctx_session.hpp"
-#include "common/log/log.hpp"
-
-namespace native {
-namespace observer {
-
-session::session()
-        : host_producer_memory(nullptr),
-          host_producer_ready_bytes(nullptr),
-          host_consumed_bytes(0),
-          host_expected_bytes(0),
-
-          device_consumer_total_memory(nullptr),
-          device_consumer_ready_bytes(nullptr),
-          device_produced_bytes(0),
-          copy_immediate_list() {}
-
-std::string session::to_string() const {
-    std::stringstream ss;
-    ss << "sess: " << reinterpret_cast<const void*>(this);
-    return ss.str();
-}
-
-size_t session::get_send_tag() const {
-    return send_tag;
-}
-
-size_t session::produce_data(void** out_chunk, size_t& out_chunk_size) {
-    //read ready flag
-    size_t old_consumed = host_consumed_bytes;
-    int total_produced = *host_producer_ready_bytes;
-
-    size_t to_consume = total_produced - old_consumed;
-    if (to_consume) {
-        //fence
-        LOG_TRACE(to_string(),
-                  " - bytes produced: ",
-                  total_produced,
-                  ", previously bytes consumed: ",
-                  old_consumed);
-        std::atomic_thread_fence(std::memory_order::memory_order_seq_cst);
-
-        // do not read data here!
-        *out_chunk = (static_cast<uint8_t*>(host_producer_memory) + old_consumed);
-
-        //check finalize
-        host_consumed_bytes = to_consume;
-    }
-
-    out_chunk_size = to_consume;
-    return to_consume;
-}
-
-bool session::consume_data(size_t observer_domain_index, void* in_chunk, size_t in_chunk_size) {
-    /* TODO create event
-     * ze_event_handle_t mem_event {};
-     */
-
-    ze_result_t res = zeCommandListAppendMemoryCopy(
-        copy_immediate_list,
-        (static_cast<uint8_t*>(device_consumer_total_memory) + device_produced_bytes),
-        in_chunk,
-        in_chunk_size,
-        /*mem_event*/ nullptr,
-        0,
-        nullptr);
-    if (res != ZE_RESULT_SUCCESS) {
-        throw std::runtime_error(
-            std::string(
-                "cannot append copy NUMA host to device memory for partial result, error: ") +
-            native::to_string(res));
-    }
-    device_produced_bytes += in_chunk_size;
-
-    res = zeCommandListAppendMemoryCopy(copy_immediate_list,
-                                        device_consumer_ready_bytes,
-                                        &device_produced_bytes,
-                                        sizeof(device_produced_bytes),
-                                        nullptr,
-                                        1,
-                                        /*&mem_event*/ nullptr);
-    if (res != ZE_RESULT_SUCCESS) {
-        throw std::runtime_error(
-            std::string("cannot append copy NUMA host to device memory for ready bytes, error: ") +
-            native::to_string(res));
-    }
-    return device_produced_bytes == host_expected_bytes;
-}
-
-size_t session_table::get_unique_tag() {
-    static std::atomic<size_t> tag_counter{ 1 };
-    return tag_counter.fetch_add(1);
-}
-
-std::string session_table::to_string() const {
-    std::stringstream ss;
-    ss << "sessions count: " << sessions.size() << std::endl;
-    for (const auto& val : sessions) {
-        ss << "[" << val.first << ", " << reinterpret_cast<void*>(val.second.get()) << "]\n"
-           << val.second->to_string() << std::endl;
-    }
-    return ss.str();
-}
-} // namespace observer
-} // namespace native
diff --git a/src/common/comm/l0/context/scaling_ctx/observer_ctx_session.hpp b/src/common/comm/l0/context/scaling_ctx/observer_ctx_session.hpp
deleted file mode 100644
index 077186e65..000000000
--- a/src/common/comm/l0/context/scaling_ctx/observer_ctx_session.hpp
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <atomic>
-#include <map>
-#include <memory>
-#include "common/comm/l0/context/scaling_ctx/observer_session_key.hpp"
-#include "common/comm/l0/modules/supported_modules.hpp"
-
-namespace native {
-namespace observer {
-
-/* Low levels session
- * contains raw data for net operations
- */
-class session {
-public:
-    session();
-    virtual ~session() = default;
-
-    virtual void prepare(size_t observer_domain_index,
-                         size_t observer_domain_count,
-                         void* type_erased_param) = 0;
-
-    size_t get_send_tag() const;
-    std::string to_string() const;
-
-    size_t produce_data(void** out_chunk, size_t& out_chunk_size);
-    bool consume_data(size_t observer_domain_index, void* in_chunk, size_t in_chunk_size);
-
-private:
-    size_t send_tag{};
-
-    // low level data
-    void* host_producer_memory;
-    counter_type* host_producer_ready_bytes;
-    size_t host_consumed_bytes;
-    size_t host_expected_bytes;
-
-    void* device_consumer_total_memory;
-    counter_type* device_consumer_ready_bytes;
-    size_t device_produced_bytes;
-
-    ze_command_list_handle_t copy_immediate_list;
-};
-
-struct session_notification {
-    session_notification(void* addr, size_t size_bytes)
-            : host_src_ptr(addr),
-              src_size_bytes(size_bytes) {}
-    void* host_src_ptr;
-    size_t src_size_bytes;
-};
-
-using shared_session_ptr = std::shared_ptr<session>;
-
-/* High level session
- * Contains collective communication data
- */
-template <ccl_coll_type coll_type, class kernel_params, ccl::device_topology_type class_id>
-struct typed_session : public session {
-    typed_session(producer_description& in_param,
-                  size_t observer_domain_index,
-                  size_t observer_domain_count) {
-        params.init(in_param.staged_buffer_elem_count,
-                    observer_domain_index,
-                    observer_domain_count,
-                    in_param.context,
-                    in_param.device);
-    }
-
-    const context_description<coll_type, typename kernel_params::native_type>&
-    get_context_description() const {
-        return params;
-    }
-
-    void prepare(size_t observer_domain_index,
-                 size_t observer_domain_count,
-                 void* type_erased_param) override {
-        auto* out_param = static_cast<invoke_params<coll_type, kernel_params>*>(type_erased_param);
-        params.reset_staged_counters(observer_domain_index, observer_domain_count);
-
-        out_param->set_out_params(params);
-    }
-
-private:
-    context_description<coll_type, typename kernel_params::native_type> params;
-};
-
-// session owner
-// TODO not thread-safe
-struct session_table {
-    using session_key_t = session_key;
-
-    template <ccl::device_topology_type class_id, class invoke_params_type>
-    std::shared_ptr<session> create_session(const session_key_t& key,
-                                            invoke_params_type& params,
-                                            size_t observer_domain_index,
-                                            size_t observer_domain_count) {
-        using specific_session = typed_session<invoke_params_type::get_coll_type(),
-                                               typename invoke_params_type::kernel_params_t,
-                                               class_id>;
-        auto sess = std::make_shared<specific_session>(
-            params.get_producer_params(), observer_domain_index, observer_domain_count);
-
-        params.set_out_params(sess->get_context_description());
-        sessions.emplace(key, sess);
-
-        return sess;
-    }
-
-    std::string to_string() const;
-    std::map<session_key_t, shared_session_ptr> sessions{};
-
-    static size_t get_unique_tag();
-};
-
-using shared_session_table_ptr = std::shared_ptr<session_table>;
-} // namespace observer
-} // namespace native
diff --git a/src/common/comm/l0/context/scaling_ctx/observer_session_key.hpp b/src/common/comm/l0/context/scaling_ctx/observer_session_key.hpp
deleted file mode 100644
index ed0b366ce..000000000
--- a/src/common/comm/l0/context/scaling_ctx/observer_session_key.hpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <functional>
-#include <string>
-#include <vector>
-
-#include "oneapi/ccl/native_device_api/l0/device.hpp"
-#include "oneapi/ccl/native_device_api/l0/context.hpp"
-#include "coll/algorithms/algorithms_enum.hpp"
-
-namespace native {
-namespace observer {
-using counter_type = uint64_t;
-struct producer_description {
-    size_t world_rank;
-    size_t world_size;
-    counter_type staged_buffer_elem_count;
-
-    std::shared_ptr<ccl_context> context;
-    ccl_device& device;
-    ccl_device::device_cmd_list immediate_list; //TODO make persisten
-};
-
-//TODO looks like these structure is specific for allreduce only
-template <ccl_coll_type type, class native_data_type>
-struct context_description {
-    // produced by kernel
-    ccl_context::host_memory_ptr<native_data_type> numa_staged_memory;
-    ccl_context::host_memory_ptr<counter_type> staged_memory_size_counter;
-
-    // consumed by kernel
-    // (TODO consider using 'recv_buff' from collective entry)
-    // to reduce copy iterations
-    ccl_device::device_memory_ptr<counter_type> producer_aggregated_memory_offset;
-
-    ccl_device::device_memory_ptr<native_data_type> total_producers_aggregated_memory;
-    ccl_device::device_memory_ptr<counter_type> total_producers_aggregated_size_counter;
-
-    void init(size_t staged_buffer_elem_count,
-              size_t observer_domain_index,
-              size_t observer_domain_count,
-              std::shared_ptr<ccl_context>& context,
-              ccl_device& device) {
-        // create staged mem in host context
-        ze_host_mem_alloc_desc_t host_descr = ccl_context::get_default_host_alloc_desc();
-        host_descr.flags = ZE_HOST_MEM_ALLOC_FLAG_BIAS_UNCACHED;
-
-        numa_staged_memory = context->template alloc_memory<native_data_type>(
-            staged_buffer_elem_count,
-            /*TODO use page size*/ sizeof(native_data_type),
-            host_descr);
-
-        // create staged mem counter in host context
-        staged_memory_size_counter = context->template alloc_memory<counter_type>(
-            1, /*TODO use page size*/ sizeof(counter_type), host_descr);
-
-        ze_device_mem_alloc_desc_t mem_descr = ccl_device::get_default_mem_alloc_desc();
-
-        // create total aggregated memory in device context
-        mem_descr.flags = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_UNCACHED;
-        total_producers_aggregated_memory = device.template alloc_memory_ptr<native_data_type>(
-            staged_buffer_elem_count * observer_domain_count,
-            sizeof(native_data_type),
-            context,
-            mem_descr);
-
-        // create offset in device context
-        mem_descr.flags = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED;
-        producer_aggregated_memory_offset = device.template alloc_memory_ptr<counter_type>(
-            1, sizeof(counter_type), context, mem_descr);
-
-        // create aggregated counter in device context
-        total_producers_aggregated_size_counter = device.template alloc_memory_ptr<counter_type>(
-            1, sizeof(counter_type), context, mem_descr);
-
-        // init values
-        reset_staged_counters(observer_domain_index, observer_domain_count);
-    }
-
-    void reset_staged_counters(size_t observer_domain_index, size_t observer_domain_count) {
-        counter_type filled_counter_value = 0;
-        staged_memory_size_counter->enqueue_write_sync(&filled_counter_value, 1);
-
-        filled_counter_value = observer_domain_index * numa_staged_memory->count();
-        ;
-        producer_aggregated_memory_offset->enqueue_write_sync(&filled_counter_value, 1);
-
-        filled_counter_value = 0;
-        total_producers_aggregated_size_counter->enqueue_write_sync(&filled_counter_value, 1);
-    }
-};
-
-template <ccl_coll_type type, class kernel_params>
-struct invoke_params {
-    using kernel_params_t = kernel_params;
-
-    static constexpr ccl_coll_type get_coll_type() {
-        return type;
-    }
-
-    invoke_params(producer_description&& in)
-            : in_params(std::move(in)),
-              out_params(),
-              valid(false) {}
-
-    void set_out_params(
-        const context_description<type, typename kernel_params_t::native_type>& src) {
-        out_params = src;
-        valid = true;
-    }
-
-    bool is_valid() const {
-        return valid;
-    }
-
-    const producer_description& get_producer_params() const {
-        return in_params;
-    }
-
-    producer_description& get_producer_params() {
-        return in_params;
-    }
-
-    const context_description<type, typename kernel_params_t::native_type>& get_ctx_params() const {
-        if (!is_valid()) {
-            throw std::runtime_error("observer invocation params are not ready");
-        }
-        return out_params;
-    }
-
-private:
-    producer_description in_params;
-    context_description<type, typename kernel_params_t::native_type> out_params;
-    bool valid;
-};
-
-struct session_key {
-    using hash_core_t = size_t;
-
-    friend std::ostream& operator<<(std::ostream& out, const session_key& key) {
-        out << key.to_string();
-        return out;
-    }
-
-    template <class T>
-    session_key(const T* src) : hash(std::hash<const T*>{}(src)) {}
-
-    bool operator<(const session_key& other) const noexcept;
-
-    std::string to_string() const;
-
-private:
-    hash_core_t hash;
-};
-} // namespace observer
-} // namespace native
diff --git a/src/common/comm/l0/context/scaling_ctx/scale_out_ctx.hpp b/src/common/comm/l0/context/scaling_ctx/scale_out_ctx.hpp
deleted file mode 100644
index 89cc9852f..000000000
--- a/src/common/comm/l0/context/scaling_ctx/scale_out_ctx.hpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/context/base_scaling_ctx.hpp"
-
-namespace native {
-
-class ccl_gpu_comm;
-class ccl_virtual_gpu_comm;
-
-template <class device>
-class ccl_scaleout_proxy;
-
-template <class device>
-class ccl_gpu_scaleup_proxy;
-
-template <class device>
-class ccl_numa_proxy;
-
-template <class Impl, ccl::device_topology_type... types>
-class scale_out_ctx
-        : public observer::base_scaling_ctx<
-              scale_out_ctx<Impl, types...>,
-              ccl_scaleout_proxy<ccl_gpu_comm>,
-              ccl_scaleout_proxy<ccl_virtual_gpu_comm>,
-              ccl_scaleout_proxy<ccl_numa_proxy<ccl_gpu_comm>>,
-              ccl_scaleout_proxy<ccl_numa_proxy<ccl_virtual_gpu_comm>>,
-              ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_gpu_comm>>,
-              ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_virtual_gpu_comm>>,
-              ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_gpu_comm>>>,
-              ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_virtual_gpu_comm>>>> {
-public:
-    using context_impl = Impl;
-
-    template <class device_t>
-    using observer_t = ccl_scaleout_proxy<device_t>;
-
-    using scaling_ctx_base_t = observer::base_scaling_ctx<
-        scale_out_ctx<Impl, types...>,
-        observer_t<ccl_gpu_comm>,
-        observer_t<ccl_virtual_gpu_comm>,
-        observer_t<ccl_numa_proxy<ccl_gpu_comm>>,
-        observer_t<ccl_numa_proxy<ccl_virtual_gpu_comm>>,
-        observer_t<ccl_gpu_scaleup_proxy<ccl_gpu_comm>>,
-        observer_t<ccl_gpu_scaleup_proxy<ccl_virtual_gpu_comm>>,
-        observer_t<ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_gpu_comm>>>,
-        observer_t<ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_virtual_gpu_comm>>>>;
-
-    using observable_scale_up_topologies =
-        typename scaling_ctx_base_t::template observable_topologies<types...>;
-
-    observable_scale_up_topologies observables;
-
-    //observer subject interface implementations
-    template <class device_t, ccl::device_topology_type topology_type>
-    void attach_ctx_observer(size_t rank_addr,
-                             observer_t<device_t>* observer_ptr,
-                             std::integral_constant<ccl::device_topology_type, topology_type> val) {
-        register_observer_impl<topology_type>(rank_addr, observer_ptr);
-    }
-
-    template <class device_t, ccl::device_topology_type class_id, class invoke_params_t>
-    void invoke_ctx_observer(observer_t<device_t>* observer_ptr,
-                             std::integral_constant<ccl::device_topology_type, class_id> val,
-                             const observer::session_key& sess_key,
-                             invoke_params_t& param) {
-        throw std::runtime_error("SCALE_OUT invoke is not implemented yet");
-    }
-
-private:
-    template <ccl::device_topology_type topology_type, class device_t>
-    void register_observer_impl(size_t rank_addr, observer_t<device_t>* observer_ptr); /*
-    {
-        auto &topologu_specific_observers = std::get<topology_index>(observables);
-        container_t<device_t>& container = std::get<device_t::type_idx()>(topologu_specific_observers);
-        container.insert(observer);
-    }*/
-};
-} // namespace native
diff --git a/src/common/comm/l0/context/scaling_ctx/scale_out_ctx_impl.hpp b/src/common/comm/l0/context/scaling_ctx/scale_out_ctx_impl.hpp
deleted file mode 100644
index 9d1a94873..000000000
--- a/src/common/comm/l0/context/scaling_ctx/scale_out_ctx_impl.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/context/scaling_ctx/scale_out_ctx.hpp"
-#include "common/log/log.hpp"
-
-namespace native {
-
-#define TEMPLATE_DECL_ARG class Impl, ccl::device_topology_type... types
-#define TEMPLATE_DEF_ARG  Impl, types...
-
-// observer_ptr interface implementations
-template <TEMPLATE_DECL_ARG>
-template <ccl::device_topology_type topology_type, class device_t>
-void scale_out_ctx<TEMPLATE_DEF_ARG>::register_observer_impl(size_t rank_addr,
-                                                             observer_t<device_t>* observer_ptr) {
-    observer::container_t<observer_t<device_t>>& container =
-        scaling_ctx_base_t::template get_types_container<observer_t<device_t>, topology_type>(
-            observables);
-    container.insert(observer_ptr);
-}
-
-#undef TEMPLATE_DECL_ARG
-#undef TEMPLATE_DEF_ARG
-} // namespace native
diff --git a/src/common/comm/l0/context/thread_group_ctx.cpp b/src/common/comm/l0/context/thread_group_ctx.cpp
index 78a76411e..a5e7aa16a 100644
--- a/src/common/comm/l0/context/thread_group_ctx.cpp
+++ b/src/common/comm/l0/context/thread_group_ctx.cpp
@@ -19,7 +19,7 @@
 #include "common/comm/l0/context/device_storage.hpp"
 
 #include "common/comm/l0/scheduler/thread_group_scheduler.hpp"
-#include "common/comm/l0/context/scaling_ctx/numa_ctx_impl.hpp"
+#include "common/comm/l0/context/scale/numa/numa_ctx_impl.hpp"
 
 namespace native {
 
diff --git a/src/common/comm/l0/context/thread_group_ctx.hpp b/src/common/comm/l0/context/thread_group_ctx.hpp
index 4cfd0acf7..3e80a235f 100644
--- a/src/common/comm/l0/context/thread_group_ctx.hpp
+++ b/src/common/comm/l0/context/thread_group_ctx.hpp
@@ -17,7 +17,7 @@
 #include "common/comm/l0/context/device_group_ctx.hpp"
 #include "common/log/log.hpp"
 
-#include "common/comm/l0/context/scaling_ctx/numa_ctx.hpp"
+#include "common/comm/l0/context/scale/numa/numa_ctx.hpp"
 
 namespace native {
 struct device_storage;
diff --git a/src/common/comm/l0/devices/ccl_concurrent_gpu_comm.hpp b/src/common/comm/l0/devices/ccl_concurrent_gpu_comm.hpp
index c8a647869..d00a5dde4 100644
--- a/src/common/comm/l0/devices/ccl_concurrent_gpu_comm.hpp
+++ b/src/common/comm/l0/devices/ccl_concurrent_gpu_comm.hpp
@@ -41,12 +41,8 @@ class ccl_thread_comm : public ccl_gpu_base_comm<ccl_thread_comm<device_t>,
     template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
     using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class;
 
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group,
-              ccl::device_topology_type mode,
-              class kernel_params>
-    using gpu_kernel_t =
-        typename kernel_class_t<algo_type, group, mode>::template kernel_t<kernel_params>;
+    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
+    using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t;
 
     static constexpr const char* name_impl() {
         return "CONCURRENT_GPU";
@@ -73,11 +69,10 @@ class ccl_thread_comm : public ccl_gpu_base_comm<ccl_thread_comm<device_t>,
 
     template <ccl_coll_type module_type,
               ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class kernel_params>
-    gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() {
-        return next_thread_gpu_comm
-            .template get_gpu_kernel<module_type, group_id, class_id, kernel_params>();
+              ccl::device_topology_type class_id>
+    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
+        return next_thread_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>(
+            params);
     }
 
     device_t& get_impl_device() {
diff --git a/src/common/comm/l0/devices/ccl_gpu_comm.hpp b/src/common/comm/l0/devices/ccl_gpu_comm.hpp
index 0cdfae477..e201db630 100644
--- a/src/common/comm/l0/devices/ccl_gpu_comm.hpp
+++ b/src/common/comm/l0/devices/ccl_gpu_comm.hpp
@@ -91,12 +91,8 @@ class ccl_gpu_comm : public ccl_gpu_base_comm<ccl_gpu_comm, gpu_types::REAL_GPU>
     template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
     using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class;
 
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group,
-              ccl::device_topology_type mode,
-              class kernel_params>
-    using gpu_kernel_t =
-        typename kernel_class_t<algo_type, group, mode>::template kernel_t<kernel_params>;
+    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
+    using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t;
 
     using supported_modules = supported_device_modules<gpu_module_t>;
 
@@ -128,27 +124,33 @@ class ccl_gpu_comm : public ccl_gpu_base_comm<ccl_gpu_comm, gpu_types::REAL_GPU>
 
     std::string to_string_impl() const;
 
+    // template <ccl_coll_type module_type,
+    //           ccl::group_split_type group_id,
+    //           ccl::device_topology_type class_id,
+    //           class kernel_params>
+    // gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel() {
+    //     auto& ptr = get_gpu_module<module_type, group_id, class_id>();
+
+    //     using requested_class = kernel_class_t<module_type, group_id, class_id>;
+    //     return ptr.template get_class<requested_class>().template get<kernel_params>();
+    // }
+
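+    // note: the kernel is now selected at runtime via coll_param_gpu
+    // (datatype/reduction), replacing the former compile-time kernel_params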
     template <ccl_coll_type module_type,
               ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class kernel_params>
-    gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() {
+              ccl::device_topology_type class_id>
+    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
         auto& ptr = get_gpu_module<module_type, group_id, class_id>();
 
         using requested_class = kernel_class_t<module_type, group_id, class_id>;
-        return ptr.template get_class<requested_class>().template get<kernel_params>();
+        return ptr.template get_class<requested_class>().get(params);
     }
 
-    template <class kernel_params,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class gpu_entry>
-    gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>& register_entry(
-        gpu_entry& entry) {
+    template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class gpu_entry>
+    gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) {
         const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
 
         LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string());
-        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id, kernel_params>();
+        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params());
         main_func.set_rank(comm_addr.rank);
         main_func.set_size(comm_addr.size); //threads count!!!
         return main_func;
diff --git a/src/common/comm/l0/devices/ccl_gpu_scaleup_proxy.hpp b/src/common/comm/l0/devices/ccl_gpu_scaleup_proxy.hpp
index 7e221846f..10fd7b51f 100644
--- a/src/common/comm/l0/devices/ccl_gpu_scaleup_proxy.hpp
+++ b/src/common/comm/l0/devices/ccl_gpu_scaleup_proxy.hpp
@@ -47,12 +47,8 @@ class ccl_gpu_scaleup_proxy
     template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
     using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class;
 
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group,
-              ccl::device_topology_type mode,
-              class kernel_params>
-    using gpu_kernel_t =
-        typename kernel_class_t<algo_type, group, mode>::template kernel_t<kernel_params>;
+    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
+    using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t;
 
     //using ctx_ptr = std::weak_ptr<scale_up_ctx_t>;
 
@@ -76,13 +72,11 @@ class ccl_gpu_scaleup_proxy
 
     template <ccl_coll_type module_type,
               ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class kernel_params>
-    gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() {
+              ccl::device_topology_type class_id>
+    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
         this->template invoke<group_id, class_id>();
 
-        return wrapped_gpu_comm
-            .template get_gpu_kernel<module_type, group_id, class_id, kernel_params>();
+        return wrapped_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>(params);
     }
 
     template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
@@ -90,17 +84,15 @@ class ccl_gpu_scaleup_proxy
         return wrapped_gpu_comm.template get_comm_data<group_id, class_id>();
     }
 
-    template <class kernel_params,
-              ccl::group_split_type group_id,
+    template <ccl::group_split_type group_id,
               ccl::device_topology_type class_id,
               class gpu_entry,
               class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
-    gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>& register_entry(
-        gpu_entry& entry) {
+    gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) {
         const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
         LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string());
 
-        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id, kernel_params>();
+        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params());
         main_func.set_rank(comm_addr.rank);
         main_func.set_size(comm_addr.size);
         return main_func;
@@ -139,12 +131,8 @@ class ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>
     template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
     using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class;
 
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group,
-              ccl::device_topology_type mode,
-              class kernel_params>
-    using gpu_kernel_t =
-        typename kernel_class_t<algo_type, group, mode>::template kernel_t<kernel_params>;
+    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
+    using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t;
 
     //using ctx_ptr = std::weak_ptr<scale_up_ctx_t>;
     using device_impl_t = ccl_numa_proxy<device_t>;
@@ -174,25 +162,21 @@ class ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>
 
     template <ccl_coll_type module_type,
               ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class kernel_params>
-    gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() {
+              ccl::device_topology_type class_id>
+    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
         this->template invoke<group_id>();
-        return wrapped_gpu_comm
-            .template get_gpu_kernel<module_type, group_id, class_id, kernel_params>();
+        return wrapped_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>(params);
     }
 
-    template <class kernel_params,
-              ccl::group_split_type group_id,
+    template <ccl::group_split_type group_id,
               ccl::device_topology_type class_id,
               class gpu_entry,
               class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
-    gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>& register_entry(
-        gpu_entry& entry) {
+    gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) {
         const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
         LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string());
 
-        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id, kernel_params>();
+        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params());
         main_func.set_rank(comm_addr.rank);
         main_func.set_size(comm_addr.size);
         return main_func;
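
Note: the hunks above drop the compile-time kernel_params template argument in favor of a runtime coll_param_gpu value passed into get_gpu_kernel(). A minimal, self-contained C++ sketch of that lookup pattern follows; every type below is a stand-in for illustration, not the real oneCCL class:

    #include <iostream>
    #include <map>
    #include <tuple>

    enum class datatype { int8, float32 };
    enum class reduction { sum, max };

    // stand-in for coll_param_gpu: a runtime description of the collective
    struct coll_param {
        datatype dt;
        reduction red;
        bool operator<(const coll_param& other) const {
            return std::tie(dt, red) < std::tie(other.dt, other.red);
        }
    };

    struct kernel {
        int rank = -1;
        int size = 0;
        void set_rank(int r) { rank = r; }
        void set_size(int s) { size = s; }
    };

    // stand-in for a kernel class: one kernel per runtime parameter set,
    // instead of one per kernel_params template instantiation
    struct kernel_class {
        std::map<coll_param, kernel> kernels;
        kernel& get(const coll_param& p) { return kernels[p]; }
    };

    int main() {
        kernel_class cls;
        coll_param p{ datatype::float32, reduction::sum };
        kernel& k = cls.get(p); // was: cls.template get<kernel_params>()
        k.set_rank(0);
        k.set_size(4);
        std::cout << k.rank << "/" << k.size << "\n";
    }
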
diff --git a/src/common/comm/l0/devices/ccl_ipc_gpu_comm.hpp b/src/common/comm/l0/devices/ccl_ipc_gpu_comm.hpp
index 5e9be2de8..10153d131 100644
--- a/src/common/comm/l0/devices/ccl_ipc_gpu_comm.hpp
+++ b/src/common/comm/l0/devices/ccl_ipc_gpu_comm.hpp
@@ -50,12 +50,8 @@ class ccl_ipc_gpu_comm : public ccl_gpu_base_comm<ccl_ipc_gpu_comm, gpu_types::I
     template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
     using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class;
 
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group,
-              ccl::device_topology_type mode,
-              class kernel_params>
-    using gpu_kernel_t =
-        typename kernel_class_t<algo_type, group, mode>::template kernel_t<kernel_params>;
+    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
+    using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t;
 
     using supported_modules = supported_device_modules<gpu_module_t>;
 
@@ -74,16 +70,15 @@ class ccl_ipc_gpu_comm : public ccl_gpu_base_comm<ccl_ipc_gpu_comm, gpu_types::I
 
     template <ccl_coll_type module_type,
               ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class kernel_params>
-    gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() {
+              ccl::device_topology_type class_id>
+    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
         auto& ptr =
             base::template get_gpu_module_unsafe<module_type, group_id, class_id, gpu_module_t>(
                 registered_modules);
         assert(ptr);
 
         using requested_class = kernel_class_t<module_type, group_id, class_id>;
-        return ptr->template get_class<requested_class>().template get<kernel_params>();
+        return ptr->template get_class<requested_class>().get(params);
     }
 
     template <ccl_coll_type module_type,
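
The proxy layers all forward through wrapped_gpu_comm.template get_gpu_kernel<...>(params). A small compilable reminder, with hypothetical names, of why the ".template" keyword is required at those call sites:

    #include <iostream>

    struct comm {
        template <int id>
        int& get_kernel(int /*params*/) {
            static int k = id;
            return k;
        }
    };

    template <class wrapped_t>
    struct proxy {
        wrapped_t wrapped;

        template <int id>
        int& get_kernel(int params) {
            // "wrapped" has a dependent type, so the member-template call
            // needs the ".template" keyword, exactly as in the hunks above
            return wrapped.template get_kernel<id>(params);
        }
    };

    int main() {
        proxy<comm> p;
        std::cout << p.get_kernel<7>(0) << "\n"; // prints 7
    }
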
diff --git a/src/common/comm/l0/devices/ccl_ipc_source_gpu_comm.hpp b/src/common/comm/l0/devices/ccl_ipc_source_gpu_comm.hpp
index 46b73f1a4..cc39a0084 100644
--- a/src/common/comm/l0/devices/ccl_ipc_source_gpu_comm.hpp
+++ b/src/common/comm/l0/devices/ccl_ipc_source_gpu_comm.hpp
@@ -22,7 +22,7 @@
 
 #include "common/comm/l0/devices/ccl_gpu_base_comm.hpp"
 #include "common/comm/l0/devices/proxy_observer_types.hpp"
-#include "common/comm/l0/context/scaling_ctx/ipc_session_key.hpp"
+#include "common/comm/l0/context/scale/ipc/ipc_session_key.hpp"
 
 #include "common/comm/l0/devices/communication_structs/ipc_client.hpp"
 namespace native {
@@ -53,12 +53,8 @@ class ccl_ipc_source_gpu_comm : public ccl_gpu_base_comm<ccl_ipc_source_gpu_comm
     template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
     using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class;
 
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group,
-              ccl::device_topology_type mode,
-              class kernel_params>
-    using gpu_kernel_t =
-        typename kernel_class_t<algo_type, group, mode>::template kernel_t<kernel_params>;
+    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
+    using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t;
 
     static constexpr const char* name_impl() {
         return "SOURCE_IPC_GPU";
@@ -135,30 +131,24 @@ class ccl_ipc_source_gpu_comm : public ccl_gpu_base_comm<ccl_ipc_source_gpu_comm
 */
     template <ccl_coll_type module_type,
               ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class kernel_params>
-    gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() {
-        return inprocess_gpu_comm
-            .template get_gpu_kernel<module_type, group_id, class_id, kernel_params>();
+              ccl::device_topology_type class_id>
+    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
+        return inprocess_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>(params);
     }
 
-    template <class kernel_params,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class gpu_entry>
-    gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>& register_entry(
-        gpu_entry& entry) {
+    template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class gpu_entry>
+    gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) {
         static_assert(group_id == ccl::group_split_type::cluster,
                       "ccl_ipc_source_gpu_comm available for ccl::group_split_type::cluster only");
         const topology_addr<group_id, class_id>& comm_addr =
-            base::template get_comm_data<group_id, class_id>();
+            inprocess_gpu_comm.template get_comm_data<group_id, class_id>();
         LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string());
 
-        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id, kernel_params>();
+        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params());
         main_func.set_rank(comm_addr.rank);
         main_func.set_size(comm_addr.size);
 
-        ipc_invoke_params<gpu_entry::type(), kernel_params> params(entry.get_ipc_data());
+        ipc_invoke_params<gpu_entry::type()> params(entry.get_ipc_data(), entry.get_params());
         this->template invoke<group_id, class_id>(entry.get_ipc_session_key(), std::move(params));
 
         return main_func;
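
ipc_invoke_params loses its kernel_params type argument above and instead carries the entry's runtime params next to the IPC payload. A hypothetical sketch of that shape, assuming a two-field stand-in for coll_param_gpu:

    #include <utility>
    #include <vector>

    // stand-in for coll_param_gpu
    struct coll_param {
        int dtype;
        int red_op;
    };

    // hypothetical shape of the reworked ipc_invoke_params: the runtime
    // params ride along with the ipc data instead of appearing as a
    // second template argument
    template <int coll_type>
    struct ipc_invoke_params {
        std::vector<char> ipc_data;
        coll_param params;

        ipc_invoke_params(std::vector<char> data, coll_param p)
                : ipc_data(std::move(data)),
                  params(p) {}
    };

    int main() {
        ipc_invoke_params<0> p({ 'x' }, coll_param{ 1, 0 });
        return p.params.dtype == 1 ? 0 : 1;
    }
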
diff --git a/src/common/comm/l0/devices/ccl_numa_proxy.hpp b/src/common/comm/l0/devices/ccl_numa_proxy.hpp
index fee81eb87..efd29e93b 100644
--- a/src/common/comm/l0/devices/ccl_numa_proxy.hpp
+++ b/src/common/comm/l0/devices/ccl_numa_proxy.hpp
@@ -22,7 +22,7 @@
 
 #include "common/comm/l0/devices/ccl_gpu_base_comm.hpp"
 #include "common/comm/l0/devices/proxy_observer_types.hpp"
-#include "common/comm/l0/context/scaling_ctx/observer_session_key.hpp"
+#include "common/comm/l0/context/scale/base/base_session.hpp"
 
 namespace native {
 
@@ -46,12 +46,8 @@ class ccl_numa_proxy
     template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
     using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::numa_class;
 
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group,
-              ccl::device_topology_type mode,
-              class kernel_params>
-    using gpu_kernel_t =
-        typename kernel_class_t<algo_type, group, mode>::template kernel_t<kernel_params>;
+    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
+    using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t;
 
     static constexpr const char* name_impl() {
         return "NUMA_PROXY";
@@ -73,21 +69,16 @@ class ccl_numa_proxy
 
     template <ccl_coll_type module_type,
               ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class kernel_params>
-    gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() {
+              ccl::device_topology_type class_id>
+    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
         auto& ptr = wrapped_gpu_comm.template get_gpu_module<module_type, group_id, class_id>();
 
         using requested_class = kernel_class_t<module_type, group_id, class_id>;
-        return ptr.template get_class<requested_class>().template get<kernel_params>();
+        return ptr.template get_class<requested_class>().get(params);
     }
 
-    template <class kernel_params,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class gpu_entry>
-    gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>& register_entry(
-        gpu_entry& entry) {
+    template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class gpu_entry>
+    gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) {
         static_assert(group_id == ccl::group_split_type::cluster,
                       "ccl_numa_proxy available for ccl::group_split_type::cluster only");
 
@@ -95,31 +86,22 @@ class ccl_numa_proxy
             base::template get_comm_data<group_id, class_id>();
         LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string());
 
-        using kernel_func_type = gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>;
+        using kernel_func_type = gpu_kernel_t<gpu_entry::type(), group_id, class_id>;
         kernel_func_type& main_func =
-            get_gpu_kernel<gpu_entry::type(), group_id, class_id, kernel_params>();
+            get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params());
         main_func.set_rank(comm_addr.rank);
         main_func.set_size(comm_addr.size);
 
         // alloc shared data structure to notify host side about device partial result
-        observer::invoke_params<gpu_entry::type(), kernel_params> params = entry.get_numa_data();
+        observer::invoke_params<gpu_entry::type()> params = entry.get_numa_data();
 
         // invoke host-side context creation
         this->template invoke<group_id, class_id>(entry.get_numa_session_key(), params);
 
         // bind shared data to kernel
         const auto& out_ctx_params = params.get_ctx_params();
-        main_func.template set_arg<typename kernel_func_type::event_prod_chunk_mem_arg>(
-            out_ctx_params.numa_staged_memory->get());
-        main_func.template set_arg<typename kernel_func_type::event_prod_bytes_arg>(
-            out_ctx_params.staged_memory_size_counter->get());
-
-        main_func.template set_arg<typename kernel_func_type::event_consumed_bytes_offset_arg>(
-            out_ctx_params.producer_aggregated_memory_offset->get());
-        main_func.template set_arg<typename kernel_func_type::event_consumed_chunk_mem_arg>(
-            out_ctx_params.total_producers_aggregated_memory->get());
-        main_func.template set_arg<typename kernel_func_type::event_consumed_bytes_arg>(
-            out_ctx_params.total_producers_aggregated_size_counter->get());
+
+        main_func.bind_data(out_ctx_params);
 
         return main_func;
     }
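
The five set_arg<...> calls deleted above collapse into a single main_func.bind_data(out_ctx_params). A self-contained sketch of what such a helper could wrap; the member names below are stand-ins, not the real context fields:

    #include <cstddef>

    // stand-in for the context params filled in by invoke()
    struct ctx_params {
        void* staged_mem = nullptr;
        int* staged_size = nullptr;
        int* mem_offset = nullptr;
        void* total_mem = nullptr;
        int* total_size = nullptr;
    };

    struct kernel {
        void* args[5] = {};

        template <std::size_t idx, class T>
        void set_arg(T* value) {
            args[idx] = value;
        }

        // one call at every registration site instead of five set_arg calls
        void bind_data(const ctx_params& p) {
            set_arg<0>(p.staged_mem);
            set_arg<1>(p.staged_size);
            set_arg<2>(p.mem_offset);
            set_arg<3>(p.total_mem);
            set_arg<4>(p.total_size);
        }
    };

    int main() {
        ctx_params params;
        kernel k;
        k.bind_data(params);
    }
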
diff --git a/src/common/comm/l0/devices/ccl_scaleout_proxy.hpp b/src/common/comm/l0/devices/ccl_scaleout_proxy.hpp
index 2cd6ce8f2..e08545b53 100644
--- a/src/common/comm/l0/devices/ccl_scaleout_proxy.hpp
+++ b/src/common/comm/l0/devices/ccl_scaleout_proxy.hpp
@@ -22,7 +22,7 @@
 
 #include "common/comm/l0/devices/ccl_gpu_base_comm.hpp"
 #include "common/comm/l0/devices/proxy_observer_types.hpp"
-#include "common/comm/l0/context/scaling_ctx/observer_session_key.hpp"
+#include "common/comm/l0/context/scale/base/base_session.hpp"
 
 namespace native {
 
@@ -52,12 +52,8 @@ class ccl_scaleout_proxy
     template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
     using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::scale_out_cpu_gw_class;
 
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group,
-              ccl::device_topology_type mode,
-              class kernel_params>
-    using gpu_kernel_t =
-        typename kernel_class_t<algo_type, group, mode>::template kernel_t<kernel_params>;
+    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
+    using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t;
 
     static constexpr const char* name_impl() {
         return "SCALE_OUT_PROXY";
@@ -79,13 +75,12 @@ class ccl_scaleout_proxy
 
     template <ccl_coll_type module_type,
               ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class kernel_params>
-    gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() {
+              ccl::device_topology_type class_id>
+    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
         auto& ptr = wrapped_gpu_comm.template get_gpu_module<module_type, group_id, class_id>();
 
         using requested_class = kernel_class_t<module_type, group_id, class_id>;
-        return ptr.template get_class<requested_class>().template get<kernel_params>();
+        return ptr.template get_class<requested_class>().get(params);
     }
 
     template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
@@ -93,43 +88,32 @@ class ccl_scaleout_proxy
         return wrapped_gpu_comm.template get_comm_data<group_id, class_id>();
     }
 
-    template <class kernel_params,
-              ccl::group_split_type group_id,
+    template <ccl::group_split_type group_id,
               ccl::device_topology_type class_id,
               class gpu_entry,
               class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
-    gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>& register_entry(
-        gpu_entry& entry) {
+    gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) {
         const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
         LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string());
 
-        using kernel_func_type = gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>;
+        using kernel_func_type = gpu_kernel_t<gpu_entry::type(), group_id, class_id>;
 
         kernel_func_type& main_func =
-            get_gpu_kernel<gpu_entry::type(), group_id, class_id, kernel_params>();
+            get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params());
 
         main_func.set_rank(comm_addr.rank);
         main_func.set_size(comm_addr.size);
 
         // alloc shared data structure to notify host side about device partial result
-        observer::invoke_params<gpu_entry::type(), kernel_params> params = entry.get_numa_data();
+        observer::invoke_params<gpu_entry::type()> params = entry.get_scaleout_data();
 
         // invoke host-side context creation
-        this->template invoke<group_id, class_id>(entry.get_numa_session_key(), params);
+        this->template invoke<group_id, class_id>(entry.get_scaleout_session_key(), params);
 
         // bind shared data to kernel
         const auto& out_ctx_params = params.get_ctx_params();
-        main_func.template set_arg<typename kernel_func_type::event_prod_chunk_mem_arg>(
-            out_ctx_params.numa_staged_memory->get());
-        main_func.template set_arg<typename kernel_func_type::event_prod_bytes_arg>(
-            out_ctx_params.staged_memory_size_counter->get());
-
-        main_func.template set_arg<typename kernel_func_type::event_consumed_bytes_offset_arg>(
-            out_ctx_params.producer_aggregated_memory_offset->get());
-        main_func.template set_arg<typename kernel_func_type::event_consumed_chunk_mem_arg>(
-            out_ctx_params.total_producers_aggregated_memory->get());
-        main_func.template set_arg<typename kernel_func_type::event_consumed_bytes_arg>(
-            out_ctx_params.total_producers_aggregated_size_counter->get());
+
+        main_func.bind_data(out_ctx_params);
 
         return main_func;
     }
@@ -168,10 +152,9 @@ class ccl_scaleout_proxy<ccl_numa_proxy<device_t>>
 
     template <ccl_coll_type algo_type,
               ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class kernel_params>
-    using gpu_kernel_t = typename gpu_module_t<algo_type, group_id, class_id>::
-        scale_out_cpu_gw_class::template kernel_t<kernel_params>;
+              ccl::device_topology_type class_id>
+    using gpu_kernel_t =
+        typename gpu_module_t<algo_type, group_id, class_id>::scale_out_cpu_gw_class::kernel_t;
 
     //using ctx_ptr = std::weak_ptr<scale_up_ctx_t>;
     using device_impl_t = ccl_numa_proxy<device_t>;
@@ -196,13 +179,11 @@ class ccl_scaleout_proxy<ccl_numa_proxy<device_t>>
 
     template <ccl_coll_type module_type,
               ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class kernel_params>
-    gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() {
+              ccl::device_topology_type class_id>
+    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
         this->template invoke<group_id>();
 
-        return wrapped_gpu_comm
-            .template get_gpu_kernel<module_type, group_id, class_id, kernel_params>();
+        return wrapped_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>(params);
     }
 
     template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
@@ -210,17 +191,15 @@ class ccl_scaleout_proxy<ccl_numa_proxy<device_t>>
         return wrapped_gpu_comm.template get_comm_data<group_id, class_id>();
     }
 
-    template <class kernel_params,
-              ccl::group_split_type group_id,
+    template <ccl::group_split_type group_id,
               ccl::device_topology_type class_id,
               class gpu_entry,
               class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
-    gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>& register_entry(
-        gpu_entry& entry) {
+    gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) {
         const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
         LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string());
 
-        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id, kernel_params>();
+        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params());
         main_func.set_rank(comm_addr.rank);
         main_func.set_size(comm_addr.size);
         return main_func;
@@ -258,10 +237,9 @@ class ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<device_t>>
 
     template <ccl_coll_type algo_type,
               ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class kernel_params>
-    using gpu_kernel_t = typename gpu_module_t<algo_type, group_id, class_id>::
-        scale_out_cpu_gw_class::template kernel_t<kernel_params>;
+              ccl::device_topology_type class_id>
+    using gpu_kernel_t =
+        typename gpu_module_t<algo_type, group_id, class_id>::scale_out_cpu_gw_class::kernel_t;
 
     //using ctx_ptr = std::weak_ptr<scale_up_ctx_t>;
     using device_impl_t = ccl_gpu_scaleup_proxy<device_t>;
@@ -286,13 +264,11 @@ class ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<device_t>>
 
     template <ccl_coll_type module_type,
               ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class kernel_params>
-    gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() {
+              ccl::device_topology_type class_id>
+    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
         this->template invoke<group_id>();
 
-        return wrapped_gpu_comm
-            .template get_gpu_kernel<module_type, group_id, class_id, kernel_params>();
+        return wrapped_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>(params);
     }
 
     template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
@@ -300,17 +276,15 @@ class ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<device_t>>
         return wrapped_gpu_comm.template get_comm_data<group_id, class_id>();
     }
 
-    template <class kernel_params,
-              ccl::group_split_type group_id,
+    template <ccl::group_split_type group_id,
               ccl::device_topology_type class_id,
               class gpu_entry,
               class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
-    gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>& register_entry(
-        gpu_entry& entry) {
+    gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) {
         const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
         LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string());
 
-        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id, kernel_params>();
+        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params());
         main_func.set_rank(comm_addr.rank);
         main_func.set_size(comm_addr.size);
         return main_func;
@@ -346,13 +320,9 @@ class ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>>
     using gpu_module_t =
         typename device_t::template gpu_module_t<algo_type, group, mode>; //same as in-process GPU
 
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group,
-              ccl::device_topology_type mode,
-              class kernel_params>
+    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
     using gpu_kernel_t =
-        typename gpu_module_t<algo_type, group, mode>::scale_out_cpu_gw_class::template kernel_t<
-            kernel_params>;
+        typename gpu_module_t<algo_type, group, mode>::scale_out_cpu_gw_class::kernel_t;
 
     //using ctx_ptr = std::weak_ptr<scale_up_ctx_t>;
     using device_impl_t = ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>;
@@ -377,13 +347,11 @@ class ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>>
 
     template <ccl_coll_type module_type,
               ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class kernel_params>
-    gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() {
+              ccl::device_topology_type class_id>
+    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
         this->template invoke<group_id>();
 
-        return wrapped_gpu_comm
-            .template get_gpu_kernel<module_type, group_id, class_id, kernel_params>();
+        return wrapped_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>(params);
     }
 
     template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
@@ -391,17 +359,15 @@ class ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>>
         return wrapped_gpu_comm.template get_comm_data<group_id, class_id>();
     }
 
-    template <class kernel_params,
-              ccl::group_split_type group_id,
+    template <ccl::group_split_type group_id,
               ccl::device_topology_type class_id,
               class gpu_entry,
               class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
-    gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>& register_entry(
-        gpu_entry& entry) {
+    gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) {
         const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
         LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string());
 
-        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id, kernel_params>();
+        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params());
         main_func.set_rank(comm_addr.rank);
         main_func.set_size(comm_addr.size);
         return main_func;
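
All cluster-only register_entry overloads above share the same std::enable_if guard on their template parameter list. A compilable reduction of that SFINAE constraint, with stand-in enum values:

    #include <type_traits>

    enum class group_split_type { thread, process, cluster };

    // the overload only exists when instantiated for the cluster split type
    template <group_split_type group,
              class = typename std::enable_if<group == group_split_type::cluster>::type>
    void register_entry() {}

    int main() {
        register_entry<group_split_type::cluster>(); // ok
        // register_entry<group_split_type::process>(); // substitution failure
    }
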
diff --git a/src/common/comm/l0/devices/ccl_virtual_gpu_comm.hpp b/src/common/comm/l0/devices/ccl_virtual_gpu_comm.hpp
index a6cc90ccd..6334b56a4 100644
--- a/src/common/comm/l0/devices/ccl_virtual_gpu_comm.hpp
+++ b/src/common/comm/l0/devices/ccl_virtual_gpu_comm.hpp
@@ -33,12 +33,8 @@ class ccl_virtual_gpu_comm : public ccl_gpu_base_comm<ccl_virtual_gpu_comm, gpu_
     template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
     using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class;
 
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group,
-              ccl::device_topology_type mode,
-              class kernel_params>
-    using gpu_kernel_t =
-        typename kernel_class_t<algo_type, group, mode>::template kernel_t<kernel_params>;
+    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
+    using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t;
 
     using supported_modules = supported_device_modules<gpu_module_t>;
 
@@ -69,25 +65,20 @@ class ccl_virtual_gpu_comm : public ccl_gpu_base_comm<ccl_virtual_gpu_comm, gpu_
 
     template <ccl_coll_type module_type,
               ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class kernel_params>
-    gpu_kernel_t<module_type, group_id, class_id, kernel_params>& get_gpu_kernel() {
+              ccl::device_topology_type class_id>
+    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
         auto& ptr = get_gpu_module<module_type, group_id, class_id>();
 
         using requested_class = kernel_class_t<module_type, group_id, class_id>;
-        return ptr.template get_class<requested_class>().template get<kernel_params>();
+        return ptr.template get_class<requested_class>().get(params);
     }
 
-    template <class kernel_params,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class gpu_entry>
-    gpu_kernel_t<gpu_entry::type(), group_id, class_id, kernel_params>& register_entry(
-        gpu_entry& entry) {
+    template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class gpu_entry>
+    gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) {
         const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
         LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string());
 
-        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id, kernel_params>();
+        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params());
         main_func.set_rank(comm_addr.rank);
         main_func.set_size(comm_addr.size);
         return main_func;
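
Every register_entry variant ends with the same epilogue: fetch the communicator address, then stamp rank and size onto the kernel functor. A stand-alone rendering of that step with stand-in types:

    #include <iostream>
    #include <string>

    // stand-ins for topology_addr and the kernel functor
    struct topology_addr {
        int rank = 2;
        int size = 8;
        std::string to_string() const {
            return std::to_string(rank) + "/" + std::to_string(size);
        }
    };

    struct kernel {
        int rank = -1;
        int size = 0;
        void set_rank(int r) { rank = r; }
        void set_size(int s) { size = s; }
    };

    int main() {
        topology_addr comm_addr;
        kernel main_func;
        main_func.set_rank(comm_addr.rank);
        main_func.set_size(comm_addr.size);
        std::cout << "registered on: " << comm_addr.to_string() << "\n";
    }
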
diff --git a/src/common/comm/l0/modules/a2a/allreduce_export_functions.hpp b/src/common/comm/l0/modules/a2a/allreduce_export_functions.hpp
index 322b0c6b7..392d9c316 100644
--- a/src/common/comm/l0/modules/a2a/allreduce_export_functions.hpp
+++ b/src/common/comm/l0/modules/a2a/allreduce_export_functions.hpp
@@ -18,294 +18,219 @@
 
 namespace native {
 
-template <class kernel_params>
-struct a2a_allreduce_kernel
-        : public execution_kernel<
-              a2a_allreduce_kernel<kernel_params>,
-              arg<main_kernel_args::args_start_index, size_t>,
-              arg<main_kernel_args::args_start_index + 1, typename kernel_params::native_type*>,
-              arg<main_kernel_args::args_start_index + 2, typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 3,
-                              typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 4, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 5, int*>,
-              arg<main_kernel_args::args_start_index + 6, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 7,
-                              typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 8, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 9, int*>> {
-    using param_t = kernel_params;
-    using processing_type = typename kernel_params::native_type;
+namespace a2a {
 
-    static constexpr const char* specific_name() {
-        return "allreduce_execution";
-    }
+namespace allreduce {
 
-    //own
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
+/**
+ * Common args for all kernel types
+ */
 
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
+// own
+using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
+using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
 
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
+template <class native_t>
+using send_buf_arg = arg<main_kernel_args::args_start_index + 1, native_t*>;
 
-    using tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
+template <class native_t>
+using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, native_t*>;
 
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 4, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
+template <class native_t>
+using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 3, native_t*>;
 
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
+using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 4, int*>;
+using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
 
-    using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
-    using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
+using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 5, int*>;
+using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
 
-    //right
-    using right_tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
+using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
+using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
 
-    /*  using right_recv_buf_arg =                  thread_safe_arg<main_kernel_args::args_start_index + 8, void *>;
-    using right_recv_buf_arg_type =             typename right_recv_buf_arg::arg_type;
-*/
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
+// right
+template <class native_t>
+using right_tmp_recv_buf_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 7, native_t*>;
+
+using right_income_data_flag_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>;
+
+using right_ready_to_recv_flag_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>;
 
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
+// IMPORTANT: the number and types of arguments must be the same in all classes,
+// excluding arguments specific to numa/scaleout, etc.
+struct main_kernel : public execution_kernel<main_kernel,
+                                             send_buf_size_arg,
+                                             send_buf_arg<void>,
+                                             recv_buf_arg<void>,
+                                             tmp_recv_buf_arg<void>,
+                                             income_data_flag_arg,
+                                             ready_to_recv_flag_arg,
+                                             local_barrier_flag_arg,
+                                             right_tmp_recv_buf_arg<void>,
+                                             right_income_data_flag_arg,
+                                             right_ready_to_recv_flag_arg> {
+    using processing_type = void;
 
-    using base = execution_kernel<ring_allreduce_kernel<kernel_params>,
+    static constexpr const char* specific_name() {
+        return "allreduce_execution";
+    }
+
+    using common_entry_buf_size_arg = send_buf_size_arg;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
+
+    using base = execution_kernel<main_kernel,
                                   send_buf_size_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  tmp_recv_buf_arg,
+                                  send_buf_arg<processing_type>,
+                                  recv_buf_arg<processing_type>,
+                                  tmp_recv_buf_arg<processing_type>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg,
+                                  right_tmp_recv_buf_arg<processing_type>,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg>;
+
+    using base::base;
 };
 
-template <class kernel_params>
-struct a2a_allreduce_numa_kernel
-        : public execution_kernel<
-              a2a_allreduce_numa_kernel<kernel_params>,
-              arg<main_kernel_args::args_start_index, size_t>,
-              arg<main_kernel_args::args_start_index + 1, typename kernel_params::native_type*>,
-              arg<main_kernel_args::args_start_index + 2, typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 3,
-                              typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 4, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 5, int*>,
-              arg<main_kernel_args::args_start_index + 6, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 7,
-                              typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 8, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 9, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 10,
-                              typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 11, int*>> {
-    using param_t = kernel_params;
-    using processing_type = typename kernel_params::native_type;
+struct numa_kernel
+        : public execution_kernel<numa_kernel,
+                                  send_buf_size_arg,
+                                  send_buf_arg<void>,
+                                  recv_buf_arg<void>,
+                                  tmp_recv_buf_arg<void>,
+                                  income_data_flag_arg,
+                                  ready_to_recv_flag_arg,
+                                  local_barrier_flag_arg,
+                                  right_tmp_recv_buf_arg<void>,
+                                  right_income_data_flag_arg,
+                                  right_ready_to_recv_flag_arg,
+
+                                  // numa-specific args
+                                  permanent_arg<main_kernel_args::args_start_index + 10, void*>,
+                                  permanent_arg<main_kernel_args::args_start_index + 11, int*>> {
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "allreduce_execution_numa";
     }
 
-    //own
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    using tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 4, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
-    using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
-
-    //right
-    using right_tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
-
-    /*  using right_recv_buf_arg =                  thread_safe_arg<main_kernel_args::args_start_index + 8, void *>;
-    using right_recv_buf_arg_type =             typename right_recv_buf_arg::arg_type;
-*/
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
+    using common_entry_buf_size_arg = send_buf_size_arg;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
 
     // event data
     using event_prod_chunk_mem_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 10, processing_type*>;
+        permanent_arg<main_kernel_args::args_start_index + 10, processing_type*>;
     using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type;
 
-    using event_prod_bytes_arg = thread_safe_arg<main_kernel_args::args_start_index + 11, int*>;
+    using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 11, int*>;
     using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type;
 
-    using base = execution_kernel<a2a_allreduce_numa_kernel<kernel_params>,
+    using base = execution_kernel<numa_kernel,
                                   send_buf_size_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  tmp_recv_buf_arg,
+                                  send_buf_arg<processing_type>,
+                                  recv_buf_arg<processing_type>,
+                                  tmp_recv_buf_arg<processing_type>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg,
+                                  right_tmp_recv_buf_arg<processing_type>,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg,
                                   event_prod_chunk_mem_arg,
                                   event_prod_bytes_arg>;
+
+    using base::base;
 };
 
-template <class kernel_params>
-struct a2a_allreduce_ipc
-        : public ipc_kernel<a2a_allreduce_ipc<kernel_params>,
-                            stub_arg<main_kernel_args::args_start_index>,
-                            stub_arg<main_kernel_args::args_start_index + 1>,
-                            stub_arg<main_kernel_args::args_start_index + 2>,
-                            thread_safe_arg<main_kernel_args::args_start_index + 3,
-                                            typename kernel_params::native_type*>,
-                            thread_safe_arg<main_kernel_args::args_start_index + 4, int*>,
-                            thread_safe_arg<main_kernel_args::args_start_index + 5, int*>,
-                            stub_arg<main_kernel_args::args_start_index + 6>,
-                            stub_arg<main_kernel_args::args_start_index + 7>,
-                            stub_arg<main_kernel_args::args_start_index + 8>,
-                            stub_arg<main_kernel_args::args_start_index + 9>> {
-    using param_t = kernel_params;
-    using processing_type = typename kernel_params::native_type;
+struct ipc_kernel : public base_ipc_kernel<ipc_kernel,
+                                           stub_arg<main_kernel_args::args_start_index>,
+                                           stub_arg<main_kernel_args::args_start_index + 1>,
+                                           stub_arg<main_kernel_args::args_start_index + 2>,
+                                           tmp_recv_buf_arg<void>,
+                                           income_data_flag_arg,
+                                           ready_to_recv_flag_arg,
+                                           stub_arg<main_kernel_args::args_start_index + 6>,
+                                           stub_arg<main_kernel_args::args_start_index + 7>,
+                                           stub_arg<main_kernel_args::args_start_index + 8>,
+                                           stub_arg<main_kernel_args::args_start_index + 9>> {
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "a2a_allreduce_ipc";
     }
 
-    using tmp_recv_buf_arg = typename ring_allreduce_kernel<kernel_params>::tmp_recv_buf_arg;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    using income_data_flag_arg =
-        typename ring_allreduce_kernel<kernel_params>::income_data_flag_arg;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    using ready_to_recv_flag_arg =
-        typename ring_allreduce_kernel<kernel_params>::ready_to_recv_flag_arg;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
+    using common_entry_buf_size_arg = send_buf_size_arg;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
+
+    using base = base_ipc_kernel<ipc_kernel,
+                                 stub_arg<main_kernel_args::args_start_index>,
+                                 stub_arg<main_kernel_args::args_start_index + 1>,
+                                 stub_arg<main_kernel_args::args_start_index + 2>,
+                                 tmp_recv_buf_arg<processing_type>,
+                                 income_data_flag_arg,
+                                 ready_to_recv_flag_arg,
+                                 stub_arg<main_kernel_args::args_start_index + 6>,
+                                 stub_arg<main_kernel_args::args_start_index + 7>,
+                                 stub_arg<main_kernel_args::args_start_index + 8>,
+                                 stub_arg<main_kernel_args::args_start_index + 9>>;
+
+    using base::base;
+};
 
-    using base = execution_kernel<a2a_allreduce_ipc<kernel_params>,
-                                  stub_arg<main_kernel_args::args_start_index>,
-                                  stub_arg<main_kernel_args::args_start_index + 1>,
-                                  stub_arg<main_kernel_args::args_start_index + 2>,
-                                  tmp_recv_buf_arg,
+struct scale_out_cpu_gw_kernel
+        : public execution_kernel<scale_out_cpu_gw_kernel,
+                                  send_buf_size_arg,
+                                  send_buf_arg<void>,
+                                  recv_buf_arg<void>,
+                                  tmp_recv_buf_arg<void>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
-                                  stub_arg<main_kernel_args::args_start_index + 6>,
-                                  stub_arg<main_kernel_args::args_start_index + 7>,
-                                  stub_arg<main_kernel_args::args_start_index + 8>,
-                                  stub_arg<main_kernel_args::args_start_index + 9>>;
-};
+                                  local_barrier_flag_arg,
+                                  right_tmp_recv_buf_arg<void>,
+                                  right_income_data_flag_arg,
+                                  right_ready_to_recv_flag_arg,
 
-template <class native_type>
-struct a2a_allreduce_scale_out_cpu_gw_kernel
-        : public execution_kernel<
-              a2a_allreduce_scale_out_cpu_gw_kernel<native_type>,
-              arg<main_kernel_args::args_start_index, size_t>,
-              arg<main_kernel_args::args_start_index + 1, native_type*>,
-              arg<main_kernel_args::args_start_index + 2, native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 3, native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 4, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 5, int*>,
-              arg<main_kernel_args::args_start_index + 6, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 7, native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 8, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 9, int*>,
-
-              thread_safe_arg<main_kernel_args::args_start_index + 10, native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 11, int*>> {
-    using processing_type = native_type;
+                                  // scaleout-specific args
+                                  permanent_arg<main_kernel_args::args_start_index + 10, void*>,
+                                  permanent_arg<main_kernel_args::args_start_index + 11, int*>> {
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "allreduce_execution_scale_out_cpu_gw";
     }
 
-    //own
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    using tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 4, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
-    using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
-
-    //right
-    using right_tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
-
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
+    using common_entry_buf_size_arg = send_buf_size_arg;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
 
     // event data
     using event_prod_chunk_mem_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 10, native_type*>;
+        permanent_arg<main_kernel_args::args_start_index + 10, processing_type*>;
     using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type;
 
-    using event_prod_bytes_arg = thread_safe_arg<main_kernel_args::args_start_index + 11, int*>;
+    using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 11, int*>;
     using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type;
 
-    using base = execution_kernel<a2a_allreduce_scale_out_cpu_gw_kernel<native_type>,
+    using base = execution_kernel<scale_out_cpu_gw_kernel,
                                   send_buf_size_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  tmp_recv_buf_arg,
+                                  send_buf_arg<processing_type>,
+                                  recv_buf_arg<processing_type>,
+                                  tmp_recv_buf_arg<processing_type>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg,
+                                  right_tmp_recv_buf_arg<processing_type>,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg,
                                   event_prod_chunk_mem_arg,
                                   event_prod_bytes_arg>;
+
+    using base::base;
 };
 
+} // namespace allreduce
+} // namespace a2a
 } // namespace native
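
The rewritten export header declares each positional kernel argument once at namespace scope and reuses it across main_kernel, numa_kernel, ipc_kernel, and scale_out_cpu_gw_kernel, with void erasing the element type. A compilable sketch of that alias pattern; the offsets and names here are hypothetical:

    #include <cstddef>

    template <std::size_t pos, class type>
    struct arg {
        static constexpr std::size_t index = pos;
        using arg_type = type;
    };

    constexpr std::size_t args_start_index = 4; // hypothetical base offset

    using send_buf_size_arg = arg<args_start_index, std::size_t>;

    template <class native_t>
    using send_buf_arg = arg<args_start_index + 1, native_t*>;

    template <class native_t>
    using recv_buf_arg = arg<args_start_index + 2, native_t*>;

    // a type-erased kernel composes the aliases with native_t = void,
    // mirroring main_kernel's send_buf_arg<void> / recv_buf_arg<void>
    struct main_kernel_sketch {
        using send_buf = send_buf_arg<void>; // arg_type is void*
        using recv_buf = recv_buf_arg<void>;
    };

    static_assert(main_kernel_sketch::send_buf::index == args_start_index + 1,
                  "argument positions must stay in sync across kernel types");

    int main() {}
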
diff --git a/src/common/comm/l0/modules/a2a/allreduce_module.hpp b/src/common/comm/l0/modules/a2a/allreduce_module.hpp
index 25f9f2405..41c3eb648 100644
--- a/src/common/comm/l0/modules/a2a/allreduce_module.hpp
+++ b/src/common/comm/l0/modules/a2a/allreduce_module.hpp
@@ -22,22 +22,22 @@ DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module,
                                  real_gpu_typed_module,
                                  ccl_coll_allreduce,
                                  ccl::device_topology_type::a2a,
-                                 a2a_allreduce_kernel,
-                                 a2a_allreduce_numa_kernel,
-                                 a2a_allreduce_scale_out_cpu_gw_kernel);
+                                 a2a::allreduce::main_kernel,
+                                 a2a::allreduce::numa_kernel,
+                                 a2a::allreduce::scale_out_cpu_gw_kernel);
 
 DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module,
                                  ipc_gpu_typed_module,
                                  ccl_coll_allreduce,
                                  ccl::device_topology_type::a2a,
-                                 a2a_allreduce_ipc,
-                                 a2a_allreduce_ipc,
-                                 a2a_allreduce_ipc);
+                                 a2a::allreduce::ipc_kernel,
+                                 a2a::allreduce::ipc_kernel,
+                                 a2a::allreduce::ipc_kernel);
 
 DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_allreduce,
                                 ccl::device_topology_type::a2a,
-                                a2a_allreduce_kernel,
-                                a2a_allreduce_numa_kernel,
-                                a2a_allreduce_scale_out_cpu_gw_kernel);
+                                a2a::allreduce::main_kernel,
+                                a2a::allreduce::numa_kernel,
+                                a2a::allreduce::scale_out_cpu_gw_kernel);
 
 } // namespace native
diff --git a/src/common/comm/l0/modules/gpu_typed_module.hpp b/src/common/comm/l0/modules/gpu_typed_module.hpp
index 744d47be9..9825a527e 100644
--- a/src/common/comm/l0/modules/gpu_typed_module.hpp
+++ b/src/common/comm/l0/modules/gpu_typed_module.hpp
@@ -26,11 +26,11 @@
 namespace native {
 
 template <ccl_coll_type type,
-          template <typename>
+          //   template <typename>
           class kernel_function_impl,
-          template <typename>
+          //   template <typename>
           class kernel_numa_function_impl,
-          template <typename>
+          //   template <typename>
           class kernel_scale_out_cpu_gw_function_impl>
 struct real_gpu_typed_module : private gpu_module_base,
                                public kernel_class<type, kernel_function_impl>,
@@ -49,17 +49,25 @@ struct real_gpu_typed_module : private gpu_module_base,
                   ccl_coll_type_to_str(type),
                   ", modules handle: ",
                   (void*)module);
-        ccl_tuple_for_each(main_class::value,
-                           detail::kernel_entry_initializer<type>(
-                               [this](const std::string& name) -> gpu_module_base::kernel_handle {
-                                   return this->import_kernel(name);
-                               }));
 
-        ccl_tuple_for_each(numa_class::value,
+        // TODO: is there a nicer way to iterate?
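+        // main_class::value is now an unordered_map keyed by the runtime kernel
+        // parameters, so a plain range-for replaces the old ccl_tuple_for_each
+        // over a tuple of per-type kernels.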
+        for (auto&& kernel_node : main_class::value) {
+            detail::kernel_entry_initializer<type>(
+                [this](const std::string& name) -> gpu_module_base::kernel_handle {
+                    return this->import_kernel(name);
+                })(kernel_node.second);
+        }
+
+        /*ccl_tuple_for_each(numa_class::value,
                            detail::kernel_entry_initializer<type>(
                                [this](const std::string& name) -> gpu_module_base::kernel_handle {
                                    return this->import_kernel(name);
-                               }));
+                               }));*/
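+        // NOTE: numa kernel import is disabled in this change; the commented
+        // block above is kept for reference.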
 
         LOG_DEBUG("Imported functions count: ", functions.size());
     }
@@ -86,11 +94,11 @@ struct real_gpu_typed_module : private gpu_module_base,
 
 //2) virtual ipc_gpu_typed_module
 template <ccl_coll_type type,
-          template <typename>
           class kernel_function_impl,
-          template <typename>
           class kernel_numa_function_impl,
-          template <typename>
           class kernel_scale_out_cpu_gw_function_impl>
 struct ipc_gpu_typed_module : private gpu_module_base,
                               public kernel_class<type, kernel_function_impl> {
@@ -102,11 +110,17 @@ struct ipc_gpu_typed_module : private gpu_module_base,
 
     ipc_gpu_typed_module(handle module_handle) : gpu_module_base(nullptr) {
         LOG_DEBUG("Remote gpu module created: ", ccl_coll_type_to_str(type));
-        ccl_tuple_for_each(main_class::value,
-                           detail::kernel_entry_initializer<type>(
-                               [](const std::string& name) -> gpu_module_base::kernel_handle {
-                                   return nullptr;
-                               }));
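+        // Remote (IPC) modules only register kernel entries; nothing is
+        // imported on this side, so every handle stays nullptr.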
+        for (auto&& kernel : main_class::value) {
+            detail::kernel_entry_initializer<type>(
+                [](const std::string& name) -> gpu_module_base::kernel_handle {
+                    return nullptr;
+                })(kernel.second);
+        }
         LOG_DEBUG("No need to import functions");
     }
 
@@ -123,11 +137,11 @@ struct ipc_gpu_typed_module : private gpu_module_base,
 
 //3) virtual gpu module
 template <ccl_coll_type type,
-          template <typename>
           class kernel_function_impl,
-          template <typename>
           class kernel_numa_function_impl,
-          template <typename>
           class kernel_scale_out_cpu_gw_function_impl>
 struct virtual_gpu_typed_module : private gpu_module_base,
                                   public kernel_class<type, kernel_function_impl>,
@@ -151,16 +165,22 @@ struct virtual_gpu_typed_module : private gpu_module_base,
             : gpu_module_base(real_module->get()),
               real_module_ref(real_module) {
         LOG_DEBUG("Virtual gpu module created:", ccl_coll_type_to_str(type));
-        ccl_tuple_for_each(main_class::value,
-                           detail::kernel_entry_initializer<type>(
-                               [this](const std::string& name) -> gpu_module_base::kernel_handle {
-                                   return this->import_kernel(name);
-                               }));
-        ccl_tuple_for_each(numa_class::value,
+        for (auto&& kernel : main_class::value) {
+            detail::kernel_entry_initializer<type>(
+                [this](const std::string& name) -> gpu_module_base::kernel_handle {
+                    return this->import_kernel(name);
+                })(kernel.second);
+        }
+        /*ccl_tuple_for_each(numa_class::value,
                            detail::kernel_entry_initializer<type>(
                                [this](const std::string& name) -> gpu_module_base::kernel_handle {
                                    return this->import_kernel(name);
-                               }));
+                               }));*/
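+        // NOTE: as in real_gpu_typed_module, numa kernel import is disabled here.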
 
         LOG_DEBUG("Linked functions count: ", functions.size());
     }
diff --git a/src/common/comm/l0/modules/kernel_argument_policies.hpp b/src/common/comm/l0/modules/kernel_argument_policies.hpp
index 70c1a8b85..48b2b0f53 100644
--- a/src/common/comm/l0/modules/kernel_argument_policies.hpp
+++ b/src/common/comm/l0/modules/kernel_argument_policies.hpp
@@ -88,8 +88,8 @@ struct arg_access_policy_atomic {
     std::atomic<bool> charged{ false };
 };
 
-// Policy that invalidates the value once it's loaded by a consumer. It remains invalid for read untill a producer
-// writes an new one
+// Policy that invalidates the value once it's loaded by a consumer.
+// It remains invalid for read until a producer writes a new one
 // Note: only one read/invalidate is supported
 template <size_t pos, class ArgType, bool must_exist = true>
 struct arg_access_policy_atomic_reset : public arg_access_policy_atomic<pos, ArgType, must_exist> {
diff --git a/src/common/comm/l0/modules/kernel_argument_types.hpp b/src/common/comm/l0/modules/kernel_argument_types.hpp
index ec70d42bb..b852811bc 100644
--- a/src/common/comm/l0/modules/kernel_argument_types.hpp
+++ b/src/common/comm/l0/modules/kernel_argument_types.hpp
@@ -49,7 +49,8 @@ struct kernel_arg : public policy_impl, options {
 template <size_t pos, class type, class options = options::empty>
 using thread_safe_arg = kernel_arg<pos, arg_access_policy_atomic<pos, type, false>, options>;
 
-// thread-safe destructive-copying argument (rechargable): used for concurrent read/write applications, where reader take-away exising value
+// thread-safe destructive-copying argument (rechargeable): used for concurrent
+// read/write scenarios, where the reader takes away the existing value
 template <size_t pos, class type, class options = options::empty>
 using thread_exchangable_arg =
     kernel_arg<pos, arg_access_policy_atomic_reset<pos, type, false>, options>;
diff --git a/src/common/comm/l0/modules/kernel_class.hpp b/src/common/comm/l0/modules/kernel_class.hpp
index 92c0ccfa9..669e8ade5 100644
--- a/src/common/comm/l0/modules/kernel_class.hpp
+++ b/src/common/comm/l0/modules/kernel_class.hpp
@@ -15,142 +15,105 @@
 */
 #pragma once
 #include <tuple>
-#include "common/comm/l0/modules/kernel_params.hpp"
 #include "common/utils/tuple.hpp"
+#include <unordered_map>
 
 namespace native {
 
-#define SUPPORTED_KERNEL_NATIVE_DATA_TYPES \
-    int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t, ccl::float16, float, \
-        double, ccl::bfloat16
-
-template <ccl_coll_type type, template <typename> class kernel_function_impl>
+template <ccl_coll_type type, class kernel_function_impl, class Enable = void>
 struct kernel_class {
-    template <class native_data_type>
-    using kernel_param_t = kernel_params_default<native_data_type>;
-
-    template <class kernel_param>
-    using kernel_t = kernel_function_impl<kernel_param>;
-
-    template <class... native_data_types>
-    using kernels_t = std::tuple<kernel_t<kernel_param_t<native_data_types>>...>;
-
-    using kernel_class_container_t = kernels_t<SUPPORTED_KERNEL_NATIVE_DATA_TYPES>;
-
-    // getter
-    template <class kernel_param>
-    const kernel_t<kernel_param> &get() const {
-        return ccl_tuple_get<kernel_t<kernel_param>>(value);
-    }
-
-    template <class kernel_param>
-    kernel_t<kernel_param> &get() {
-        return ccl_tuple_get<kernel_t<kernel_param>>(value);
+    using kernel_t = kernel_function_impl;
+
+    using key_type = ccl::datatype;
+
+    struct hasher {
+        size_t operator()(const ccl::datatype& dtype) const {
+            return std::hash<size_t>{}((size_t)dtype);
+        }
+    };
+
+    using kernel_class_container_t = std::unordered_map<key_type, kernel_t, hasher>;
+
+    kernel_class() {
+        for (ccl::datatype idx = ccl::datatype::int8; idx <= ccl::datatype::bfloat16; idx++) {
+            key_type key{ idx };
+            // We have to use in-place construction here because kernel_t has its
+            // copy and move constructors deleted; emplace with
+            // std::piecewise_construct is the only way to build the value inside the map.
+            value.emplace(std::piecewise_construct,
+                          std::make_tuple(key),
+                          std::make_tuple(coll_param_gpu(type, idx)));
+        }
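+        // Note: this loop relies on ccl::datatype providing operator++ and on
+        // the enum values from int8 through bfloat16 being contiguous.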
     }
-
-protected:
-    kernel_class_container_t value;
-};
-
-template <template <typename> class kernel_function_impl>
-struct kernel_class<ccl_coll_allreduce, kernel_function_impl> {
-    template <class native_data_type, ccl_coll_reduction reduction>
-    using kernel_param_t = kernel_reduction_params_traits<native_data_type, reduction>;
-
-    template <class kernel_param>
-    using kernel_t = kernel_function_impl<kernel_param>;
-
-    template <class first_param, ccl_coll_reduction... second_params>
-    using kernel_second_params_expanded_t =
-        std::tuple<kernel_t<kernel_param_t<first_param, second_params>>...>;
-
-    template <class... first_params>
-    using kernel_first_param_expanded_t = decltype(std::tuple_cat(
-        std::declval<kernel_second_params_expanded_t<first_params, REDUCE_TYPES> &&>()...));
-
-    using kernel_class_container_t =
-        kernel_first_param_expanded_t<SUPPORTED_KERNEL_NATIVE_DATA_TYPES>;
-
     // getter
-    template <class kernel_param>
-    const kernel_t<kernel_param> &get() const {
-        return ccl_tuple_get<kernel_t<kernel_param>>(value);
-    }
+    kernel_t& get(const coll_param_gpu& params) {
+        assert(!params.is_reduction());
+        key_type key{ params.get_datatype() };
 
-    template <class kernel_param>
-    kernel_t<kernel_param> &get() {
-        return ccl_tuple_get<kernel_t<kernel_param>>(value);
-    }
+        auto it = value.find(key);
+        if (it == value.end()) {
+            // TODO: throw a ccl/SYCL-specific exception type instead of std::runtime_error
+            throw std::runtime_error("Kernel not found");
+        }
 
-protected:
-    kernel_class_container_t value;
-};
-
-template <template <typename> class kernel_function_impl>
-struct kernel_class<ccl_coll_reduce, kernel_function_impl> {
-    template <class native_data_type, ccl_coll_reduction reduction>
-    using kernel_param_t = kernel_reduction_params_traits<native_data_type, reduction>;
-
-    template <class kernel_param>
-    using kernel_t = kernel_function_impl<kernel_param>;
-
-    template <class first_param, ccl_coll_reduction... second_params>
-    using kernel_second_params_expanded_t =
-        std::tuple<kernel_t<kernel_param_t<first_param, second_params>>...>;
-
-    template <class... first_params>
-    using kernel_first_param_expanded_t = decltype(std::tuple_cat(
-        std::declval<kernel_second_params_expanded_t<first_params, REDUCE_TYPES> &&>()...));
-
-    using kernel_class_container_t =
-        kernel_first_param_expanded_t<SUPPORTED_KERNEL_NATIVE_DATA_TYPES>;
-
-    // getter
-    template <class kernel_param>
-    const kernel_t<kernel_param> &get() const {
-        return ccl_tuple_get<kernel_t<kernel_param>>(value);
-    }
-
-    template <class kernel_param>
-    kernel_t<kernel_param> &get() {
-        return ccl_tuple_get<kernel_t<kernel_param>>(value);
+        return it->second;
     }
 
 protected:
     kernel_class_container_t value;
 };
 
-template <template <typename> class kernel_function_impl>
-struct kernel_class<ccl_coll_reduce_scatter, kernel_function_impl> {
-    template <class native_data_type, ccl_coll_reduction reduction>
-    using kernel_param_t = kernel_reduction_params_traits<native_data_type, reduction>;
-
-    template <class kernel_param>
-    using kernel_t = kernel_function_impl<kernel_param>;
-
-    template <class first_param, ccl_coll_reduction... second_params>
-    using kernel_second_params_expanded_t =
-        std::tuple<kernel_t<kernel_param_t<first_param, second_params>>...>;
+template <ccl_coll_type type, class kernel_function_impl>
+struct kernel_class<type,
+                    kernel_function_impl,
+                    typename std::enable_if<is_reduction_coll_type<type>::value>::type> {
+    using kernel_t = kernel_function_impl;
+
+    using key_type = std::pair<ccl::datatype, ccl::reduction>;
+
+    struct hasher {
+        size_t operator()(const std::pair<ccl::datatype, ccl::reduction>& key) const {
+            return std::hash<size_t>{}((size_t)key.first) ^ std::hash<size_t>{}((size_t)key.second);
+        }
+    };
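+    // NOTE: a plain XOR of the two hashes collides whenever both fields hash to
+    // the same value; a boost-style hash_combine would distribute better if this
+    // map ever becomes hot.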
+
+    using kernel_class_container_t = std::unordered_map<key_type, kernel_t, hasher>;
+
+    kernel_class() {
+        for (ccl::datatype idx = ccl::datatype::int8; idx <= ccl::datatype::bfloat16; idx++) {
+            // TODO: iterate over the reduction values directly (needs operator++ for ccl::reduction)
+            auto insert_kernel = [this, idx](ccl::reduction red) {
+                key_type key{ idx, red };
+                value.emplace(std::piecewise_construct,
+                              std::make_tuple(key),
+                              std::make_tuple(coll_param_gpu(type, idx, red)));
+            };
+
+            insert_kernel(ccl::reduction::sum);
+            insert_kernel(ccl::reduction::prod);
+            insert_kernel(ccl::reduction::min);
+            insert_kernel(ccl::reduction::max);
+        }
+    }
 
-    template <class... first_params>
-    using kernel_first_param_expanded_t = decltype(std::tuple_cat(
-        std::declval<kernel_second_params_expanded_t<first_params, REDUCE_TYPES> &&>()...));
+    // getter
+    kernel_t& get(const coll_param_gpu& params) {
+        assert(params.is_reduction());
 
-    using kernel_class_container_t =
-        kernel_first_param_expanded_t<SUPPORTED_KERNEL_NATIVE_DATA_TYPES>;
+        key_type key{ params.get_datatype(), params.get_reduction() };
 
-    // getter
-    template <class kernel_param>
-    const kernel_t<kernel_param> &get() const {
-        return ccl_tuple_get<kernel_t<kernel_param>>(value);
-    }
+        auto it = value.find(key);
+        if (it == value.end()) {
+            // TODO: throw a ccl/SYCL-specific exception type instead of std::runtime_error
+            throw std::runtime_error("Kernel not found");
+        }
 
-    template <class kernel_param>
-    kernel_t<kernel_param> &get() {
-        return ccl_tuple_get<kernel_t<kernel_param>>(value);
+        return it->second;
     }
 
 protected:
+    // TODO: thread safety? This looks fine as long as different threads access
+    // different devices; the IPC/NUMA case still needs to be double-checked.
     kernel_class_container_t value;
 };
+
 } //namespace native
diff --git a/src/common/comm/l0/modules/kernel_functions.hpp b/src/common/comm/l0/modules/kernel_functions.hpp
index 704438d6d..e3765ce91 100644
--- a/src/common/comm/l0/modules/kernel_functions.hpp
+++ b/src/common/comm/l0/modules/kernel_functions.hpp
@@ -15,6 +15,7 @@
 */
 #pragma once
 #include "common/comm/l0/modules/kernel_argument_types.hpp"
+#include "coll/coll_param.hpp"
 
 namespace native {
 // kernel with its argument collection
@@ -70,21 +71,37 @@ struct kernel_data_storage {
 // major kernel args
 enum main_kernel_args { rank_index = 0, size_index = 1, args_start_index };
 
+class kernel_parameters_holder {
+    coll_param_gpu params;
+
+public:
+    kernel_parameters_holder(const coll_param_gpu& params) : params{ params } {}
+
+    const coll_param_gpu& get_kernel_params() const {
+        return params;
+    }
+};
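+// execution_kernel and base_ipc_kernel inherit this holder so that
+// detail::kernel_entry_initializer can read the runtime parameters
+// (datatype and, for reduction collectives, the reduction op) when composing
+// kernel names.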
+
 //main kernel - used for GPU program execution
 template <class Impl, class... arguments>
 struct execution_kernel : public kernel_data_storage<arg<main_kernel_args::rank_index, int>,
                                                      arg<main_kernel_args::size_index, int>,
-                                                     arguments...> {
+                                                     arguments...>,
+                          public kernel_parameters_holder {
     using base = kernel_data_storage<arg<main_kernel_args::rank_index, int>,
                                      arg<main_kernel_args::size_index, int>,
                                      arguments...>;
     using base::args;
     using base::handle;
 
+    using params_base = kernel_parameters_holder;
+
+    execution_kernel(const coll_param_gpu& params) : base{}, params_base{ params } {}
+
     using rank_type = int;
     using size_type = int;
 
-    static constexpr const char* name() {
+    const char* name() const {
         return Impl::specific_name();
     }
 
@@ -179,16 +196,22 @@ struct execution_kernel : public kernel_data_storage<arg<main_kernel_args::rank_
     }
 };
 
-// ipc_kernel - used for GPU data synchronization only
+// base_ipc_kernel - used for GPU data synchronization only
 template <class Impl, class... arguments>
-struct ipc_kernel : public kernel_data_storage<arg<main_kernel_args::rank_index, int>,
-                                               arg<main_kernel_args::size_index, int>,
-                                               arguments...> {
+struct base_ipc_kernel : public kernel_data_storage<arg<main_kernel_args::rank_index, int>,
+                                                    arg<main_kernel_args::size_index, int>,
+                                                    arguments...>,
+                         public kernel_parameters_holder {
     using base = kernel_data_storage<arg<main_kernel_args::rank_index, int>,
                                      arg<main_kernel_args::size_index, int>,
                                      arguments...>;
     using base::args;
     using base::handle;
+
+    using params_base = kernel_parameters_holder;
+
+    base_ipc_kernel(const coll_param_gpu& params) : base{}, params_base{ params } {}
+
     static constexpr const char* name() {
         return Impl::specific_name();
     }
diff --git a/src/common/comm/l0/modules/kernel_utils.cpp b/src/common/comm/l0/modules/kernel_utils.cpp
new file mode 100644
index 000000000..ce1f4c5ae
--- /dev/null
+++ b/src/common/comm/l0/modules/kernel_utils.cpp
@@ -0,0 +1,53 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "common/comm/l0/modules/kernel_utils.hpp"
+#include "common/global/global.hpp"
+
+namespace native {
+namespace detail {
+
+std::string to_string(ccl::reduction red) {
+#define P(val) \
+    case ccl::reduction::val: return #val;
+
+    switch (red) {
+        P(sum);
+        P(prod);
+        P(min);
+        P(max);
+        default:
+            throw std::runtime_error("Unexpected value of reduction: " +
+                                     std::to_string(static_cast<int>(red)));
+    }
+
+#undef P
+}
+
+// TODO: ideally we should take a set of all parameters and generate a kernel name
+// to execute
+std::string get_kernel_name(const std::string& kernel_name, const coll_param_gpu& params) {
+    // TODO: introduce a simple function to map names?
+    // Can we remove dtypes from global_data then? Do we need custom datatypes?
+    auto name = kernel_name + "_" + ccl::global_data::get().dtypes->name(params.get_datatype());
+    if (params.is_reduction()) {
+        name += "_" + to_string(params.get_reduction());
+    }
+
+    return name;
+}
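+// Example (exact strings depend on the registered datatype names): an
+// allreduce over float with sum reduction resolves to something like
+// "allreduce_execution_float_sum".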
+
+} // namespace detail
+} // namespace native
diff --git a/src/common/comm/l0/context/scaling_ctx/observer_session_key.cpp b/src/common/comm/l0/modules/kernel_utils.hpp
similarity index 67%
rename from src/common/comm/l0/context/scaling_ctx/observer_session_key.cpp
rename to src/common/comm/l0/modules/kernel_utils.hpp
index f93db94c3..fc4b82804 100644
--- a/src/common/comm/l0/context/scaling_ctx/observer_session_key.cpp
+++ b/src/common/comm/l0/modules/kernel_utils.hpp
@@ -13,18 +13,16 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "common/comm/l0/context/scaling_ctx/observer_session_key.hpp"
+#pragma once
+
+#include <string>
+
+#include "coll/coll_param.hpp"
 
 namespace native {
-namespace observer {
+namespace detail {
 
-bool session_key::operator<(const session_key& other) const noexcept {
-    return hash < other.hash;
-}
+std::string get_kernel_name(const std::string& kernel_name, const coll_param_gpu& params);
 
-std::string session_key::to_string() const {
-    return std::to_string(hash);
 }
-
-} // namespace observer
 } // namespace native
diff --git a/src/common/comm/l0/modules/modules_utils.hpp b/src/common/comm/l0/modules/modules_utils.hpp
index 1bcbd4d70..29d507c2c 100644
--- a/src/common/comm/l0/modules/modules_utils.hpp
+++ b/src/common/comm/l0/modules/modules_utils.hpp
@@ -17,35 +17,13 @@
 
 #include "common/comm/l0/modules/base_entry_module.hpp"
 #include "common/utils/tuple.hpp"
+#include "common/comm/l0/modules/kernel_utils.hpp"
 
 namespace native {
 namespace detail {
 
-template <ccl_coll_type type, typename = void>
-struct kernel_entry_initializer {
-    using loader_t =
-        std::function<gpu_module_base::kernel_handle(const std::string& function_name)>;
-
-    kernel_entry_initializer(loader_t&& f) : functor(std::move(f)) {}
-
-    template <class typed_kernel>
-    void operator()(typed_kernel& kernel) {
-        kernel.handle =
-            functor(std::string(typed_kernel::name()) + "_" +
-                    ccl::native_type_info<typename typed_kernel::processing_type>::name());
-    }
-
-private:
-    loader_t functor;
-};
-
-// Make template specialization for those collective types,
-// which have a multiply reduction ability
 template <ccl_coll_type type>
-struct kernel_entry_initializer<
-    type,
-    typename std::enable_if<type == ccl_coll_allreduce || type == ccl_coll_reduce ||
-                            type == ccl_coll_reduce_scatter>::type> {
+struct kernel_entry_initializer {
     using loader_t =
         std::function<gpu_module_base::kernel_handle(const std::string& function_name)>;
 
@@ -53,10 +31,7 @@ struct kernel_entry_initializer<
 
     template <class typed_kernel>
     void operator()(typed_kernel& kernel) {
-        kernel.handle =
-            functor(std::string(typed_kernel::name()) + "_" +
-                    ccl::native_type_info<typename typed_kernel::processing_type>::name() + "_" +
-                    reduction_to_str(typed_kernel::param_t::red_type));
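+        // The kernel name is now derived from the runtime coll_param_gpu
+        // (datatype and, where applicable, reduction) instead of compile-time
+        // template parameters; see detail::get_kernel_name in kernel_utils.cpp.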
+        kernel.handle = functor(get_kernel_name(kernel.name(), kernel.get_kernel_params()));
     }
 
 private:
diff --git a/src/common/comm/l0/modules/ring/allgatherv_entry_module.hpp b/src/common/comm/l0/modules/ring/allgatherv_entry_module.hpp
index acb21f613..24b00a8c5 100644
--- a/src/common/comm/l0/modules/ring/allgatherv_entry_module.hpp
+++ b/src/common/comm/l0/modules/ring/allgatherv_entry_module.hpp
@@ -23,21 +23,21 @@ DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module,
                                  real_gpu_typed_module,
                                  ccl_coll_allgatherv,
                                  ccl::device_topology_type::ring,
-                                 ring_allgatherv_kernel,
-                                 ring_allgatherv_numa_kernel,
-                                 ring_allgatherv_scale_out_cpu_gw_kernel);
+                                 ring::allgatherv::main_kernel,
+                                 ring::allgatherv::numa_kernel,
+                                 ring::allgatherv::scale_out_cpu_gw_kernel);
 
 DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module,
                                  ipc_gpu_typed_module,
                                  ccl_coll_allgatherv,
                                  ccl::device_topology_type::ring,
-                                 ring_allgatherv_ipc,
-                                 ring_allgatherv_ipc,
-                                 ring_allgatherv_ipc);
+                                 ring::allgatherv::ipc_kernel,
+                                 ring::allgatherv::ipc_kernel,
+                                 ring::allgatherv::ipc_kernel);
 
 DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_allgatherv,
                                 ccl::device_topology_type::ring,
-                                ring_allgatherv_kernel,
-                                ring_allgatherv_numa_kernel,
-                                ring_allgatherv_scale_out_cpu_gw_kernel);
+                                ring::allgatherv::main_kernel,
+                                ring::allgatherv::numa_kernel,
+                                ring::allgatherv::scale_out_cpu_gw_kernel);
 } // namespace native
diff --git a/src/common/comm/l0/modules/ring/allgatherv_export_functions.hpp b/src/common/comm/l0/modules/ring/allgatherv_export_functions.hpp
index a3fd720ba..23115a5f9 100644
--- a/src/common/comm/l0/modules/ring/allgatherv_export_functions.hpp
+++ b/src/common/comm/l0/modules/ring/allgatherv_export_functions.hpp
@@ -17,347 +17,225 @@
 #include "common/comm/l0/modules/kernel_functions.hpp"
 
 namespace native {
-template <class kernel_params>
-struct ring_allgatherv_kernel
-        : public execution_kernel<
-              ring_allgatherv_kernel<kernel_params>,
-              arg<main_kernel_args::args_start_index, size_t>, // elems_count
-              arg<main_kernel_args::args_start_index + 1, size_t*>, // recv_elem_counts_buf
-              arg<main_kernel_args::args_start_index + 2, size_t*>, // recv_elem_offsets_buf
-              arg<main_kernel_args::args_start_index + 3,
-                  typename kernel_params::native_type*>, // send_buf
-              thread_exchangable_arg<main_kernel_args::args_start_index + 4,
-                                     typename kernel_params::native_type*>, // recv_buf
-              arg<main_kernel_args::args_start_index + 5,
-                  typename kernel_params::native_type*>, // right_output_buffer
-              external_arg<main_kernel_args::args_start_index + 6,
-                           int*>, // left_wrote_to_me_flag
-              external_arg<main_kernel_args::args_start_index + 7,
-                           int*>, // i_ready_to_receive_flag
-              thread_exchangable_arg<main_kernel_args::args_start_index + 8,
-                                     int*>, // i_send_to_right_flag
-              thread_exchangable_arg<main_kernel_args::args_start_index + 9,
-                                     int*>> // right_ready_to_recv_flag
-{
-    using processing_type = typename kernel_params::native_type;
 
-    static constexpr const char* specific_name() {
-        return "allgatherv_execution";
-    }
+namespace ring {
 
-    // elems_count
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
+namespace allgatherv {
+
+/**
+ * Common args for all kernel types
+ */
+
+using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
+using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
+
+using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>;
+using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type;
 
-    // recv_elem_counts_buf
-    using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>;
-    using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type;
+using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>;
+using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type;
 
-    // recv_elem_offsets_buf
-    using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>;
-    using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type;
+template <class native_t>
+using send_buf_arg = arg<main_kernel_args::args_start_index + 3, native_t*>;
 
-    // send_buf
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
+template <class native_t>
+using recv_buf_arg = external_arg<main_kernel_args::args_start_index + 4, native_t*>;
 
-    // recv_buf
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 4, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
+template <class native_t>
+using right_output_buf_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 5, native_t*>;
 
-    // right_output_buffer
-    using right_output_buf_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 5, processing_type*>;
-    using right_output_buf_arg_type = typename right_output_buf_arg::arg_type;
+using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 6, int*>;
+using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
 
-    // left_wrote_to_me_flag
-    using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 6, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
+using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 7, int*>;
+using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
 
-    // i_ready_to_receive_flag
-    using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 7, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
+using right_income_data_flag_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>;
 
-    // i_send_to_right_flag
-    using right_income_data_flag_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
+using right_ready_to_recv_flag_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>;
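+// These aliases are shared by the main/numa/ipc/scale-out kernel classes below,
+// so every variant binds its arguments at the same kernel argument indices.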
 
-    // right_ready_to_recv_flag
-    using right_ready_to_recv_flag_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
+// IMPORTANT: the number and types of arguments must be the same in all classes,
+// excluding arguments specific to numa/scale-out kernels
+struct main_kernel
+        : public execution_kernel<main_kernel,
+                                  send_buf_size_arg, // elems_count
+                                  recv_elem_counts_buf_arg, // recv_elem_counts_buf
+                                  recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
+                                  send_buf_arg<void>, // send_buf
+                                  recv_buf_arg<void>, // recv_buf (output_buffer)
+                                  right_output_buf_arg<void>, // right_output_buffer
+                                  income_data_flag_arg, // left_wrote_to_me_flag
+                                  ready_to_recv_flag_arg, // i_ready_to_receive_flag
+                                  right_income_data_flag_arg, // i_send_to_right_flag
+                                  right_ready_to_recv_flag_arg> // right_ready_to_recv_flag
+{
+    using processing_type = void;
+
+    static constexpr const char* specific_name() {
+        return "allgatherv_execution";
+    }
+
+    using common_entry_buf_size_arg = send_buf_size_arg;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
 
-    using base = execution_kernel<ring_allgatherv_kernel<kernel_params>,
+    using base = execution_kernel<main_kernel,
                                   send_buf_size_arg,
                                   recv_elem_counts_buf_arg,
                                   recv_elem_offsets_buf_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  right_output_buf_arg,
+                                  send_buf_arg<processing_type>,
+                                  recv_buf_arg<processing_type>,
+                                  right_output_buf_arg<processing_type>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg>;
+
+    using base::base;
 };
 
 // IMPORTANT: the parameter order is the default one; see *allgatherv*.cl
-template <class kernel_params>
-struct ring_allgatherv_numa_kernel
-        : public execution_kernel<
-              ring_allgatherv_numa_kernel<kernel_params>,
-              arg<main_kernel_args::args_start_index, size_t>, // elems_count
-              arg<main_kernel_args::args_start_index + 1, size_t*>, // recv_elem_counts_buf
-              arg<main_kernel_args::args_start_index + 2, size_t*>, // recv_elem_offsets_buf
-              arg<main_kernel_args::args_start_index + 3,
-                  typename kernel_params::native_type*>, // send_buf
-              arg<main_kernel_args::args_start_index + 4,
-                  typename kernel_params::native_type*>, // recv_buf
-              thread_safe_arg<main_kernel_args::args_start_index + 5,
-                              typename kernel_params::native_type*>, // right_output_buffer
-              thread_safe_arg<main_kernel_args::args_start_index + 6,
-                              int*>, // left_wrote_to_me_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 7,
-                              int*>, // i_ready_to_receive_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 8, int*>, // i_send_to_right_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 9,
-                              int*>> // right_ready_to_recv_flag>
+struct numa_kernel
+        : public execution_kernel<numa_kernel,
+                                  send_buf_size_arg, // elems_count
+                                  recv_elem_counts_buf_arg, // recv_elem_counts_buf
+                                  recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
+                                  send_buf_arg<void>, // send_buf
+                                  recv_buf_arg<void>, // recv_buf (output_buffer)
+                                  right_output_buf_arg<void>, // right_output_buffer
+                                  income_data_flag_arg, // left_wrote_to_me_flag
+                                  ready_to_recv_flag_arg, // i_ready_to_receive_flag
+                                  right_income_data_flag_arg, // i_send_to_right_flag
+                                  right_ready_to_recv_flag_arg> // right_ready_to_recv_flag
 {
-    using processing_type = typename kernel_params::native_type;
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "allgatherv_execution_numa";
     }
 
-    // elems_count
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
     using common_entry_buf_size_arg = send_buf_size_arg;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    // recv_elem_counts_buf
-    using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>;
-    using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type;
-
-    // recv_elem_offsets_buf
-    using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>;
-    using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type;
-
-    // send_buf
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    // recv_buf
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 4, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    // right_output_buffer
-    using right_output_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 5, processing_type*>;
-    using right_output_buf_arg_type = typename right_output_buf_arg::arg_type;
-
-    // left_wrote_to_me_flag
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 6, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
 
-    // i_ready_to_receive_flag
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 7, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    // i_send_to_right_flag
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    // right_ready_to_recv_flag
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
-
-    using base = execution_kernel<ring_allgatherv_numa_kernel<kernel_params>,
+    using base = execution_kernel<numa_kernel,
                                   send_buf_size_arg,
                                   recv_elem_counts_buf_arg,
                                   recv_elem_offsets_buf_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  right_output_buf_arg,
+                                  send_buf_arg<processing_type>,
+                                  recv_buf_arg<processing_type>,
+                                  right_output_buf_arg<processing_type>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg>;
+
+    template <class ctx_params_t>
+    void bind_data(const ctx_params_t& out_ctx_params) {
+        // TODO not implemented
+        (void)out_ctx_params;
+        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
+    }
+
+    using base::base;
 };
 
-template <class kernel_params>
-struct ring_allgatherv_ipc
-        : public ipc_kernel<
-              ring_allgatherv_ipc<kernel_params>,
-              arg<main_kernel_args::args_start_index, size_t>, // elems_count
-              arg<main_kernel_args::args_start_index + 1, size_t*>, // recv_elem_counts_buf
-              arg<main_kernel_args::args_start_index + 2, size_t*>, // recv_elem_offsets_buf
-              arg<main_kernel_args::args_start_index + 3,
-                  typename kernel_params::native_type*>, // send_buf
-              arg<main_kernel_args::args_start_index + 4,
-                  typename kernel_params::native_type*>, // recv_buf
-              thread_safe_arg<main_kernel_args::args_start_index + 5,
-                              typename kernel_params::native_type*>, // right_output_buffer
-              thread_safe_arg<main_kernel_args::args_start_index + 6,
-                              int*>, // left_wrote_to_me_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 7,
-                              int*>, // i_ready_to_receive_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 8, int*>, // i_send_to_right_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 9,
-                              int*>> // right_ready_to_recv_flag
+struct ipc_kernel : public base_ipc_kernel<ipc_kernel,
+                                           send_buf_size_arg, // elems_count
+                                           recv_elem_counts_buf_arg, // recv_elem_counts_buf
+                                           recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
+                                           send_buf_arg<void>, // send_buf
+                                           recv_buf_arg<void>, // recv_buf (output_buffer)
+                                           right_output_buf_arg<void>, // right_output_buffer
+                                           income_data_flag_arg, // left_wrote_to_me_flag
+                                           ready_to_recv_flag_arg, // i_ready_to_receive_flag
+                                           right_income_data_flag_arg, // i_send_to_right_flag
+                                           right_ready_to_recv_flag_arg> // right_ready_to_recv_flag
 {
-    using processing_type = typename kernel_params::native_type;
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "ring_allgatherv_ipc";
     }
 
-    // elems_count
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
     using common_entry_buf_size_arg = send_buf_size_arg;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    // recv_elem_counts_buf
-    using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>;
-    using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type;
-
-    // recv_elem_offsets_buf
-    using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>;
-    using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type;
-
-    // send_buf
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    // recv_buf
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 4, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    // right_output_buffer
-    using right_output_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 5, processing_type*>;
-    using right_output_buf_arg_type = typename right_output_buf_arg::arg_type;
-
-    // left_wrote_to_me_flag
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 6, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    // i_ready_to_receive_flag
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 7, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    // i_send_to_right_flag
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    // right_ready_to_recv_flag
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
+
+    using base = base_ipc_kernel<ipc_kernel,
+                                 send_buf_size_arg,
+                                 recv_elem_counts_buf_arg,
+                                 recv_elem_offsets_buf_arg,
+                                 send_buf_arg<processing_type>,
+                                 recv_buf_arg<processing_type>,
+                                 right_output_buf_arg<processing_type>,
+                                 income_data_flag_arg,
+                                 ready_to_recv_flag_arg,
+                                 right_income_data_flag_arg,
+                                 right_ready_to_recv_flag_arg>;
+
+    template <class ipc_handles_t>
+    void bind_data(const ipc_handles_t& ipc_handles) {
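+        // ipc_handles is expected to arrive in this fixed order:
+        // [0] recv_buf, [1] income_data_flag, [2] ready_to_recv_flag.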
+        auto recv_buf = reinterpret_cast<typename recv_buf_arg<processing_type>::arg_type>(
+            ipc_handles.at(0).get().pointer);
+        this->template set_arg<recv_buf_arg<processing_type>>(recv_buf);
+
+        auto income_data_flag =
+            reinterpret_cast<income_data_flag_arg_type>(ipc_handles.at(1).get().pointer);
+        this->template set_arg<income_data_flag_arg>(income_data_flag);
+
+        auto ready_to_recv_flag =
+            reinterpret_cast<ready_to_recv_flag_arg_type>(ipc_handles.at(2).get().pointer);
+        this->template set_arg<ready_to_recv_flag_arg>(ready_to_recv_flag);
+    }
 
-    using base = execution_kernel<ring_allgatherv_ipc<kernel_params>,
-                                  send_buf_size_arg,
-                                  recv_elem_counts_buf_arg,
-                                  recv_elem_offsets_buf_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  right_output_buf_arg,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  right_income_data_flag_arg,
-                                  right_ready_to_recv_flag_arg>;
+    using base::base;
 };
 
-template <class kernel_params>
-struct ring_allgatherv_scale_out_cpu_gw_kernel
-        : public execution_kernel<
-              ring_allgatherv_scale_out_cpu_gw_kernel<kernel_params>,
-              arg<main_kernel_args::args_start_index, size_t>, // elems_count
-              arg<main_kernel_args::args_start_index + 1, size_t*>, // recv_elem_counts_buf
-              arg<main_kernel_args::args_start_index + 2, size_t*>, // recv_elem_offsets_buf
-              arg<main_kernel_args::args_start_index + 3,
-                  typename kernel_params::native_type*>, // send_buf
-              arg<main_kernel_args::args_start_index + 4,
-                  typename kernel_params::native_type*>, // recv_buf
-              thread_safe_arg<main_kernel_args::args_start_index + 5,
-                              typename kernel_params::native_type*>, // right_output_buffer
-              thread_safe_arg<main_kernel_args::args_start_index + 6,
-                              int*>, // left_wrote_to_me_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 7,
-                              int*>, // i_ready_to_receive_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 8, int*>, // i_send_to_right_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 9,
-                              int*>> // right_ready_to_recv_flag>
+struct scale_out_cpu_gw_kernel
+        : public execution_kernel<scale_out_cpu_gw_kernel,
+                                  send_buf_size_arg, // elems_count
+                                  recv_elem_counts_buf_arg, // recv_elem_counts_buf
+                                  recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
+                                  send_buf_arg<void>, // send_buf
+                                  recv_buf_arg<void>, // recv_buf (output_buffer)
+                                  right_output_buf_arg<void>, // right_output_buffer
+                                  income_data_flag_arg, // left_wrote_to_me_flag
+                                  ready_to_recv_flag_arg, // i_ready_to_receive_flag
+                                  right_income_data_flag_arg, // i_send_to_right_flag
+                                  right_ready_to_recv_flag_arg> // right_ready_to_recv_flag
 {
-    using param_t = kernel_params;
-    using processing_type = typename param_t::native_type;
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "allgatherv_execution_scale_out_cpu_gw";
     }
 
-    // elems_count
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
     using common_entry_buf_size_arg = send_buf_size_arg;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    // recv_elem_counts_buf
-    using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>;
-    using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type;
-
-    // recv_elem_offsets_buf
-    using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>;
-    using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type;
-
-    // send_buf
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    // recv_buf
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 4, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    // right_output_buffer
-    using right_output_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 5, processing_type*>;
-    using right_output_buf_arg_type = typename right_output_buf_arg::arg_type;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
 
-    // left_wrote_to_me_flag
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 6, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    // i_ready_to_receive_flag
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 7, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    // i_send_to_right_flag
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    // right_ready_to_recv_flag
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
-
-    using base = execution_kernel<ring_allgatherv_scale_out_cpu_gw_kernel<param_t>,
+    using base = execution_kernel<scale_out_cpu_gw_kernel,
                                   send_buf_size_arg,
                                   recv_elem_counts_buf_arg,
                                   recv_elem_offsets_buf_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  right_output_buf_arg,
+                                  send_buf_arg<processing_type>,
+                                  recv_buf_arg<processing_type>,
+                                  right_output_buf_arg<processing_type>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg>;
+
+    template <class ctx_params_t>
+    void bind_data(const ctx_params_t& out_ctx_params) {
+        // TODO not implemented
+        (void)out_ctx_params;
+        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
+    }
+
+    using base::base;
 };
 
+} // namespace allgatherv
+} // namespace ring
 } // namespace native
diff --git a/src/common/comm/l0/modules/ring/allreduce_entry_module.hpp b/src/common/comm/l0/modules/ring/allreduce_entry_module.hpp
index 62bf7e2fd..9eff3d8e5 100644
--- a/src/common/comm/l0/modules/ring/allreduce_entry_module.hpp
+++ b/src/common/comm/l0/modules/ring/allreduce_entry_module.hpp
@@ -23,21 +23,21 @@ DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module,
                                  real_gpu_typed_module,
                                  ccl_coll_allreduce,
                                  ccl::device_topology_type::ring,
-                                 ring_allreduce_kernel,
-                                 ring_allreduce_numa_kernel,
-                                 ring_allreduce_scale_out_cpu_gw_kernel);
+                                 ring::allreduce::main_kernel,
+                                 ring::allreduce::numa_kernel,
+                                 ring::allreduce::scale_out_cpu_gw_kernel);
 
 DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module,
                                  ipc_gpu_typed_module,
                                  ccl_coll_allreduce,
                                  ccl::device_topology_type::ring,
-                                 ring_allreduce_ipc,
-                                 ring_allreduce_ipc,
-                                 ring_allreduce_ipc);
+                                 ring::allreduce::ipc_kernel,
+                                 ring::allreduce::ipc_kernel,
+                                 ring::allreduce::ipc_kernel);
 
 DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_allreduce,
                                 ccl::device_topology_type::ring,
-                                ring_allreduce_kernel,
-                                ring_allreduce_numa_kernel,
-                                ring_allreduce_scale_out_cpu_gw_kernel);
+                                ring::allreduce::main_kernel,
+                                ring::allreduce::numa_kernel,
+                                ring::allreduce::scale_out_cpu_gw_kernel);
 } // namespace native
diff --git a/src/common/comm/l0/modules/ring/allreduce_export_functions.hpp b/src/common/comm/l0/modules/ring/allreduce_export_functions.hpp
index 8e8425251..06152f64b 100644
--- a/src/common/comm/l0/modules/ring/allreduce_export_functions.hpp
+++ b/src/common/comm/l0/modules/ring/allreduce_export_functions.hpp
@@ -17,157 +17,115 @@
 #include "common/comm/l0/modules/kernel_functions.hpp"
 
 namespace native {
-template <class kernel_params>
-struct ring_allreduce_kernel
-        : public execution_kernel<
-              ring_allreduce_kernel<kernel_params>,
-              arg<main_kernel_args::args_start_index, size_t>,
-              arg<main_kernel_args::args_start_index + 1, typename kernel_params::native_type*>,
-              arg<main_kernel_args::args_start_index + 2, typename kernel_params::native_type*>,
-              external_arg<main_kernel_args::args_start_index + 3,
-                           typename kernel_params::native_type*>,
-              external_arg<main_kernel_args::args_start_index + 4, int*>,
-              external_arg<main_kernel_args::args_start_index + 5, int*>,
-              arg<main_kernel_args::args_start_index + 6, int*>,
-              thread_exchangable_arg<main_kernel_args::args_start_index + 7,
-                                     typename kernel_params::native_type*>,
-              thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>,
-              thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>> {
-    using param_t = kernel_params;
-    using processing_type = typename kernel_params::native_type;
 
-    static constexpr const char* specific_name() {
-        return "allreduce_execution";
-    }
+namespace ring {
 
-    //own
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
+namespace allreduce {
 
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
+/**
+ * Common args for all kernel types
+ */
 
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
+// own
+using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
+using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
 
-    using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
+template <class native_t>
+using send_buf_arg = arg<main_kernel_args::args_start_index + 1, native_t*>;
 
-    using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 4, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
+template <class native_t>
+using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, native_t*>;
 
-    using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 5, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
+template <class native_t>
+using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 3, native_t*>;
 
-    using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
-    using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
+using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 4, int*>;
+using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
 
-    //right
-    using right_tmp_recv_buf_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
+using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 5, int*>;
+using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
 
-    /*  using right_recv_buf_arg =                  thread_safe_arg<main_kernel_args::args_start_index + 8, void *>;
-    using right_recv_buf_arg_type =             typename right_recv_buf_arg::arg_type;
-*/
-    using right_income_data_flag_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
+using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
+using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
+
+// right
+template <class native_t>
+using right_tmp_recv_buf_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 7, native_t*>;
+
+using right_income_data_flag_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>;
+
+using right_ready_to_recv_flag_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>;
+
+// IMPORTANT: the number and types of arguments must be the same in all classes,
+// excluding the arguments specific to numa/scaleout, etc.
+struct main_kernel : public execution_kernel<main_kernel,
+                                             send_buf_size_arg,
+                                             send_buf_arg<void>,
+                                             recv_buf_arg<void>,
+                                             tmp_recv_buf_arg<void>,
+                                             income_data_flag_arg,
+                                             ready_to_recv_flag_arg,
+                                             local_barrier_flag_arg,
+                                             right_tmp_recv_buf_arg<void>,
+                                             right_income_data_flag_arg,
+                                             right_ready_to_recv_flag_arg> {
+    using processing_type = void;
+
+    static constexpr const char* specific_name() {
+        return "allreduce_execution";
+    }
 
-    using right_ready_to_recv_flag_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
+    using common_entry_buf_size_arg = send_buf_size_arg;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
 
-    using base = execution_kernel<ring_allreduce_kernel<kernel_params>,
+    using base = execution_kernel<main_kernel,
                                   send_buf_size_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  tmp_recv_buf_arg,
+                                  send_buf_arg<void>,
+                                  recv_buf_arg<void>,
+                                  tmp_recv_buf_arg<void>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg,
+                                  right_tmp_recv_buf_arg<void>,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg>;
+
+    using base::base;
 };
 
-template <class kernel_params>
-struct ring_allreduce_numa_kernel
-        : public execution_kernel<
-              ring_allreduce_numa_kernel<kernel_params>,
-              arg<main_kernel_args::args_start_index, size_t>,
-              arg<main_kernel_args::args_start_index + 1, typename kernel_params::native_type*>,
-              arg<main_kernel_args::args_start_index + 2, typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 3,
-                              typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 4, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 5, int*>,
-              arg<main_kernel_args::args_start_index + 6, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 7,
-                              typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 8, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 9, int*>,
-
-              // numa-specific args
-              permanent_arg<main_kernel_args::args_start_index + 10,
-                            typename kernel_params::native_type*>,
-              permanent_arg<main_kernel_args::args_start_index + 11, uint64_t*>,
-              permanent_arg<main_kernel_args::args_start_index + 12, uint64_t*>,
-              permanent_arg<main_kernel_args::args_start_index + 13,
-                            typename kernel_params::native_type*>,
-              permanent_arg<main_kernel_args::args_start_index + 14, uint64_t*>> {
-    using param_t = kernel_params;
-    using processing_type = typename kernel_params::native_type;
+struct numa_kernel : public execution_kernel<
+                         numa_kernel,
+                         send_buf_size_arg,
+                         send_buf_arg<void>,
+                         recv_buf_arg<void>,
+                         tmp_recv_buf_arg<void>,
+                         income_data_flag_arg,
+                         ready_to_recv_flag_arg,
+                         local_barrier_flag_arg,
+                         right_tmp_recv_buf_arg<void>,
+                         right_income_data_flag_arg,
+                         right_ready_to_recv_flag_arg,
+
+                         // numa-specific args
+                         permanent_arg<main_kernel_args::args_start_index + 10, void*>,
+                         permanent_arg<main_kernel_args::args_start_index + 11, uint64_t*>,
+                         permanent_arg<main_kernel_args::args_start_index + 12, uint64_t*>,
+                         permanent_arg<main_kernel_args::args_start_index + 13, void*>,
+                         permanent_arg<main_kernel_args::args_start_index + 14, uint64_t*>> {
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "allreduce_execution_numa";
     }
 
-    //own
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    using tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 4, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
-    using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
-
-    //right
-    using right_tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
-
-    /*  using right_recv_buf_arg =                  thread_safe_arg<main_kernel_args::args_start_index + 8, void *>;
-    using right_recv_buf_arg_type =             typename right_recv_buf_arg::arg_type;
-*/
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
+    using common_entry_buf_size_arg = send_buf_size_arg;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
 
     // event data
-    using event_prod_chunk_mem_arg =
-        permanent_arg<main_kernel_args::args_start_index + 10, processing_type*>;
+    using event_prod_chunk_mem_arg = permanent_arg<main_kernel_args::args_start_index + 10, void*>;
     using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type;
 
     using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 11, uint64_t*>;
@@ -178,22 +136,22 @@ struct ring_allreduce_numa_kernel
     using event_consumed_bytes_offset_arg_type = typename event_consumed_bytes_offset_arg::arg_type;
 
     using event_consumed_chunk_mem_arg =
-        permanent_arg<main_kernel_args::args_start_index + 13, processing_type*>;
+        permanent_arg<main_kernel_args::args_start_index + 13, void*>;
     using event_consumed_chunk_mem_arg_type = typename event_consumed_chunk_mem_arg::arg_type;
 
     using event_consumed_bytes_arg =
         permanent_arg<main_kernel_args::args_start_index + 14, uint64_t*>;
     using event_consumed_bytes_arg_type = typename event_consumed_bytes_arg::arg_type;
 
-    using base = execution_kernel<ring_allreduce_numa_kernel<kernel_params>,
+    using base = execution_kernel<numa_kernel,
                                   send_buf_size_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  tmp_recv_buf_arg,
+                                  send_buf_arg<void>,
+                                  recv_buf_arg<void>,
+                                  tmp_recv_buf_arg<void>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg,
+                                  right_tmp_recv_buf_arg<void>,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg,
                                   event_prod_chunk_mem_arg,
@@ -201,128 +159,105 @@ struct ring_allreduce_numa_kernel
                                   event_consumed_bytes_offset_arg,
                                   event_consumed_chunk_mem_arg,
                                   event_consumed_bytes_arg>;
+
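+    // bind host-producer and device-consumer chunk memory and byte counters from the context params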
+    template <class ctx_params_t>
+    void bind_data(const ctx_params_t& out_ctx_params) {
+        this->template set_arg<event_prod_chunk_mem_arg>(
+            static_cast<void*>(out_ctx_params.host_mem_producer->get()));
+        this->template set_arg<event_prod_bytes_arg>(
+            out_ctx_params.host_mem_producer_counter->get());
+        this->template set_arg<event_consumed_bytes_offset_arg>(
+            out_ctx_params.producer_aggregated_memory_offset->get());
+        this->template set_arg<event_consumed_chunk_mem_arg>(
+            static_cast<void*>(out_ctx_params.dev_mem_consumer->get()));
+        this->template set_arg<event_consumed_bytes_arg>(
+            out_ctx_params.dev_mem_consumer_counter->get());
+    }
+
+    using base::base;
 };
 
-template <class kernel_params>
-struct ring_allreduce_ipc
-        : public ipc_kernel<ring_allreduce_ipc<kernel_params>,
-                            stub_arg<main_kernel_args::args_start_index>,
-                            stub_arg<main_kernel_args::args_start_index + 1>,
-                            stub_arg<main_kernel_args::args_start_index + 2>,
-                            thread_safe_arg<main_kernel_args::args_start_index + 3,
-                                            typename kernel_params::native_type*>,
-                            thread_safe_arg<main_kernel_args::args_start_index + 4, int*>,
-                            thread_safe_arg<main_kernel_args::args_start_index + 5, int*>,
-                            stub_arg<main_kernel_args::args_start_index + 6>,
-                            stub_arg<main_kernel_args::args_start_index + 7>,
-                            stub_arg<main_kernel_args::args_start_index + 8>,
-                            stub_arg<main_kernel_args::args_start_index + 9>> {
-    using param_t = kernel_params;
-    using processing_type = typename kernel_params::native_type;
+struct ipc_kernel : public base_ipc_kernel<ipc_kernel,
+                                           stub_arg<main_kernel_args::args_start_index>,
+                                           stub_arg<main_kernel_args::args_start_index + 1>,
+                                           stub_arg<main_kernel_args::args_start_index + 2>,
+                                           tmp_recv_buf_arg<void>,
+                                           income_data_flag_arg,
+                                           ready_to_recv_flag_arg,
+                                           stub_arg<main_kernel_args::args_start_index + 6>,
+                                           stub_arg<main_kernel_args::args_start_index + 7>,
+                                           stub_arg<main_kernel_args::args_start_index + 8>,
+                                           stub_arg<main_kernel_args::args_start_index + 9>> {
+    using processing_type = void;
+
+    using common_entry_buf_size_arg = send_buf_size_arg;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
 
     static constexpr const char* specific_name() {
         return "ring_allreduce_ipc";
     }
 
-    using tmp_recv_buf_arg = typename ring_allreduce_kernel<kernel_params>::tmp_recv_buf_arg;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    using income_data_flag_arg =
-        typename ring_allreduce_kernel<kernel_params>::income_data_flag_arg;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    using ready_to_recv_flag_arg =
-        typename ring_allreduce_kernel<kernel_params>::ready_to_recv_flag_arg;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
+    using base = base_ipc_kernel<ipc_kernel,
+                                 stub_arg<main_kernel_args::args_start_index>,
+                                 stub_arg<main_kernel_args::args_start_index + 1>,
+                                 stub_arg<main_kernel_args::args_start_index + 2>,
+                                 tmp_recv_buf_arg<processing_type>,
+                                 income_data_flag_arg,
+                                 ready_to_recv_flag_arg,
+                                 stub_arg<main_kernel_args::args_start_index + 6>,
+                                 stub_arg<main_kernel_args::args_start_index + 7>,
+                                 stub_arg<main_kernel_args::args_start_index + 8>,
+                                 stub_arg<main_kernel_args::args_start_index + 9>>;
+
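+    // attach exchanged IPC handle pointers (tmp recv buffer and sync flags) to the kernel args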
+    template <class ipc_handles_t>
+    void bind_data(const ipc_handles_t& ipc_handles) {
+        auto tmp_recv_buf = reinterpret_cast<typename tmp_recv_buf_arg<processing_type>::arg_type>(
+            ipc_handles.at(0).get().pointer);
+        this->template set_arg<tmp_recv_buf_arg<processing_type>>(tmp_recv_buf);
+
+        auto income_data_flag =
+            reinterpret_cast<income_data_flag_arg_type>(ipc_handles.at(1).get().pointer);
+        this->template set_arg<income_data_flag_arg>(income_data_flag);
+
+        auto ready_to_recv_flag =
+            reinterpret_cast<ready_to_recv_flag_arg_type>(ipc_handles.at(2).get().pointer);
+        this->template set_arg<ready_to_recv_flag_arg>(ready_to_recv_flag);
+    }
 
-    using base = execution_kernel<ring_allreduce_ipc<kernel_params>,
-                                  stub_arg<main_kernel_args::args_start_index>,
-                                  stub_arg<main_kernel_args::args_start_index + 1>,
-                                  stub_arg<main_kernel_args::args_start_index + 2>,
-                                  tmp_recv_buf_arg,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  stub_arg<main_kernel_args::args_start_index + 6>,
-                                  stub_arg<main_kernel_args::args_start_index + 7>,
-                                  stub_arg<main_kernel_args::args_start_index + 8>,
-                                  stub_arg<main_kernel_args::args_start_index + 9>>;
+    using base::base;
 };
 
-template <class kernel_params>
-struct ring_allreduce_scale_out_cpu_gw_kernel
+struct scale_out_cpu_gw_kernel
         : public execution_kernel<
-              ring_allreduce_scale_out_cpu_gw_kernel<kernel_params>,
-              arg<main_kernel_args::args_start_index, size_t>,
-              arg<main_kernel_args::args_start_index + 1, typename kernel_params::native_type*>,
-              arg<main_kernel_args::args_start_index + 2, typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 3,
-                              typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 4, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 5, int*>,
-              arg<main_kernel_args::args_start_index + 6, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 7,
-                              typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 8, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 9, int*>,
+              scale_out_cpu_gw_kernel,
+              send_buf_size_arg,
+              send_buf_arg<void>,
+              recv_buf_arg<void>,
+              tmp_recv_buf_arg<void>,
+              income_data_flag_arg,
+              ready_to_recv_flag_arg,
+              local_barrier_flag_arg,
+              right_tmp_recv_buf_arg<void>,
+              right_income_data_flag_arg,
+              right_ready_to_recv_flag_arg,
 
               // scaleout-specific args
-              permanent_arg<main_kernel_args::args_start_index + 10,
-                            typename kernel_params::native_type*>,
+              permanent_arg<main_kernel_args::args_start_index + 10, void*>,
               permanent_arg<main_kernel_args::args_start_index + 11, uint64_t*>,
               permanent_arg<main_kernel_args::args_start_index + 12, uint64_t*>,
-              permanent_arg<main_kernel_args::args_start_index + 13,
-                            typename kernel_params::native_type*>,
+              permanent_arg<main_kernel_args::args_start_index + 13, void*>,
               permanent_arg<main_kernel_args::args_start_index + 14, uint64_t*>> {
-    using param_t = kernel_params;
-    using processing_type = typename param_t::native_type;
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "allreduce_execution_scale_out_cpu_gw";
     }
 
-    //own
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    using tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 4, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
-    using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
-
-    //right
-    using right_tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
-
-    /*  using right_recv_buf_arg =                  thread_safe_arg<main_kernel_args::args_start_index + 8, void *>;
-    using right_recv_buf_arg_type =             typename right_recv_buf_arg::arg_type;
-*/
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
+    using common_entry_buf_size_arg = send_buf_size_arg;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
 
     // event data
-    using event_prod_chunk_mem_arg =
-        permanent_arg<main_kernel_args::args_start_index + 10, processing_type*>;
+    using event_prod_chunk_mem_arg = permanent_arg<main_kernel_args::args_start_index + 10, void*>;
     using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type;
 
     using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 11, uint64_t*>;
@@ -333,22 +268,22 @@ struct ring_allreduce_scale_out_cpu_gw_kernel
     using event_consumed_bytes_offset_arg_type = typename event_consumed_bytes_offset_arg::arg_type;
 
     using event_consumed_chunk_mem_arg =
-        permanent_arg<main_kernel_args::args_start_index + 13, processing_type*>;
+        permanent_arg<main_kernel_args::args_start_index + 13, void*>;
     using event_consumed_chunk_mem_arg_type = typename event_consumed_chunk_mem_arg::arg_type;
 
     using event_consumed_bytes_arg =
         permanent_arg<main_kernel_args::args_start_index + 14, uint64_t*>;
     using event_consumed_bytes_arg_type = typename event_consumed_bytes_arg::arg_type;
 
-    using base = execution_kernel<ring_allreduce_scale_out_cpu_gw_kernel<kernel_params>,
+    using base = execution_kernel<scale_out_cpu_gw_kernel,
                                   send_buf_size_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  tmp_recv_buf_arg,
+                                  send_buf_arg<processing_type>,
+                                  recv_buf_arg<processing_type>,
+                                  tmp_recv_buf_arg<processing_type>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg,
+                                  right_tmp_recv_buf_arg<processing_type>,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg,
                                   event_prod_chunk_mem_arg,
@@ -356,5 +291,24 @@ struct ring_allreduce_scale_out_cpu_gw_kernel
                                   event_consumed_bytes_offset_arg,
                                   event_consumed_chunk_mem_arg,
                                   event_consumed_bytes_arg>;
+
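+    // bind host-producer and device-consumer chunk memory and byte counters from the context params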
+    template <class ctx_params_t>
+    void bind_data(const ctx_params_t& out_ctx_params) {
+        this->template set_arg<event_prod_chunk_mem_arg>(
+            static_cast<void*>(out_ctx_params.host_mem_producer->get()));
+        this->template set_arg<event_prod_bytes_arg>(
+            out_ctx_params.host_mem_producer_counter->get());
+        this->template set_arg<event_consumed_bytes_offset_arg>(
+            out_ctx_params.producer_aggregated_memory_offset->get());
+        this->template set_arg<event_consumed_chunk_mem_arg>(
+            static_cast<void*>(out_ctx_params.dev_mem_consumer->get()));
+        this->template set_arg<event_consumed_bytes_arg>(
+            out_ctx_params.dev_mem_consumer_counter->get());
+    }
+
+    using base::base;
 };
+
+} // namespace allreduce
+} // namespace ring
 } // namespace native
diff --git a/src/common/comm/l0/modules/ring/alltoallv_entry_module.hpp b/src/common/comm/l0/modules/ring/alltoallv_entry_module.hpp
index a412ed86b..e03917339 100644
--- a/src/common/comm/l0/modules/ring/alltoallv_entry_module.hpp
+++ b/src/common/comm/l0/modules/ring/alltoallv_entry_module.hpp
@@ -23,21 +23,21 @@ DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module,
                                  real_gpu_typed_module,
                                  ccl_coll_alltoallv,
                                  ccl::device_topology_type::ring,
-                                 ring_alltoallv_kernel,
-                                 ring_alltoallv_numa_kernel,
-                                 ring_alltoallv_scale_out_cpu_gw_kernel);
+                                 ring::alltoallv::main_kernel,
+                                 ring::alltoallv::numa_kernel,
+                                 ring::alltoallv::scale_out_cpu_gw_kernel);
 
 DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module,
                                  ipc_gpu_typed_module,
                                  ccl_coll_alltoallv,
                                  ccl::device_topology_type::ring,
-                                 ring_alltoallv_ipc,
-                                 ring_alltoallv_ipc,
-                                 ring_alltoallv_ipc);
+                                 ring::alltoallv::ipc_kernel,
+                                 ring::alltoallv::ipc_kernel,
+                                 ring::alltoallv::ipc_kernel);
 
 DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_alltoallv,
                                 ccl::device_topology_type::ring,
-                                ring_alltoallv_kernel,
-                                ring_alltoallv_numa_kernel,
-                                ring_alltoallv_scale_out_cpu_gw_kernel);
+                                ring::alltoallv::main_kernel,
+                                ring::alltoallv::numa_kernel,
+                                ring::alltoallv::scale_out_cpu_gw_kernel);
 } // namespace native
diff --git a/src/common/comm/l0/modules/ring/alltoallv_export_functions.hpp b/src/common/comm/l0/modules/ring/alltoallv_export_functions.hpp
index 62cb09550..fe71d51a3 100644
--- a/src/common/comm/l0/modules/ring/alltoallv_export_functions.hpp
+++ b/src/common/comm/l0/modules/ring/alltoallv_export_functions.hpp
@@ -17,461 +17,273 @@
 #include "common/comm/l0/modules/kernel_functions.hpp"
 
 namespace native {
-template <class kernel_params>
-struct ring_alltoallv_kernel
-        : public execution_kernel<
-              ring_alltoallv_kernel<kernel_params>,
-              arg<main_kernel_args::args_start_index, size_t*>, // send_elem_counts
-              arg<main_kernel_args::args_start_index + 1, size_t*>, // send_elem_offsets
-              arg<main_kernel_args::args_start_index + 2, size_t*>, // recv_elem_counts_buf
-              arg<main_kernel_args::args_start_index + 3, size_t*>, // recv_elem_offsets_buf
-              arg<main_kernel_args::args_start_index + 4,
-                  typename kernel_params::native_type*>, // send_buf
-              arg<main_kernel_args::args_start_index + 5,
-                  typename kernel_params::native_type*>, // recv_buf
-              external_arg<main_kernel_args::args_start_index + 6,
-                           typename kernel_params::native_type*>, // tmp_buffer
-              thread_exchangable_arg<main_kernel_args::args_start_index + 7,
-                                     typename kernel_params::native_type*>, // right_temp_buffer
-              external_arg<main_kernel_args::args_start_index + 8,
-                           int*>, // left_wrote_to_me_flag
-              external_arg<main_kernel_args::args_start_index + 9,
-                           int*>, // i_ready_to_receive_flag
-              external_arg<main_kernel_args::args_start_index + 10,
-                           int*>, // proxy_size_flag
-              thread_exchangable_arg<main_kernel_args::args_start_index + 11,
-                                     int*>, // i_send_to_right_flag
-              thread_exchangable_arg<main_kernel_args::args_start_index + 12,
-                                     int*>, // right_ready_to_recv_flag
-              thread_exchangable_arg<main_kernel_args::args_start_index + 13,
-                                     int*>> // right_proxy_size_flag
+
+namespace ring {
+
+namespace alltoallv {
+
+/**
+ * Common args for all kernel types
+ */
+
+using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t*>;
+using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
+
+using send_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>;
+using send_elem_offsets_buf_arg_type = typename send_elem_offsets_buf_arg::arg_type;
+
+using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>;
+using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type;
+
+using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 3, size_t*>;
+using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type;
+
+template <class native_t>
+using send_buf_arg = arg<main_kernel_args::args_start_index + 4, native_t*>;
+
+template <class native_t>
+using recv_buf_arg = arg<main_kernel_args::args_start_index + 5, native_t*>;
+
+template <class native_t>
+using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 6, native_t*>;
+
+template <class native_t>
+using right_tmp_recv_buf_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 7, native_t*>;
+
+using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 8, int*>;
+using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
+
+using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 9, int*>;
+using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
+
+using proxy_size_flag_arg = external_arg<main_kernel_args::args_start_index + 10, int*>;
+using proxy_size_flag_arg_type = typename proxy_size_flag_arg::arg_type;
+
+using right_income_data_flag_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 11, int*>;
+
+using right_ready_to_recv_flag_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 12, int*>;
+
+using right_proxy_size_flag_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 13, int*>;
+
+// IMPORTANT: the number and types of arguments must be the same in all classes,
+// excluding the arguments specific to numa/scaleout, etc.
+struct main_kernel
+        : public execution_kernel<main_kernel,
+                                  send_buf_size_arg, // send_elem_counts
+                                  send_elem_offsets_buf_arg, // send_elem_offsets
+                                  recv_elem_counts_buf_arg, // recv_elem_counts_buf
+                                  recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
+                                  send_buf_arg<void>, // send_buf
+                                  recv_buf_arg<void>, // recv_buf
+                                  tmp_recv_buf_arg<void>, // tmp_buffer
+                                  right_tmp_recv_buf_arg<void>, // right_temp_buffer
+                                  income_data_flag_arg, // left_wrote_to_me_flag
+                                  ready_to_recv_flag_arg, // i_ready_to_receive_flag
+                                  proxy_size_flag_arg, // proxy_size_flag
+                                  right_income_data_flag_arg, // i_send_to_right_flag
+                                  right_ready_to_recv_flag_arg, // right_ready_to_recv_flag
+                                  right_proxy_size_flag_arg> // right_proxy_size_flag
 {
-    using processing_type = typename kernel_params::native_type;
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "alltoallv_execution";
     }
 
-    // send_elem_counts
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t*>;
     using common_entry_buf_size_arg = send_buf_size_arg;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    // send_elem_offsets
-    using send_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>;
-    using send_elem_offsets_buf_arg_type = typename send_elem_offsets_buf_arg::arg_type;
-
-    // recv_elem_counts_buf
-    using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>;
-    using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type;
-
-    // recv_elem_offsets_buf
-    using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 3, size_t*>;
-    using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type;
-
-    // send_buf
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 4, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    // recv_buf
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 5, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    // tmp_buffer
-    using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 6, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    // right_temp_buffer
-    using right_tmp_recv_buf_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
-
-    // left_wrote_to_me_flag
-    using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 8, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    // i_ready_to_receive_flag
-    using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 9, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    // proxy_size_flag
-    using proxy_size_flag_arg = external_arg<main_kernel_args::args_start_index + 10, int*>;
-    using proxy_size_flag_arg_type = typename proxy_size_flag_arg::arg_type;
-
-    // i_send_to_right_flag
-    using right_income_data_flag_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 11, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    // right_ready_to_recv_flag
-    using right_ready_to_recv_flag_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 12, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
-
-    // right_proxy_size_flag
-    using right_proxy_size_flag_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 13, int*>;
-    using right_proxy_size_flag_type = typename right_proxy_size_flag_arg::arg_type;
-
-    using base = execution_kernel<ring_alltoallv_kernel<kernel_params>,
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
+
+    using base = execution_kernel<main_kernel,
                                   send_buf_size_arg, // 0 send_elem_counts
                                   send_elem_offsets_buf_arg, // 1 send_elem_offsets
                                   recv_elem_counts_buf_arg, // 2 recv_elem_counts
                                   recv_elem_offsets_buf_arg, // 3 recv_elem_offsets
-                                  send_buf_arg, // 4 send_buf_arg
-                                  recv_buf_arg, // 5 recv_buf_arg
-                                  tmp_recv_buf_arg, // 6 tmp_buffer
-                                  right_tmp_recv_buf_arg, // 7 right_temp_buffer
+                                  send_buf_arg<processing_type>, // 4 send_buf_arg
+                                  recv_buf_arg<processing_type>, // 5 recv_buf_arg
+                                  tmp_recv_buf_arg<processing_type>, // 6 tmp_buffer
+                                  right_tmp_recv_buf_arg<processing_type>, // 7 right_temp_buffer
                                   income_data_flag_arg, // 8 left_wrote_to_me_flag
                                   ready_to_recv_flag_arg, // 9 i_ready_to_receive_flag
                                   proxy_size_flag_arg, // 10 proxy_size_flag_arg
                                   right_income_data_flag_arg, // 11 i_send_to_right_flag
                                   right_ready_to_recv_flag_arg, // 12 right_ready_to_recv_flag
                                   right_proxy_size_flag_arg>; // 13 right_proxy_size_flag
+
+    using base::base;
 };
 
 // IMPORTANT: the params order matches the default one, see *alltoallv*.cl for reference
-template <class kernel_params>
-struct ring_alltoallv_numa_kernel
-        : public execution_kernel<
-              ring_alltoallv_numa_kernel<kernel_params>,
-              arg<main_kernel_args::args_start_index, size_t*>, // send_elem_counts
-              arg<main_kernel_args::args_start_index + 1, size_t*>, // send_elem_offsets
-              arg<main_kernel_args::args_start_index + 2, size_t*>, // recv_elem_counts_buf
-              arg<main_kernel_args::args_start_index + 3, size_t*>, // recv_elem_offsets_buf
-              arg<main_kernel_args::args_start_index + 4,
-                  typename kernel_params::native_type*>, // send_buf
-              arg<main_kernel_args::args_start_index + 5,
-                  typename kernel_params::native_type*>, // recv_buf
-              thread_safe_arg<main_kernel_args::args_start_index + 6,
-                              typename kernel_params::native_type*>, // tmp_buffer
-              thread_safe_arg<main_kernel_args::args_start_index + 7,
-                              typename kernel_params::native_type*>, // right_temp_buffer
-              thread_safe_arg<main_kernel_args::args_start_index + 8,
-                              int*>, // left_wrote_to_me_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 9,
-                              int*>, // i_ready_to_receive_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 10, int*>, // proxy_size_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 11,
-                              int*>, // i_send_to_right_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 12,
-                              int*>, // right_ready_to_recv_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 13,
-                              int*>> // right_proxy_size_flag
+struct numa_kernel
+        : public execution_kernel<numa_kernel,
+                                  send_buf_size_arg, // send_elem_counts
+                                  send_elem_offsets_buf_arg, // send_elem_offsets
+                                  recv_elem_counts_buf_arg, // recv_elem_counts_buf
+                                  recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
+                                  send_buf_arg<void>, // send_buf
+                                  recv_buf_arg<void>, // recv_buf
+                                  tmp_recv_buf_arg<void>, // tmp_buffer
+                                  right_tmp_recv_buf_arg<void>, // right_temp_buffer
+                                  income_data_flag_arg, // left_wrote_to_me_flag
+                                  ready_to_recv_flag_arg, // i_ready_to_receive_flag
+                                  proxy_size_flag_arg, // proxy_size_flag
+                                  right_income_data_flag_arg, // i_send_to_right_flag
+                                  right_ready_to_recv_flag_arg, // right_ready_to_recv_flag
+                                  right_proxy_size_flag_arg> // right_proxy_size_flag
 {
-    using processing_type = typename kernel_params::native_type;
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "alltoallv_execution_numa";
     }
 
-    // send_elem_counts
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t*>;
     using common_entry_buf_size_arg = send_buf_size_arg;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    // send_elem_offsets
-    using send_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>;
-    using send_elem_offsets_buf_arg_type = typename send_elem_offsets_buf_arg::arg_type;
-
-    // recv_elem_counts_buf
-    using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>;
-    using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type;
-
-    // recv_elem_offsets_buf
-    using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 3, size_t*>;
-    using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type;
-
-    // send_buf
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 4, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    // recv_buf
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 5, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    // tmp_buffer
-    using tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 6, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    // right_temp_buffer
-    using right_tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
-
-    // left_wrote_to_me_flag
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    // i_ready_to_receive_flag
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    // proxy_size_flag
-    using proxy_size_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 10, int*>;
-    using proxy_size_flag_arg_type = typename proxy_size_flag_arg::arg_type;
-
-    // i_send_to_right_flag
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 11, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    // right_ready_to_recv_flag
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 12, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
-
-    // right_proxy_size_flag
-    using right_proxy_size_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 13, int*>;
-    using right_proxy_size_flag_type = typename right_proxy_size_flag_arg::arg_type;
-
-    using base = execution_kernel<ring_alltoallv_numa_kernel<kernel_params>,
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
+
+    using base = execution_kernel<numa_kernel,
                                   send_buf_size_arg, // 0 send_elem_counts
                                   send_elem_offsets_buf_arg, // 1 send_elem_offsets
                                   recv_elem_counts_buf_arg, // 2 recv_elem_counts
                                   recv_elem_offsets_buf_arg, // 3 recv_elem_offsets
-                                  send_buf_arg, // 4 send_buf_arg
-                                  recv_buf_arg, // 5 recv_buf_arg
-                                  tmp_recv_buf_arg, // 6 tmp_buffer
-                                  right_tmp_recv_buf_arg, // 7 right_temp_buffer
+                                  send_buf_arg<processing_type>, // 4 send_buf_arg
+                                  recv_buf_arg<processing_type>, // 5 recv_buf_arg
+                                  tmp_recv_buf_arg<processing_type>, // 6 tmp_buffer
+                                  right_tmp_recv_buf_arg<processing_type>, // 7 right_temp_buffer
                                   income_data_flag_arg, // 8 left_wrote_to_me_flag
                                   ready_to_recv_flag_arg, // 9 i_ready_to_receive_flag
                                   proxy_size_flag_arg, // 10 proxy_size_flag_arg
                                   right_income_data_flag_arg, // 11 i_send_to_right_flag
                                   right_ready_to_recv_flag_arg, // 12 right_ready_to_recv_flag
                                   right_proxy_size_flag_arg>; // 13 right_proxy_size_flag
+
+    template <class ctx_params_t>
+    void bind_data(const ctx_params_t& out_ctx_params) {
+        // TODO not implemented
+        (void)out_ctx_params;
+        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
+    }
+
+    using base::base;
 };
 
-template <class kernel_params>
-struct ring_alltoallv_ipc
-        : public ipc_kernel<
-              ring_alltoallv_ipc<kernel_params>,
-              arg<main_kernel_args::args_start_index, size_t*>, // send_elem_counts
-              arg<main_kernel_args::args_start_index + 1, size_t*>, // send_elem_offsets
-              arg<main_kernel_args::args_start_index + 2, size_t*>, // recv_elem_counts_buf
-              arg<main_kernel_args::args_start_index + 3, size_t*>, // recv_elem_offsets_buf
-              arg<main_kernel_args::args_start_index + 4,
-                  typename kernel_params::native_type*>, // send_buf
-              arg<main_kernel_args::args_start_index + 5,
-                  typename kernel_params::native_type*>, // recv_buf
-              thread_safe_arg<main_kernel_args::args_start_index + 6,
-                              typename kernel_params::native_type*>, // tmp_buffer
-              thread_safe_arg<main_kernel_args::args_start_index + 7,
-                              typename kernel_params::native_type*>, // right_temp_buffer
-              thread_safe_arg<main_kernel_args::args_start_index + 8,
-                              int*>, // left_wrote_to_me_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 9,
-                              int*>, // i_ready_to_receive_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 10, int*>, // proxy_size_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 11,
-                              int*>, // i_send_to_right_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 12,
-                              int*>, // right_ready_to_recv_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 13,
-                              int*>> // right_proxy_size_flag
+struct ipc_kernel : public base_ipc_kernel<ipc_kernel,
+                                           send_buf_size_arg, // send_elem_counts
+                                           send_elem_offsets_buf_arg, // send_elem_offsets
+                                           recv_elem_counts_buf_arg, // recv_elem_counts_buf
+                                           recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
+                                           send_buf_arg<void>, // send_buf
+                                           recv_buf_arg<void>, // recv_buf
+                                           tmp_recv_buf_arg<void>, // tmp_buffer
+                                           right_tmp_recv_buf_arg<void>, // right_temp_buffer
+                                           income_data_flag_arg, // left_wrote_to_me_flag
+                                           ready_to_recv_flag_arg, // i_ready_to_receive_flag
+                                           proxy_size_flag_arg, // proxy_size_flag
+                                           right_income_data_flag_arg, // i_send_to_right_flag
+                                           right_ready_to_recv_flag_arg, // right_ready_to_recv_flag
+                                           right_proxy_size_flag_arg> // right_proxy_size_flag
 {
-    using processing_type = typename kernel_params::native_type;
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "ring_alltoallv_ipc";
     }
 
-    // send_elem_counts
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t*>;
     using common_entry_buf_size_arg = send_buf_size_arg;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    // send_elem_offsets
-    using send_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>;
-    using send_elem_offsets_buf_arg_type = typename send_elem_offsets_buf_arg::arg_type;
-
-    // recv_elem_counts_buf
-    using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>;
-    using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type;
-
-    // recv_elem_offsets_buf
-    using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 3, size_t*>;
-    using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type;
-
-    // send_buf
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 4, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    // recv_buf
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 5, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    // tmp_buffer
-    using tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 6, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    // right_temp_buffer
-    using right_tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
-
-    // left_wrote_to_me_flag
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    // i_ready_to_receive_flag
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    // proxy_size_flag
-    using proxy_size_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 10, int*>;
-    using proxy_size_flag_arg_type = typename proxy_size_flag_arg::arg_type;
-
-    // i_send_to_right_flag
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 11, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    // right_ready_to_recv_flag
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 12, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
-
-    // right_proxy_size_flag
-    using right_proxy_size_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 13, int*>;
-    using right_proxy_size_flag_type = typename right_proxy_size_flag_arg::arg_type;
-
-    using base = execution_kernel<ring_alltoallv_ipc<kernel_params>,
-                                  send_buf_size_arg, // 0 send_elem_counts
-                                  send_elem_offsets_buf_arg, // 1 send_elem_offsets
-                                  recv_elem_counts_buf_arg, // 2 recv_elem_counts
-                                  recv_elem_offsets_buf_arg, // 3 recv_elem_offsets
-                                  send_buf_arg, // 4 send_buf_arg
-                                  recv_buf_arg, // 5 recv_buf_arg
-                                  tmp_recv_buf_arg, // 6 tmp_buffer
-                                  right_tmp_recv_buf_arg, // 7 right_temp_buffer
-                                  income_data_flag_arg, // 8 left_wrote_to_me_flag
-                                  ready_to_recv_flag_arg, // 9 i_ready_to_receive_flag
-                                  proxy_size_flag_arg, // 10 proxy_size_flag_arg
-                                  right_income_data_flag_arg, // 11 i_send_to_right_flag
-                                  right_ready_to_recv_flag_arg, // 12 right_ready_to_recv_flag
-                                  right_proxy_size_flag_arg>; // 13 right_proxy_size_flag
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
+
+    using base = base_ipc_kernel<ipc_kernel,
+                                 send_buf_size_arg, // 0 send_elem_counts
+                                 send_elem_offsets_buf_arg, // 1 send_elem_offsets
+                                 recv_elem_counts_buf_arg, // 2 recv_elem_counts
+                                 recv_elem_offsets_buf_arg, // 3 recv_elem_offsets
+                                 send_buf_arg<processing_type>, // 4 send_buf_arg
+                                 recv_buf_arg<processing_type>, // 5 recv_buf_arg
+                                 tmp_recv_buf_arg<processing_type>, // 6 tmp_buffer
+                                 right_tmp_recv_buf_arg<processing_type>, // 7 right_temp_buffer
+                                 income_data_flag_arg, // 8 left_wrote_to_me_flag
+                                 ready_to_recv_flag_arg, // 9 i_ready_to_receive_flag
+                                 proxy_size_flag_arg, // 10 proxy_size_flag_arg
+                                 right_income_data_flag_arg, // 11 i_send_to_right_flag
+                                 right_ready_to_recv_flag_arg, // 12 right_ready_to_recv_flag
+                                 right_proxy_size_flag_arg>; // 13 right_proxy_size_flag
+
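+    // Bind the IPC memory handles imported from the peer process to this
+    // kernel's arguments: handle 0 carries the peer's tmp receive buffer,
+    // handles 1-3 carry the income/ready/proxy synchronization flags.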
+    template <class ipc_handles_t>
+    void bind_data(const ipc_handles_t& ipc_handles) {
+        auto tmp_recv_buf = reinterpret_cast<typename tmp_recv_buf_arg<processing_type>::arg_type>(
+            ipc_handles.at(0).get().pointer);
+        this->template set_arg<tmp_recv_buf_arg<processing_type>>(tmp_recv_buf);
+
+        auto income_data_flag =
+            reinterpret_cast<income_data_flag_arg_type>(ipc_handles.at(1).get().pointer);
+        this->template set_arg<income_data_flag_arg>(income_data_flag);
+
+        auto ready_to_recv_flag =
+            reinterpret_cast<ready_to_recv_flag_arg_type>(ipc_handles.at(2).get().pointer);
+        this->template set_arg<ready_to_recv_flag_arg>(ready_to_recv_flag);
+
+        auto proxy_size_flag =
+            reinterpret_cast<proxy_size_flag_arg_type>(ipc_handles.at(3).get().pointer);
+        this->template set_arg<proxy_size_flag_arg>(proxy_size_flag);
+    }
+
+    using base::base;
 };
 
-template <class kernel_params>
-struct ring_alltoallv_scale_out_cpu_gw_kernel
-        : public execution_kernel<
-              ring_alltoallv_scale_out_cpu_gw_kernel<kernel_params>,
-              arg<main_kernel_args::args_start_index, size_t*>, // send_elem_counts
-              arg<main_kernel_args::args_start_index + 1, size_t*>, // send_elem_offsets
-              arg<main_kernel_args::args_start_index + 2, size_t*>, // recv_elem_counts_buf
-              arg<main_kernel_args::args_start_index + 3, size_t*>, // recv_elem_offsets_buf
-              arg<main_kernel_args::args_start_index + 4,
-                  typename kernel_params::native_type*>, // send_buf
-              arg<main_kernel_args::args_start_index + 5,
-                  typename kernel_params::native_type*>, // recv_buf
-              thread_safe_arg<main_kernel_args::args_start_index + 6,
-                              typename kernel_params::native_type*>, // tmp_buffer
-              thread_safe_arg<main_kernel_args::args_start_index + 7,
-                              typename kernel_params::native_type*>, // right_temp_buffer
-              thread_safe_arg<main_kernel_args::args_start_index + 8,
-                              int*>, // left_wrote_to_me_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 9,
-                              int*>, // i_ready_to_receive_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 10, int*>, // proxy_size_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 11,
-                              int*>, // i_send_to_right_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 12,
-                              int*>, // right_ready_to_recv_flag
-              thread_safe_arg<main_kernel_args::args_start_index + 13,
-                              int*>> // right_proxy_size_flag
+struct scale_out_cpu_gw_kernel
+        : public execution_kernel<scale_out_cpu_gw_kernel,
+                                  send_buf_size_arg, // send_elem_counts
+                                  send_elem_offsets_buf_arg, // send_elem_offsets
+                                  recv_elem_counts_buf_arg, // recv_elem_counts_buf
+                                  recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
+                                  send_buf_arg<void>, // send_buf
+                                  recv_buf_arg<void>, // recv_buf
+                                  tmp_recv_buf_arg<void>, // tmp_buffer
+                                  right_tmp_recv_buf_arg<void>, // right_temp_buffer
+                                  income_data_flag_arg, // left_wrote_to_me_flag
+                                  ready_to_recv_flag_arg, // i_ready_to_receive_flag
+                                  proxy_size_flag_arg, // proxy_size_flag
+                                  right_income_data_flag_arg, // i_send_to_right_flag
+                                  right_ready_to_recv_flag_arg, // right_ready_to_recv_flag
+                                  right_proxy_size_flag_arg> // right_proxy_size_flag
 {
-    using param_t = kernel_params;
-    using processing_type = typename param_t::native_type;
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "alltoallv_execution_scale_out_cpu_gw";
     }
 
-    // send_elem_counts
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t*>;
     using common_entry_buf_size_arg = send_buf_size_arg;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    // send_elem_offsets
-    using send_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>;
-    using send_elem_offsets_buf_arg_type = typename send_elem_offsets_buf_arg::arg_type;
-
-    // recv_elem_counts_buf
-    using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>;
-    using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type;
-
-    // recv_elem_offsets_buf
-    using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 3, size_t*>;
-    using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type;
-
-    // send_buf
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 4, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    // recv_buf
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 5, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    // tmp_buffer
-    using tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 6, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    // right_temp_buffer
-    using right_tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
-
-    // left_wrote_to_me_flag
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    // i_ready_to_receive_flag
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    // proxy_size_flag
-    using proxy_size_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 10, int*>;
-    using proxy_size_flag_arg_type = typename proxy_size_flag_arg::arg_type;
-
-    // i_send_to_right_flag
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 11, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    // right_ready_to_recv_flag
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 12, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
-
-    // right_proxy_size_flag
-    using right_proxy_size_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 13, int*>;
-    using right_proxy_size_flag_type = typename right_proxy_size_flag_arg::arg_type;
-
-    using base = execution_kernel<ring_alltoallv_scale_out_cpu_gw_kernel<kernel_params>,
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
+
+    using base = execution_kernel<scale_out_cpu_gw_kernel,
                                   send_buf_size_arg, // 0 send_elem_counts
                                   send_elem_offsets_buf_arg, // 1 send_elem_offsets
                                   recv_elem_counts_buf_arg, // 2 recv_elem_counts
                                   recv_elem_offsets_buf_arg, // 3 recv_elem_offsets
-                                  send_buf_arg, // 4 send_buf_arg
-                                  recv_buf_arg, // 5 recv_buf_arg
-                                  tmp_recv_buf_arg, // 6 tmp_buffer
-                                  right_tmp_recv_buf_arg, // 7 right_temp_buffer
+                                  send_buf_arg<processing_type>, // 4 send_buf_arg
+                                  recv_buf_arg<processing_type>, // 5 recv_buf_arg
+                                  tmp_recv_buf_arg<processing_type>, // 6 tmp_buffer
+                                  right_tmp_recv_buf_arg<processing_type>, // 7 right_temp_buffer
                                   income_data_flag_arg, // 8 left_wrote_to_me_flag
                                   ready_to_recv_flag_arg, // 9 i_ready_to_receive_flag
                                   proxy_size_flag_arg, // 10 proxy_size_flag_arg
                                   right_income_data_flag_arg, // 11 i_send_to_right_flag
                                   right_ready_to_recv_flag_arg, // 12 right_ready_to_recv_flag
                                   right_proxy_size_flag_arg>; // 13 right_proxy_size_flag
+
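+    // Kept so every kernel type exposes the same bind_data entry point;
+    // scale-out context binding is not implemented yet and always throws.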
+    template <class ctx_params_t>
+    void bind_data(const ctx_params_t& out_ctx_params) {
+        // TODO not implemented
+        (void)out_ctx_params;
+        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
+    }
+
+    using base::base;
 };
+
+} // namespace alltoallv
+} // namespace ring
 } // namespace native
diff --git a/src/common/comm/l0/modules/ring/bcast_entry_module.hpp b/src/common/comm/l0/modules/ring/bcast_entry_module.hpp
index c87003dda..c308d25db 100644
--- a/src/common/comm/l0/modules/ring/bcast_entry_module.hpp
+++ b/src/common/comm/l0/modules/ring/bcast_entry_module.hpp
@@ -23,21 +23,21 @@ DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module,
                                  real_gpu_typed_module,
                                  ccl_coll_bcast,
                                  ccl::device_topology_type::ring,
-                                 ring_bcast_kernel,
-                                 ring_bcast_numa_kernel,
-                                 ring_bcast_scale_out_cpu_gw_kernel);
+                                 ring::bcast::main_kernel,
+                                 ring::bcast::numa_kernel,
+                                 ring::bcast::scale_out_cpu_gw_kernel);
 
 DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module,
                                  ipc_gpu_typed_module,
                                  ccl_coll_bcast,
                                  ccl::device_topology_type::ring,
-                                 ring_bcast_ipc,
-                                 ring_bcast_ipc,
-                                 ring_bcast_ipc);
+                                 ring::bcast::ipc_kernel,
+                                 ring::bcast::ipc_kernel,
+                                 ring::bcast::ipc_kernel);
 
 DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_bcast,
                                 ccl::device_topology_type::ring,
-                                ring_bcast_kernel,
-                                ring_bcast_numa_kernel,
-                                ring_bcast_scale_out_cpu_gw_kernel);
+                                ring::bcast::main_kernel,
+                                ring::bcast::numa_kernel,
+                                ring::bcast::scale_out_cpu_gw_kernel);
 } // namespace native
diff --git a/src/common/comm/l0/modules/ring/bcast_export_functions.hpp b/src/common/comm/l0/modules/ring/bcast_export_functions.hpp
index a2f157c86..1d9a610ac 100644
--- a/src/common/comm/l0/modules/ring/bcast_export_functions.hpp
+++ b/src/common/comm/l0/modules/ring/bcast_export_functions.hpp
@@ -17,263 +17,233 @@
 #include "common/comm/l0/modules/kernel_functions.hpp"
 
 namespace native {
-template <class kernel_params>
-struct ring_bcast_kernel : public execution_kernel<
-                               ring_bcast_kernel<kernel_params>,
-                               arg<main_kernel_args::args_start_index, size_t>,
-                               thread_exchangable_arg<main_kernel_args::args_start_index + 1,
-                                                      typename kernel_params::native_type*>,
-                               external_arg<main_kernel_args::args_start_index + 2, int*>,
-                               external_arg<main_kernel_args::args_start_index + 3, int*>,
-                               arg<main_kernel_args::args_start_index + 4, int*>,
-                               thread_exchangable_arg<main_kernel_args::args_start_index + 5,
-                                                      typename kernel_params::native_type*>,
-                               thread_exchangable_arg<main_kernel_args::args_start_index + 6, int*>,
-                               thread_exchangable_arg<main_kernel_args::args_start_index + 7, int*>,
-                               arg<main_kernel_args::args_start_index + 8, size_t>> {
-    using processing_type = typename kernel_params::native_type;
 
-    static constexpr const char* specific_name() {
-        return "bcast_execution";
-    }
+namespace ring {
 
-    //own
-    using buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-    using common_entry_buf_size_arg = buf_size_arg;
-    using buf_size_arg_type = typename buf_size_arg::arg_type;
+namespace bcast {
+
+/**
+ * Common args for all kernel types
+ */
+
+using buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
+using buf_size_arg_type = typename buf_size_arg::arg_type;
+
+template <class native_t>
+using buf_arg = thread_exchangable_arg<main_kernel_args::args_start_index + 1, native_t*>;
 
-    using buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>;
-    using common_entry_buf_arg = buf_arg;
-    using buf_arg_type = typename buf_arg::arg_type;
+using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 2, int*>;
+using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
 
-    using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 2, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
+using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 3, int*>;
+using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
 
-    using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 3, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
+using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 4, int*>;
+using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
 
-    using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 4, int*>;
-    using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
+template <class native_t>
+using right_buf_arg = thread_exchangable_arg<main_kernel_args::args_start_index + 5, native_t*>;
 
-    //right
-    using right_buf_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 5, processing_type*>;
-    using right_buf_arg_type = typename right_buf_arg::arg_type;
+using right_income_data_flag_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 6, int*>;
+using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
 
-    using right_income_data_flag_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 6, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
+using right_ready_to_recv_flag_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 7, int*>;
+using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
 
-    using right_ready_to_recv_flag_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 7, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
+using root_arg = arg<main_kernel_args::args_start_index + 8, size_t>;
+using root_arg_type = typename root_arg::arg_type;
 
-    using root_arg = arg<main_kernel_args::args_start_index + 8, size_t>;
-    using root_arg_type = typename root_arg::arg_type;
+// IMPORTANT: the number and types of arguments must be the same in all classes,
+// excluding arguments specific to numa/scaleout kernels.
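+// Buffer arguments are type-erased to void* (processing_type = void),
+// which lets one host-side kernel class cover all element types.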
+struct main_kernel : public execution_kernel<main_kernel,
+                                             buf_size_arg,
+                                             buf_arg<void>,
+                                             income_data_flag_arg,
+                                             ready_to_recv_flag_arg,
+                                             local_barrier_flag_arg,
+                                             right_buf_arg<void>,
+                                             right_income_data_flag_arg,
+                                             right_ready_to_recv_flag_arg,
+                                             root_arg> {
+    using processing_type = void;
 
-    using base = execution_kernel<ring_bcast_kernel<kernel_params>,
+    static constexpr const char* specific_name() {
+        return "bcast_execution";
+    }
+
+    using common_entry_buf_size_arg = buf_size_arg;
+    using common_entry_buf_arg = buf_arg<processing_type>;
+
+    using base = execution_kernel<main_kernel,
                                   buf_size_arg,
-                                  buf_arg,
+                                  buf_arg<processing_type>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   local_barrier_flag_arg,
-                                  right_buf_arg,
+                                  right_buf_arg<processing_type>,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg,
                                   root_arg>;
+
+    using base::base;
 };
 
-template <class kernel_params>
-struct ring_bcast_numa_kernel
-        : public execution_kernel<ring_bcast_numa_kernel<kernel_params>,
-                                  arg<main_kernel_args::args_start_index, size_t>,
-                                  thread_safe_arg<main_kernel_args::args_start_index + 1,
-                                                  typename kernel_params::native_type*>,
-                                  thread_safe_arg<main_kernel_args::args_start_index + 2, int*>,
-                                  thread_safe_arg<main_kernel_args::args_start_index + 3, int*>,
-                                  arg<main_kernel_args::args_start_index + 4, int*>,
-                                  thread_safe_arg<main_kernel_args::args_start_index + 5,
-                                                  typename kernel_params::native_type*>,
-                                  thread_safe_arg<main_kernel_args::args_start_index + 6, int*>,
-                                  thread_safe_arg<main_kernel_args::args_start_index + 7, int*>,
-                                  arg<main_kernel_args::args_start_index + 8, size_t>,
-
-                                  thread_safe_arg<main_kernel_args::args_start_index + 9,
-                                                  typename kernel_params::native_type*>,
-                                  thread_safe_arg<main_kernel_args::args_start_index + 10, int*>> {
-    using processing_type = typename kernel_params::native_type;
+struct numa_kernel
+        : public execution_kernel<numa_kernel,
+                                  buf_size_arg,
+                                  buf_arg<void>,
+                                  income_data_flag_arg,
+                                  ready_to_recv_flag_arg,
+                                  local_barrier_flag_arg,
+                                  right_buf_arg<void>,
+                                  right_income_data_flag_arg,
+                                  right_ready_to_recv_flag_arg,
+                                  root_arg,
+
+                                  // numa-specific args
+                                  permanent_arg<main_kernel_args::args_start_index + 9, void*>,
+                                  permanent_arg<main_kernel_args::args_start_index + 10, int*>> {
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "bcast_execution_numa";
     }
 
-    //own
-    using buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-    using buf_size_arg_type = typename buf_size_arg::arg_type;
-
-    using buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>;
-    using buf_arg_type = typename buf_arg::arg_type;
-
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 2, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 3, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 4, int*>;
-    using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
-
-    //right
-    using right_buf_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, processing_type*>;
-    using right_buf_arg_type = typename right_buf_arg::arg_type;
-
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 6, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 7, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
-
-    using root_arg = arg<main_kernel_args::args_start_index + 8, size_t>;
-    using root_arg_type = typename root_arg::arg_type;
+    using common_entry_buf_arg = buf_arg<processing_type>;
 
     // event data
-    using event_prod_chunk_mem_arg = thread_safe_arg<main_kernel_args::args_start_index + 9,
-                                                     typename kernel_params::native_type*>;
+    using event_prod_chunk_mem_arg = permanent_arg<main_kernel_args::args_start_index + 9, void*>;
     using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type;
 
-    using event_prod_bytes_arg = thread_safe_arg<main_kernel_args::args_start_index + 10, int*>;
+    using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 10, int*>;
     using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type;
 
-    using base = execution_kernel<ring_bcast_numa_kernel<kernel_params>,
+    using base = execution_kernel<numa_kernel,
                                   buf_size_arg,
-                                  buf_arg,
+                                  buf_arg<processing_type>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   local_barrier_flag_arg,
-                                  right_buf_arg,
+                                  right_buf_arg<processing_type>,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg,
                                   root_arg,
                                   event_prod_chunk_mem_arg,
                                   event_prod_bytes_arg>;
+
+    template <class ctx_params_t>
+    void bind_data(const ctx_params_t& out_ctx_params) {
+        // TODO not implemented
+        (void)out_ctx_params;
+        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
+    }
+
+    using base::base;
 };
 
-template <class kernel_params>
-struct ring_bcast_ipc
-        : public ipc_kernel<ring_bcast_ipc<kernel_params>,
-                            stub_arg<main_kernel_args::args_start_index>,
-                            thread_safe_arg<main_kernel_args::args_start_index + 1,
-                                            typename kernel_params::native_type*>,
-                            thread_safe_arg<main_kernel_args::args_start_index + 2, int*>,
-                            thread_safe_arg<main_kernel_args::args_start_index + 3, int*>,
-                            stub_arg<main_kernel_args::args_start_index + 4>,
-                            stub_arg<main_kernel_args::args_start_index + 5>,
-                            stub_arg<main_kernel_args::args_start_index + 6>,
-                            stub_arg<main_kernel_args::args_start_index + 7>> {
-    using processing_type = typename kernel_params::native_type;
+struct ipc_kernel : public base_ipc_kernel<ipc_kernel,
+                                           stub_arg<main_kernel_args::args_start_index>,
+                                           buf_arg<void>,
+                                           income_data_flag_arg,
+                                           ready_to_recv_flag_arg,
+                                           stub_arg<main_kernel_args::args_start_index + 4>,
+                                           stub_arg<main_kernel_args::args_start_index + 5>,
+                                           stub_arg<main_kernel_args::args_start_index + 6>,
+                                           stub_arg<main_kernel_args::args_start_index + 7>,
+                                           stub_arg<main_kernel_args::args_start_index + 8>> {
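+    // stub_arg placeholders keep the argument indices aligned with
+    // main_kernel; the IPC side only binds the shared buffer and flags.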
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "ring_bcast_ipc";
     }
 
-    using recv_buf_arg = typename ring_bcast_kernel<kernel_params>::buf_arg;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    using income_data_flag_arg = typename ring_bcast_kernel<kernel_params>::income_data_flag_arg;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
+    using common_entry_buf_arg = buf_arg<processing_type>;
+
+    using base = base_ipc_kernel<ipc_kernel,
+                                 stub_arg<main_kernel_args::args_start_index>,
+                                 buf_arg<processing_type>,
+                                 income_data_flag_arg,
+                                 ready_to_recv_flag_arg,
+                                 stub_arg<main_kernel_args::args_start_index + 4>,
+                                 stub_arg<main_kernel_args::args_start_index + 5>,
+                                 stub_arg<main_kernel_args::args_start_index + 6>,
+                                 stub_arg<main_kernel_args::args_start_index + 7>,
+                                 stub_arg<main_kernel_args::args_start_index + 8>>;
+
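+    // Map the imported IPC handles onto the kernel arguments: handle 0 is
+    // the peer receive buffer, handles 1 and 2 are the income/ready flags.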
+    template <class ipc_handles_t>
+    void bind_data(const ipc_handles_t& ipc_handles) {
+        auto recv_buf = reinterpret_cast<typename buf_arg<processing_type>::arg_type>(
+            ipc_handles.at(0).get().pointer);
+        this->template set_arg<buf_arg<processing_type>>(recv_buf);
+
+        auto income_data_flag =
+            reinterpret_cast<income_data_flag_arg_type>(ipc_handles.at(1).get().pointer);
+        this->template set_arg<income_data_flag_arg>(income_data_flag);
+
+        auto ready_to_recv_flag =
+            reinterpret_cast<ready_to_recv_flag_arg_type>(ipc_handles.at(2).get().pointer);
+        this->template set_arg<ready_to_recv_flag_arg>(ready_to_recv_flag);
+    }
 
-    using ready_to_recv_flag_arg =
-        typename ring_bcast_kernel<kernel_params>::ready_to_recv_flag_arg;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
+    using base::base;
+};
 
-    using base = execution_kernel<ring_bcast_ipc<kernel_params>,
-                                  stub_arg<main_kernel_args::args_start_index>,
-                                  recv_buf_arg,
+struct scale_out_cpu_gw_kernel
+        : public execution_kernel<scale_out_cpu_gw_kernel,
+                                  buf_size_arg,
+                                  buf_arg<void>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
-                                  stub_arg<main_kernel_args::args_start_index + 4>,
-                                  stub_arg<main_kernel_args::args_start_index + 5>,
-                                  stub_arg<main_kernel_args::args_start_index + 6>,
-                                  stub_arg<main_kernel_args::args_start_index + 7>>;
-};
+                                  local_barrier_flag_arg,
+                                  right_buf_arg<void>,
+                                  right_income_data_flag_arg,
+                                  right_ready_to_recv_flag_arg,
+                                  root_arg,
 
-template <class kernel_params>
-struct ring_bcast_scale_out_cpu_gw_kernel
-        : public execution_kernel<ring_bcast_scale_out_cpu_gw_kernel<kernel_params>,
-                                  arg<main_kernel_args::args_start_index, size_t>,
-                                  thread_safe_arg<main_kernel_args::args_start_index + 1,
-                                                  typename kernel_params::native_type*>,
-                                  thread_safe_arg<main_kernel_args::args_start_index + 2, int*>,
-                                  thread_safe_arg<main_kernel_args::args_start_index + 3, int*>,
-                                  arg<main_kernel_args::args_start_index + 4, int*>,
-                                  thread_safe_arg<main_kernel_args::args_start_index + 5,
-                                                  typename kernel_params::native_type*>,
-                                  thread_safe_arg<main_kernel_args::args_start_index + 6, int*>,
-                                  thread_safe_arg<main_kernel_args::args_start_index + 7, int*>,
-                                  arg<main_kernel_args::args_start_index + 8, size_t>,
-
-                                  thread_safe_arg<main_kernel_args::args_start_index + 9,
-                                                  typename kernel_params::native_type*>,
-                                  thread_safe_arg<main_kernel_args::args_start_index + 10, int*>> {
-    using param_t = kernel_params;
-    using processing_type = typename param_t::native_type;
+                                  // scaleout-specific args
+                                  permanent_arg<main_kernel_args::args_start_index + 9, void*>,
+                                  permanent_arg<main_kernel_args::args_start_index + 10, int*>> {
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "bcast_execution_scale_out_cpu_gw";
     }
 
-    //own
-    using buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-    using buf_size_arg_type = typename buf_size_arg::arg_type;
-
-    using buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>;
-    using buf_arg_type = typename buf_arg::arg_type;
-
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 2, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 3, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 4, int*>;
-    using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
-
-    //right
-    using right_buf_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, processing_type*>;
-    using right_buf_arg_type = typename right_buf_arg::arg_type;
-
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 6, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 7, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
-
-    using root_arg = arg<main_kernel_args::args_start_index + 8, size_t>;
-    using root_arg_type = typename root_arg::arg_type;
+    using common_entry_buf_arg = buf_arg<processing_type>;
 
     // event data
     using event_prod_chunk_mem_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 9, processing_type*>;
+        permanent_arg<main_kernel_args::args_start_index + 9, processing_type*>;
     using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type;
 
-    using event_prod_bytes_arg = thread_safe_arg<main_kernel_args::args_start_index + 10, int*>;
+    using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 10, int*>;
     using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type;
 
-    using base = execution_kernel<ring_bcast_scale_out_cpu_gw_kernel<kernel_params>,
+    using base = execution_kernel<scale_out_cpu_gw_kernel,
                                   buf_size_arg,
-                                  buf_arg,
+                                  buf_arg<processing_type>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   local_barrier_flag_arg,
-                                  right_buf_arg,
+                                  right_buf_arg<processing_type>,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg,
                                   root_arg,
                                   event_prod_chunk_mem_arg,
                                   event_prod_bytes_arg>;
+
+    template <class ctx_params_t>
+    void bind_data(const ctx_params_t& out_ctx_params) {
+        // TODO not implemented
+        (void)out_ctx_params;
+        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
+    }
+
+    using base::base;
 };
+
+} // namespace bcast
+} // namespace ring
 } // namespace native
diff --git a/src/common/comm/l0/modules/ring/reduce_entry_module.hpp b/src/common/comm/l0/modules/ring/reduce_entry_module.hpp
index c9a813038..9f3241e33 100644
--- a/src/common/comm/l0/modules/ring/reduce_entry_module.hpp
+++ b/src/common/comm/l0/modules/ring/reduce_entry_module.hpp
@@ -23,21 +23,21 @@ DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module,
                                  real_gpu_typed_module,
                                  ccl_coll_reduce,
                                  ccl::device_topology_type::ring,
-                                 ring_reduce_kernel,
-                                 ring_reduce_numa_kernel,
-                                 ring_reduce_scale_out_cpu_gw_kernel);
+                                 ring::reduce::main_kernel,
+                                 ring::reduce::numa_kernel,
+                                 ring::reduce::scale_out_cpu_gw_kernel);
 
 DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module,
                                  ipc_gpu_typed_module,
                                  ccl_coll_reduce,
                                  ccl::device_topology_type::ring,
-                                 ring_reduce_ipc,
-                                 ring_reduce_ipc,
-                                 ring_reduce_ipc);
+                                 ring::reduce::ipc_kernel,
+                                 ring::reduce::ipc_kernel,
+                                 ring::reduce::ipc_kernel);
 
 DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_reduce,
                                 ccl::device_topology_type::ring,
-                                ring_reduce_kernel,
-                                ring_reduce_numa_kernel,
-                                ring_reduce_scale_out_cpu_gw_kernel);
+                                ring::reduce::main_kernel,
+                                ring::reduce::numa_kernel,
+                                ring::reduce::scale_out_cpu_gw_kernel);
 } // namespace native
diff --git a/src/common/comm/l0/modules/ring/reduce_export_functions.hpp b/src/common/comm/l0/modules/ring/reduce_export_functions.hpp
index 55fe5a569..a07e27087 100644
--- a/src/common/comm/l0/modules/ring/reduce_export_functions.hpp
+++ b/src/common/comm/l0/modules/ring/reduce_export_functions.hpp
@@ -17,315 +17,256 @@
 #include "common/comm/l0/modules/kernel_functions.hpp"
 
 namespace native {
-template <class kernel_params>
-struct ring_reduce_kernel
-        : public execution_kernel<
-              ring_reduce_kernel<kernel_params>,
-              arg<main_kernel_args::args_start_index, size_t>,
-              arg<main_kernel_args::args_start_index + 1, typename kernel_params::native_type*>,
-              arg<main_kernel_args::args_start_index + 2, typename kernel_params::native_type*>,
-              external_arg<main_kernel_args::args_start_index + 3,
-                           typename kernel_params::native_type*>,
-              external_arg<main_kernel_args::args_start_index + 4, int*>,
-              external_arg<main_kernel_args::args_start_index + 5, int*>,
-              arg<main_kernel_args::args_start_index + 6, int*>,
-              thread_exchangable_arg<main_kernel_args::args_start_index + 7,
-                                     typename kernel_params::native_type*>,
-              thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>,
-              thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>,
-              arg<main_kernel_args::args_start_index + 10, size_t>> {
-    using param_t = kernel_params;
-    using processing_type = typename kernel_params::native_type;
 
-    static constexpr const char* specific_name() {
-        return "reduce_execution";
-    }
+namespace ring {
 
-    //own
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
+namespace reduce {
 
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
+/**
+ * Common args for all kernel types
+ */
 
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
+using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
+using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
 
-    using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
+template <class native_t>
+using send_buf_arg = arg<main_kernel_args::args_start_index + 1, native_t*>;
 
-    using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 4, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
+template <class native_t>
+using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, native_t*>;
 
-    using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 5, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
+template <class native_t>
+using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 3, native_t*>;
 
-    using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
-    using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
+using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 4, int*>;
+using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
 
-    //right
-    using right_tmp_recv_buf_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
+using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 5, int*>;
+using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
 
-    /*  using right_recv_buf_arg =                  thread_safe_arg<main_kernel_args::args_start_index + 8, void *>;
-    using right_recv_buf_arg_type =             typename right_recv_buf_arg::arg_type;
-*/
-    using right_income_data_flag_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
+using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
+using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
+
+template <class native_t>
+using right_tmp_recv_buf_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 7, native_t*>;
+
+using right_income_data_flag_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>;
+using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
+
+using right_ready_to_recv_flag_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>;
+using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
 
-    using right_ready_to_recv_flag_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
+using root_arg = arg<main_kernel_args::args_start_index + 10, size_t>;
+using root_arg_type = typename root_arg::arg_type;
 
-    using root_arg = arg<main_kernel_args::args_start_index + 10, size_t>;
-    using root_arg_type = typename root_arg::arg_type;
+// IMPORTANT: the number and types of arguments must be the same in all classes,
+// excluding arguments specific to numa/scaleout kernels.
+struct main_kernel : public execution_kernel<main_kernel,
+                                             send_buf_size_arg,
+                                             send_buf_arg<void>,
+                                             recv_buf_arg<void>,
+                                             tmp_recv_buf_arg<void>,
+                                             income_data_flag_arg,
+                                             ready_to_recv_flag_arg,
+                                             local_barrier_flag_arg,
+                                             right_tmp_recv_buf_arg<void>,
+                                             right_income_data_flag_arg,
+                                             right_ready_to_recv_flag_arg,
+                                             root_arg> {
+    using processing_type = void;
 
-    using base = execution_kernel<ring_reduce_kernel<kernel_params>,
+    static constexpr const char* specific_name() {
+        return "reduce_execution";
+    }
+
+    using common_entry_buf_size_arg = send_buf_size_arg;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
+
+    using base = execution_kernel<main_kernel,
                                   send_buf_size_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  tmp_recv_buf_arg,
+                                  send_buf_arg<processing_type>,
+                                  recv_buf_arg<processing_type>,
+                                  tmp_recv_buf_arg<processing_type>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg,
+                                  right_tmp_recv_buf_arg<processing_type>,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg,
                                   root_arg>;
+
+    using base::base;
 };
 
-template <class kernel_params>
-struct ring_reduce_numa_kernel
-        : public execution_kernel<
-              ring_reduce_numa_kernel<kernel_params>,
-              arg<main_kernel_args::args_start_index, size_t>,
-              arg<main_kernel_args::args_start_index + 1, typename kernel_params::native_type*>,
-              arg<main_kernel_args::args_start_index + 2, typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 3,
-                              typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 4, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 5, int*>,
-              arg<main_kernel_args::args_start_index + 6, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 7,
-                              typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 8, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 9, int*>,
-              arg<main_kernel_args::args_start_index + 10, size_t>,
-              thread_safe_arg<main_kernel_args::args_start_index + 11,
-                              typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 12, int*>> {
-    using param_t = kernel_params;
-    using processing_type = typename kernel_params::native_type;
+struct numa_kernel
+        : public execution_kernel<numa_kernel,
+                                  send_buf_size_arg,
+                                  send_buf_arg<void>,
+                                  recv_buf_arg<void>,
+                                  tmp_recv_buf_arg<void>,
+                                  income_data_flag_arg,
+                                  ready_to_recv_flag_arg,
+                                  local_barrier_flag_arg,
+                                  right_tmp_recv_buf_arg<void>,
+                                  right_income_data_flag_arg,
+                                  right_ready_to_recv_flag_arg,
+                                  root_arg,
+
+                                  // numa-specific args
+                                  permanent_arg<main_kernel_args::args_start_index + 11, void*>,
+                                  permanent_arg<main_kernel_args::args_start_index + 12, int*>> {
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "reduce_execution_numa";
     }
 
-    //own
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    using tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 4, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
-    using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
-
-    //right
-    using right_tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
-
-    /*  using right_recv_buf_arg =                  thread_safe_arg<main_kernel_args::args_start_index + 8, void *>;
-    using right_recv_buf_arg_type =             typename right_recv_buf_arg::arg_type;
-*/
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
-
-    using root_arg = arg<main_kernel_args::args_start_index + 10, size_t>;
-    using root_arg_type = typename root_arg::arg_type;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
 
     // event data
-    using event_prod_chunk_mem_arg = thread_safe_arg<main_kernel_args::args_start_index + 10,
-                                                     typename kernel_params::native_type*>;
+    using event_prod_chunk_mem_arg = permanent_arg<main_kernel_args::args_start_index + 11, void*>;
     using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type;
 
-    using event_prod_bytes_arg = thread_safe_arg<main_kernel_args::args_start_index + 11, int*>;
+    using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 12, int*>;
     using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type;
 
-    using base = execution_kernel<ring_reduce_numa_kernel<kernel_params>,
+    using base = execution_kernel<numa_kernel,
                                   send_buf_size_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  tmp_recv_buf_arg,
+                                  send_buf_arg<processing_type>,
+                                  recv_buf_arg<processing_type>,
+                                  tmp_recv_buf_arg<processing_type>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg,
+                                  right_tmp_recv_buf_arg<processing_type>,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg,
                                   root_arg,
                                   event_prod_chunk_mem_arg,
                                   event_prod_bytes_arg>;
+
+    template <class ctx_params_t>
+    void bind_data(const ctx_params_t& out_ctx_params) {
+        // TODO not implemented
+        (void)out_ctx_params;
+        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
+    }
+
+    using base::base;
 };
 
-template <class kernel_params>
-struct ring_reduce_ipc
-        : public ipc_kernel<ring_reduce_ipc<kernel_params>,
-                            stub_arg<main_kernel_args::args_start_index>,
-                            stub_arg<main_kernel_args::args_start_index + 1>,
-                            stub_arg<main_kernel_args::args_start_index + 2>,
-                            thread_safe_arg<main_kernel_args::args_start_index + 3,
-                                            typename kernel_params::native_type*>,
-                            thread_safe_arg<main_kernel_args::args_start_index + 4, int*>,
-                            thread_safe_arg<main_kernel_args::args_start_index + 5, int*>,
-                            stub_arg<main_kernel_args::args_start_index + 6>,
-                            stub_arg<main_kernel_args::args_start_index + 7>,
-                            stub_arg<main_kernel_args::args_start_index + 8>,
-                            stub_arg<main_kernel_args::args_start_index + 9>> {
-    using param_t = kernel_params;
-    using processing_type = typename kernel_params::native_type;
+struct ipc_kernel : public base_ipc_kernel<ipc_kernel,
+                                           stub_arg<main_kernel_args::args_start_index>,
+                                           stub_arg<main_kernel_args::args_start_index + 1>,
+                                           stub_arg<main_kernel_args::args_start_index + 2>,
+                                           tmp_recv_buf_arg<void>,
+                                           income_data_flag_arg,
+                                           ready_to_recv_flag_arg,
+                                           stub_arg<main_kernel_args::args_start_index + 6>,
+                                           stub_arg<main_kernel_args::args_start_index + 7>,
+                                           stub_arg<main_kernel_args::args_start_index + 8>,
+                                           stub_arg<main_kernel_args::args_start_index + 9>,
+                                           stub_arg<main_kernel_args::args_start_index + 10>> {
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "ring_reduce_ipc";
     }
 
-    using tmp_recv_buf_arg = typename ring_reduce_kernel<kernel_params>::tmp_recv_buf_arg;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    using income_data_flag_arg = typename ring_reduce_kernel<kernel_params>::income_data_flag_arg;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
+
+    using base = base_ipc_kernel<ipc_kernel,
+                                 stub_arg<main_kernel_args::args_start_index>,
+                                 stub_arg<main_kernel_args::args_start_index + 1>,
+                                 stub_arg<main_kernel_args::args_start_index + 2>,
+                                 tmp_recv_buf_arg<processing_type>,
+                                 income_data_flag_arg,
+                                 ready_to_recv_flag_arg,
+                                 stub_arg<main_kernel_args::args_start_index + 6>,
+                                 stub_arg<main_kernel_args::args_start_index + 7>,
+                                 stub_arg<main_kernel_args::args_start_index + 8>,
+                                 stub_arg<main_kernel_args::args_start_index + 9>,
+                                 stub_arg<main_kernel_args::args_start_index + 10>>;
+
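+    // Same handle layout as the alltoallv IPC kernel: handle 0 is the
+    // peer's tmp receive buffer, handles 1 and 2 are the sync flags.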
+    template <class ipc_handles_t>
+    void bind_data(const ipc_handles_t& ipc_handles) {
+        auto tmp_recv_buf = reinterpret_cast<typename tmp_recv_buf_arg<processing_type>::arg_type>(
+            ipc_handles.at(0).get().pointer);
+        this->template set_arg<tmp_recv_buf_arg<processing_type>>(tmp_recv_buf);
+
+        auto income_data_flag =
+            reinterpret_cast<income_data_flag_arg_type>(ipc_handles.at(1).get().pointer);
+        this->template set_arg<income_data_flag_arg>(income_data_flag);
+
+        auto ready_to_recv_flag =
+            reinterpret_cast<ready_to_recv_flag_arg_type>(ipc_handles.at(2).get().pointer);
+        this->template set_arg<ready_to_recv_flag_arg>(ready_to_recv_flag);
+    }
 
-    using ready_to_recv_flag_arg =
-        typename ring_reduce_kernel<kernel_params>::ready_to_recv_flag_arg;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
+    using base::base;
+};
 
-    using base = execution_kernel<ring_reduce_ipc<kernel_params>,
-                                  stub_arg<main_kernel_args::args_start_index>,
-                                  stub_arg<main_kernel_args::args_start_index + 1>,
-                                  stub_arg<main_kernel_args::args_start_index + 2>,
-                                  tmp_recv_buf_arg,
+struct scale_out_cpu_gw_kernel
+        : public execution_kernel<scale_out_cpu_gw_kernel,
+                                  send_buf_size_arg,
+                                  send_buf_arg<void>,
+                                  recv_buf_arg<void>,
+                                  tmp_recv_buf_arg<void>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
-                                  stub_arg<main_kernel_args::args_start_index + 6>,
-                                  stub_arg<main_kernel_args::args_start_index + 7>,
-                                  stub_arg<main_kernel_args::args_start_index + 8>,
-                                  stub_arg<main_kernel_args::args_start_index + 9>>;
-};
+                                  local_barrier_flag_arg,
+                                  right_tmp_recv_buf_arg<void>,
+                                  right_income_data_flag_arg,
+                                  right_ready_to_recv_flag_arg,
+                                  root_arg,
 
-template <class kernel_params>
-struct ring_reduce_scale_out_cpu_gw_kernel
-        : public execution_kernel<
-              ring_reduce_scale_out_cpu_gw_kernel<kernel_params>,
-              arg<main_kernel_args::args_start_index, size_t>,
-              arg<main_kernel_args::args_start_index + 1, typename kernel_params::native_type*>,
-              arg<main_kernel_args::args_start_index + 2, typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 3,
-                              typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 4, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 5, int*>,
-              arg<main_kernel_args::args_start_index + 6, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 7,
-                              typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 8, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 9, int*>,
-              arg<main_kernel_args::args_start_index + 10, size_t>,
-              thread_safe_arg<main_kernel_args::args_start_index + 11,
-                              typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 12, int*>> {
-    using param_t = kernel_params;
-    using processing_type = typename param_t::native_type;
+                                  // scaleout-specific args
+                                  permanent_arg<main_kernel_args::args_start_index + 11, void*>,
+                                  permanent_arg<main_kernel_args::args_start_index + 12, int*>> {
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "reduce_execution_scale_out_cpu_gw";
     }
 
-    //own
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    using tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 4, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
-    using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
-
-    //right
-    using right_tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
-
-    /*  using right_recv_buf_arg =                  thread_safe_arg<main_kernel_args::args_start_index + 8, void *>;
-    using right_recv_buf_arg_type =             typename right_recv_buf_arg::arg_type;
-*/
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
-
-    using root_arg = arg<main_kernel_args::args_start_index + 10, size_t>;
-    using root_arg_type = typename root_arg::arg_type;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
 
     // event data
     using event_prod_chunk_mem_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 10, processing_type*>;
+        permanent_arg<main_kernel_args::args_start_index + 11, processing_type*>;
     using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type;
 
-    using event_prod_bytes_arg = thread_safe_arg<main_kernel_args::args_start_index + 11, int*>;
+    using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 12, int*>;
     using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type;
 
-    using base = execution_kernel<ring_reduce_scale_out_cpu_gw_kernel<kernel_params>,
+    using base = execution_kernel<scale_out_cpu_gw_kernel,
                                   send_buf_size_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  tmp_recv_buf_arg,
+                                  send_buf_arg<processing_type>,
+                                  recv_buf_arg<processing_type>,
+                                  tmp_recv_buf_arg<processing_type>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg,
+                                  right_tmp_recv_buf_arg<processing_type>,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg,
                                   root_arg,
                                   event_prod_chunk_mem_arg,
                                   event_prod_bytes_arg>;
+
+    template <class ctx_params_t>
+    void bind_data(const ctx_params_t& out_ctx_params) {
+        // TODO: not implemented yet
+        (void)out_ctx_params;
+        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for this kernel type");
+    }
+
+    using base::base;
 };
 
+} // namespace reduce
+} // namespace ring
 } // namespace native
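
The export headers above are built on a positional-argument scheme: each arg<index, T> alias pins a host-side value type to a numbered kernel slot, the execution_kernel/base_ipc_kernel base takes the full ordered list, and set_arg<A>() addresses a slot through its alias. A minimal, self-contained sketch of that technique (illustrative names only, not the library's actual implementation):

    #include <cstddef>
    #include <iostream>
    #include <tuple>
    #include <type_traits>

    // Illustrative stand-in for the library's arg trait: binds a value type
    // to a fixed kernel-argument slot.
    template <std::size_t pos, class T>
    struct arg {
        static constexpr std::size_t index = pos;
        using arg_type = T;
    };

    // Compile-time position of trait A inside the pack List...
    template <class A, class... List>
    struct pack_index;

    template <class A, class Head, class... Tail>
    struct pack_index<A, Head, Tail...> {
        static constexpr std::size_t value =
            std::is_same<A, Head>::value ? 0 : 1 + pack_index<A, Tail...>::value;
    };

    template <class A>
    struct pack_index<A> {
        static constexpr std::size_t value = 0; // reached when the list is exhausted
    };

    // Sketch of the execution-kernel wrapper: one stored value per declared
    // argument, addressable by its alias, as bind_data() does above.
    template <class Derived, class... Args>
    struct execution_kernel {
        std::tuple<typename Args::arg_type...> values{};

        template <class A>
        void set_arg(typename A::arg_type v) {
            std::get<pack_index<A, Args...>::value>(values) = v;
            // A real implementation would also push the value to the device,
            // e.g. via zeKernelSetArgumentValue(kernel, A::index, sizeof v, &v).
        }

        template <class A>
        typename A::arg_type get_arg() const {
            return std::get<pack_index<A, Args...>::value>(values);
        }
    };

    // Usage mirroring the aliases in the headers above:
    using send_buf_size_arg = arg<0, std::size_t>;
    using send_buf_arg = arg<1, void*>;

    struct my_kernel : execution_kernel<my_kernel, send_buf_size_arg, send_buf_arg> {};

    int main() {
        my_kernel k;
        k.set_arg<send_buf_size_arg>(128);
        std::cout << k.get_arg<send_buf_size_arg>() << std::endl; // prints 128
        return 0;
    }

In this scheme an alias that carries an index but no usable value can simply occupy a slot, which is consistent with how stub_arg<index> is used above to keep slot numbering aligned across the main, numa, scale-out, and ipc kernel flavors.
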
diff --git a/src/common/comm/l0/modules/ring/reduce_scatter_entry_module.hpp b/src/common/comm/l0/modules/ring/reduce_scatter_entry_module.hpp
index 52e98a915..44ae2a55a 100644
--- a/src/common/comm/l0/modules/ring/reduce_scatter_entry_module.hpp
+++ b/src/common/comm/l0/modules/ring/reduce_scatter_entry_module.hpp
@@ -23,21 +23,21 @@ DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module,
                                  real_gpu_typed_module,
                                  ccl_coll_reduce_scatter,
                                  ccl::device_topology_type::ring,
-                                 ring_reduce_scatter_kernel,
-                                 ring_reduce_scatter_numa_kernel,
-                                 ring_reduce_scatter_scale_out_cpu_gw_kernel);
+                                 ring::reduce_scatter::main_kernel,
+                                 ring::reduce_scatter::numa_kernel,
+                                 ring::reduce_scatter::scale_out_cpu_gw_kernel);
 
 DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module,
                                  ipc_gpu_typed_module,
                                  ccl_coll_reduce_scatter,
                                  ccl::device_topology_type::ring,
-                                 ring_reduce_scatter_ipc,
-                                 ring_reduce_scatter_ipc,
-                                 ring_reduce_scatter_ipc);
+                                 ring::reduce_scatter::ipc_kernel,
+                                 ring::reduce_scatter::ipc_kernel,
+                                 ring::reduce_scatter::ipc_kernel);
 
 DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_reduce_scatter,
                                 ccl::device_topology_type::ring,
-                                ring_reduce_scatter_kernel,
-                                ring_reduce_scatter_numa_kernel,
-                                ring_reduce_scatter_scale_out_cpu_gw_kernel);
+                                ring::reduce_scatter::main_kernel,
+                                ring::reduce_scatter::numa_kernel,
+                                ring::reduce_scatter::scale_out_cpu_gw_kernel);
 } // namespace native
diff --git a/src/common/comm/l0/modules/ring/reduce_scatter_export_functions.hpp b/src/common/comm/l0/modules/ring/reduce_scatter_export_functions.hpp
index 5b2561d1c..f1f3789ff 100644
--- a/src/common/comm/l0/modules/ring/reduce_scatter_export_functions.hpp
+++ b/src/common/comm/l0/modules/ring/reduce_scatter_export_functions.hpp
@@ -17,312 +17,262 @@
 #include "common/comm/l0/modules/kernel_functions.hpp"
 
 namespace native {
-template <class kernel_params>
-struct ring_reduce_scatter_kernel
-        : public execution_kernel<
-              ring_reduce_scatter_kernel<kernel_params>,
-              arg<main_kernel_args::args_start_index, size_t>, // recv_count
-              arg<main_kernel_args::args_start_index + 1,
-                  typename kernel_params::native_type*>, // send_buf
-              arg<main_kernel_args::args_start_index + 2,
-                  typename kernel_params::native_type*>, // recv_buf
-              external_arg<main_kernel_args::args_start_index + 3,
-                           typename kernel_params::native_type*>, // tmp_buf
-              external_arg<main_kernel_args::args_start_index + 4, int*>, // left_wrote_to_me_flag
-              external_arg<main_kernel_args::args_start_index + 5, int*>, // i_ready_to_receive_flag
-              arg<main_kernel_args::args_start_index + 6, int*>, // local_barrier_flag
-              thread_exchangable_arg<main_kernel_args::args_start_index + 7,
-                                     typename kernel_params::native_type*>, // right_output_buffer
-              thread_exchangable_arg<main_kernel_args::args_start_index + 8,
-                                     typename kernel_params::native_type*>, // right_temp_buffer
-              thread_exchangable_arg<main_kernel_args::args_start_index + 9,
-                                     int*>, // i_send_to_right_flag
-              thread_exchangable_arg<main_kernel_args::args_start_index + 10,
-                                     int*>> { // right_ready_to_recv_flag
-    using param_t = kernel_params;
-    using processing_type = typename kernel_params::native_type;
 
-    static constexpr const char* specific_name() {
-        return "reduce_scatter_execution";
-    }
+namespace ring {
 
-    //own
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
+namespace reduce_scatter {
+
+/**
+ * Common args for all kernel types
+ */
+
+using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
 
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>;
-    using common_entry_buf_arg = send_buf_arg;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
+// TODO: since only a single type is used, remove the template parameter here
+template <class native_t>
+using send_buf_arg = arg<main_kernel_args::args_start_index + 1, native_t*>;
 
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
+template <class native_t>
+using recv_buf_arg = external_arg<main_kernel_args::args_start_index + 2, native_t*>;
 
-    using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
+template <class native_t>
+using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 3, native_t*>;
 
-    using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 4, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
+using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 4, int*>;
+using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
 
-    using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 5, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
+using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 5, int*>;
+using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
 
-    using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
-    using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
+using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
+using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
 
-    using right_output_buf_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_output_buf_arg_type = typename right_output_buf_arg::arg_type;
+template <class native_t>
+using right_output_buf_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 7, native_t*>;
 
-    //right
-    using right_tmp_recv_buf_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 8, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
+template <class native_t>
+using right_tmp_recv_buf_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 8, native_t*>;
 
-    /*  using right_recv_buf_arg =                  thread_safe_arg<main_kernel_args::args_start_index + 8, void *>;
-    using right_recv_buf_arg_type =             typename right_recv_buf_arg::arg_type;*/
+using right_income_data_flag_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>;
 
-    using right_income_data_flag_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
+using right_ready_to_recv_flag_arg =
+    thread_exchangable_arg<main_kernel_args::args_start_index + 10, int*>;
 
-    using right_ready_to_recv_flag_arg =
-        thread_exchangable_arg<main_kernel_args::args_start_index + 10, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
+// IMPORTANT: the number and types of arguments must be the same in all kernel classes,
+// excluding the arguments specific to numa/scaleout etc.
+struct main_kernel
+        : public execution_kernel<main_kernel,
+                                  send_buf_size_arg, // recv_count
+                                  send_buf_arg<void>, // send_buf
+                                  recv_buf_arg<void>, // recv_buf (output_buffer)
+                                  tmp_recv_buf_arg<void>, // tmp_buf
+                                  income_data_flag_arg, // left_wrote_to_me_flag
+                                  ready_to_recv_flag_arg, // i_ready_to_receive_flag
+                                  local_barrier_flag_arg, // local_barrier_flag
+                                  right_output_buf_arg<void>, // right_output_buffer
+                                  right_tmp_recv_buf_arg<void>, // right_temp_buffer
+                                  right_income_data_flag_arg, // i_send_to_right_flag
+                                  right_ready_to_recv_flag_arg> { // right_ready_to_recv_flag
+    using processing_type = void;
+
+    static constexpr const char* specific_name() {
+        return "reduce_scatter_execution";
+    }
+
+    using common_entry_buf_size_arg = send_buf_size_arg;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
 
-    using base = execution_kernel<ring_reduce_scatter_kernel<kernel_params>,
+    using base = execution_kernel<main_kernel,
                                   send_buf_size_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  tmp_recv_buf_arg,
+                                  send_buf_arg<processing_type>,
+                                  recv_buf_arg<processing_type>,
+                                  tmp_recv_buf_arg<processing_type>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   local_barrier_flag_arg,
-                                  right_output_buf_arg,
-                                  right_tmp_recv_buf_arg,
+                                  right_output_buf_arg<processing_type>,
+                                  right_tmp_recv_buf_arg<processing_type>,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg>;
+
+    using base::base;
 };
 
-template <class kernel_params>
-struct ring_reduce_scatter_numa_kernel
-        : public execution_kernel<
-              ring_reduce_scatter_numa_kernel<kernel_params>,
-              arg<main_kernel_args::args_start_index, size_t>,
-              arg<main_kernel_args::args_start_index + 1, typename kernel_params::native_type*>,
-              arg<main_kernel_args::args_start_index + 2, typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 3,
-                              typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 4, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 5, int*>,
-              arg<main_kernel_args::args_start_index + 6, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 7,
-                              typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 8, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 9, int*>,
-              arg<main_kernel_args::args_start_index + 10, size_t>,
-              thread_safe_arg<main_kernel_args::args_start_index + 11,
-                              typename kernel_params::native_type*>> {
-    using param_t = kernel_params;
-    using processing_type = typename kernel_params::native_type;
+struct numa_kernel
+        : public execution_kernel<numa_kernel,
+                                  send_buf_size_arg, // recv_count
+                                  send_buf_arg<void>, // send_buf
+                                  recv_buf_arg<void>, // recv_buf (output_buffer)
+                                  tmp_recv_buf_arg<void>, // tmp_buf
+                                  income_data_flag_arg, // left_wrote_to_me_flag
+                                  ready_to_recv_flag_arg, // i_ready_to_receive_flag
+                                  local_barrier_flag_arg, // local_barrier_flag
+                                  right_output_buf_arg<void>, // right_output_buffer
+                                  right_tmp_recv_buf_arg<void>, // right_temp_buffer
+                                  right_income_data_flag_arg, // i_send_to_right_flag
+                                  right_ready_to_recv_flag_arg, // right_ready_to_recv_flag
+
+                                  // numa-specific args
+                                  permanent_arg<main_kernel_args::args_start_index + 11, size_t>,
+                                  permanent_arg<main_kernel_args::args_start_index + 12, void*>> {
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "reduce_scatter_execution_numa";
     }
 
-    //own
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    using tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 4, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
-    using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
-
-    //right
-    using right_tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
-
-    /*  using right_recv_buf_arg =                  thread_safe_arg<main_kernel_args::args_start_index + 8, void *>;
-    using right_recv_buf_arg_type =             typename right_recv_buf_arg::arg_type;
-*/
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
+    using common_entry_buf_size_arg = send_buf_size_arg;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
 
     // event data
-    using event_prod_chunk_mem_arg = thread_safe_arg<main_kernel_args::args_start_index + 10,
-                                                     typename kernel_params::native_type*>;
+    using event_prod_chunk_mem_arg = permanent_arg<main_kernel_args::args_start_index + 11, size_t>;
     using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type;
 
-    using event_prod_bytes_arg = thread_safe_arg<main_kernel_args::args_start_index + 11, int*>;
+    using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 12, void*>;
     using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type;
 
-    using base = execution_kernel<ring_reduce_scatter_numa_kernel<kernel_params>,
+    using base = execution_kernel<numa_kernel,
                                   send_buf_size_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  tmp_recv_buf_arg,
+                                  send_buf_arg<processing_type>,
+                                  recv_buf_arg<processing_type>,
+                                  tmp_recv_buf_arg<processing_type>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg,
+                                  right_output_buf_arg<processing_type>,
+                                  right_tmp_recv_buf_arg<processing_type>,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg,
                                   event_prod_chunk_mem_arg,
                                   event_prod_bytes_arg>;
+
+    template <class ctx_params_t>
+    void bind_data(const ctx_params_t& out_ctx_params) {
+        // TODO: not implemented yet
+        (void)out_ctx_params;
+        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for this kernel type");
+    }
+
+    using base::base;
 };
 
-template <class kernel_params>
-struct ring_reduce_scatter_ipc
-        : public ipc_kernel<ring_reduce_scatter_ipc<kernel_params>,
-                            stub_arg<main_kernel_args::args_start_index>,
-                            stub_arg<main_kernel_args::args_start_index + 1>,
-                            stub_arg<main_kernel_args::args_start_index + 2>,
-                            thread_safe_arg<main_kernel_args::args_start_index + 3,
-                                            typename kernel_params::native_type*>,
-                            thread_safe_arg<main_kernel_args::args_start_index + 4, int*>,
-                            thread_safe_arg<main_kernel_args::args_start_index + 5, int*>,
-                            stub_arg<main_kernel_args::args_start_index + 6>,
-                            stub_arg<main_kernel_args::args_start_index + 7>,
-                            stub_arg<main_kernel_args::args_start_index + 8>,
-                            stub_arg<main_kernel_args::args_start_index + 9>> {
-    using param_t = kernel_params;
-    using processing_type = typename kernel_params::native_type;
+struct ipc_kernel : public base_ipc_kernel<ipc_kernel,
+                                           stub_arg<main_kernel_args::args_start_index>,
+                                           stub_arg<main_kernel_args::args_start_index + 1>,
+                                           recv_buf_arg<void>, // recv_buf (output_buffer)
+                                           tmp_recv_buf_arg<void>, // tmp_buf
+                                           income_data_flag_arg, // left_wrote_to_me_flag
+                                           ready_to_recv_flag_arg,
+                                           stub_arg<main_kernel_args::args_start_index + 6>,
+                                           stub_arg<main_kernel_args::args_start_index + 7>,
+                                           stub_arg<main_kernel_args::args_start_index + 8>,
+                                           stub_arg<main_kernel_args::args_start_index + 9>,
+                                           stub_arg<main_kernel_args::args_start_index + 10>> {
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "ring_reduce_scatter_ipc";
     }
 
-    using tmp_recv_buf_arg = typename ring_reduce_scatter_kernel<kernel_params>::tmp_recv_buf_arg;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    using income_data_flag_arg =
-        typename ring_reduce_scatter_kernel<kernel_params>::income_data_flag_arg;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    using ready_to_recv_flag_arg =
-        typename ring_reduce_scatter_kernel<kernel_params>::ready_to_recv_flag_arg;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
+    using common_entry_buf_size_arg = send_buf_size_arg;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
+
+    using base = base_ipc_kernel<ipc_kernel,
+                                 stub_arg<main_kernel_args::args_start_index>,
+                                 stub_arg<main_kernel_args::args_start_index + 1>,
+                                 recv_buf_arg<processing_type>,
+                                 tmp_recv_buf_arg<processing_type>,
+                                 income_data_flag_arg,
+                                 ready_to_recv_flag_arg,
+                                 stub_arg<main_kernel_args::args_start_index + 6>,
+                                 stub_arg<main_kernel_args::args_start_index + 7>,
+                                 stub_arg<main_kernel_args::args_start_index + 8>,
+                                 stub_arg<main_kernel_args::args_start_index + 9>,
+                                 stub_arg<main_kernel_args::args_start_index + 10>>;
+
+    template <class ipc_handles_t>
+    void bind_data(const ipc_handles_t& ipc_handles) {
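+        // handles arrive in the producer's publication order:
+        // [0] recv_buf, [1] tmp_recv_buf, [2] income_data_flag, [3] ready_to_recv_flag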
+        auto recv_buf = reinterpret_cast<typename recv_buf_arg<processing_type>::arg_type>(
+            ipc_handles.at(0).get().pointer);
+        this->template set_arg<recv_buf_arg<processing_type>>(recv_buf);
+
+        auto tmp_recv_buf = reinterpret_cast<typename tmp_recv_buf_arg<processing_type>::arg_type>(
+            ipc_handles.at(1).get().pointer);
+        this->template set_arg<tmp_recv_buf_arg<processing_type>>(tmp_recv_buf);
+
+        auto income_data_flag =
+            reinterpret_cast<income_data_flag_arg_type>(ipc_handles.at(2).get().pointer);
+        this->template set_arg<income_data_flag_arg>(income_data_flag);
+
+        auto ready_to_recv_flag =
+            reinterpret_cast<ready_to_recv_flag_arg_type>(ipc_handles.at(3).get().pointer);
+        this->template set_arg<ready_to_recv_flag_arg>(ready_to_recv_flag);
+    }
 
-    using base = execution_kernel<ring_reduce_scatter_ipc<kernel_params>,
-                                  stub_arg<main_kernel_args::args_start_index>,
-                                  stub_arg<main_kernel_args::args_start_index + 1>,
-                                  stub_arg<main_kernel_args::args_start_index + 2>,
-                                  tmp_recv_buf_arg,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  stub_arg<main_kernel_args::args_start_index + 6>,
-                                  stub_arg<main_kernel_args::args_start_index + 7>,
-                                  stub_arg<main_kernel_args::args_start_index + 8>,
-                                  stub_arg<main_kernel_args::args_start_index + 9>>;
+    using base::base;
 };
 
-template <class kernel_params>
-struct ring_reduce_scatter_scale_out_cpu_gw_kernel
-        : public execution_kernel<
-              ring_reduce_scatter_scale_out_cpu_gw_kernel<typename kernel_params::native_type>,
-              arg<main_kernel_args::args_start_index, size_t>,
-              arg<main_kernel_args::args_start_index + 1, typename kernel_params::native_type*>,
-              arg<main_kernel_args::args_start_index + 2, typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 3,
-                              typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 4, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 5, int*>,
-              arg<main_kernel_args::args_start_index + 6, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 7,
-                              typename kernel_params::native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 8, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 9, int*>,
-              arg<main_kernel_args::args_start_index + 10, size_t>,
-              thread_safe_arg<main_kernel_args::args_start_index + 11,
-                              typename kernel_params::native_type*>> {
-    using param_t = kernel_params;
-    using processing_type = typename param_t::native_type;
+struct scale_out_cpu_gw_kernel
+        : public execution_kernel<scale_out_cpu_gw_kernel,
+                                  send_buf_size_arg, // recv_count
+                                  send_buf_arg<void>, // send_buf
+                                  recv_buf_arg<void>, // recv_buf (output_buffer)
+                                  tmp_recv_buf_arg<void>, // tmp_buf
+                                  income_data_flag_arg, // left_wrote_to_me_flag
+                                  ready_to_recv_flag_arg, // i_ready_to_receive_flag
+                                  local_barrier_flag_arg, // local_barrier_flag
+                                  right_output_buf_arg<void>, // right_output_buffer
+                                  right_tmp_recv_buf_arg<void>, // right_temp_buffer
+                                  right_income_data_flag_arg, // i_send_to_right_flag
+                                  right_ready_to_recv_flag_arg, // right_ready_to_recv_flag
+
+                                  // scaleout-specific args
+                                  permanent_arg<main_kernel_args::args_start_index + 11, size_t>,
+                                  permanent_arg<main_kernel_args::args_start_index + 12, void*>> {
+    using processing_type = void;
 
     static constexpr const char* specific_name() {
         return "reduce_scatter_execution_scale_out_cpu_gw";
     }
 
-    //own
-    using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-    using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-    using send_buf_arg = arg<main_kernel_args::args_start_index + 1, processing_type*>;
-    using send_buf_arg_type = typename send_buf_arg::arg_type;
-
-    using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, processing_type*>;
-    using recv_buf_arg_type = typename recv_buf_arg::arg_type;
-
-    using tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 3, processing_type*>;
-    using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
-
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 4, int*>;
-    using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, int*>;
-    using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-    using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
-    using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
-
-    //right
-    using right_tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 7, processing_type*>;
-    using right_tmp_recv_buf_arg_type = typename right_tmp_recv_buf_arg::arg_type;
-
-    /*  using right_recv_buf_arg =                  thread_safe_arg<main_kernel_args::args_start_index + 8, void *>;
-    using right_recv_buf_arg_type =             typename right_recv_buf_arg::arg_type;
-*/
-    using right_income_data_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 8, int*>;
-    using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-    using right_ready_to_recv_flag_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 9, int*>;
-    using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
+    using common_entry_buf_size_arg = send_buf_size_arg;
+    using common_entry_buf_arg = send_buf_arg<processing_type>;
 
     // event data
-    using event_prod_chunk_mem_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 10, processing_type*>;
+    using event_prod_chunk_mem_arg = permanent_arg<main_kernel_args::args_start_index + 11, size_t>;
     using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type;
 
-    using event_prod_bytes_arg = thread_safe_arg<main_kernel_args::args_start_index + 11, int*>;
+    using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 12, void*>;
     using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type;
 
-    using base = execution_kernel<ring_reduce_scatter_scale_out_cpu_gw_kernel<kernel_params>,
+    using base = execution_kernel<scale_out_cpu_gw_kernel,
                                   send_buf_size_arg,
-                                  send_buf_arg,
-                                  recv_buf_arg,
-                                  tmp_recv_buf_arg,
+                                  send_buf_arg<processing_type>,
+                                  recv_buf_arg<processing_type>,
+                                  tmp_recv_buf_arg<processing_type>,
                                   income_data_flag_arg,
                                   ready_to_recv_flag_arg,
                                   local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg,
+                                  right_output_buf_arg<processing_type>,
+                                  right_tmp_recv_buf_arg<processing_type>,
                                   right_income_data_flag_arg,
                                   right_ready_to_recv_flag_arg,
                                   event_prod_chunk_mem_arg,
                                   event_prod_bytes_arg>;
+
+    template <class ctx_params_t>
+    void bind_data(const ctx_params_t& out_ctx_params) {
+        // TODO: not implemented yet
+        (void)out_ctx_params;
+        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for this kernel type");
+    }
+
+    using base::base;
 };
 
+} // namespace reduce_scatter
+} // namespace ring
 } // namespace native
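
Both bind_data() implementations above depend on an ordering contract rather than on named fields: the producer publishes IPC handles in a fixed order and the consumer reads them positionally through at(i).get().pointer. A small runnable sketch of that contract (the handle types are illustrative stand-ins):

    #include <cstddef>
    #include <vector>

    // Illustrative stand-ins for the handle wrappers consumed by bind_data();
    // the only interface bind_data() uses is handles.at(i).get().pointer.
    struct ipc_memory {
        void* pointer;
    };

    struct ipc_handle {
        ipc_memory mem;
        const ipc_memory& get() const {
            return mem;
        }
    };

    int main() {
        float tmp_recv_buf[8] = {};
        int income_data_flag = 0;
        int ready_to_recv_flag = 0;

        // Producer side: publish handles in the order the consumer expects
        // (for ring::reduce::ipc_kernel: tmp buffer, income flag, ready flag).
        std::vector<ipc_handle> handles = { { { tmp_recv_buf } },
                                            { { &income_data_flag } },
                                            { { &ready_to_recv_flag } } };

        // Consumer side, mirroring bind_data(): positional lookup plus cast.
        auto* tmp = static_cast<float*>(handles.at(0).get().pointer);
        auto* income = static_cast<int*>(handles.at(1).get().pointer);
        auto* ready = static_cast<int*>(handles.at(2).get().pointer);
        (void)tmp;
        (void)income;
        (void)ready;
        return 0;
    }

If the two sides ever disagree on the order, the reinterpret_casts in bind_data() still succeed silently, so the ordering is enforced purely by convention.
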
diff --git a/src/common/comm/l0/modules/supported_modules.hpp b/src/common/comm/l0/modules/supported_modules.hpp
index 30adb80f4..6c6b0348b 100644
--- a/src/common/comm/l0/modules/supported_modules.hpp
+++ b/src/common/comm/l0/modules/supported_modules.hpp
@@ -47,4 +47,6 @@ using supported_topology_device_modules = std::tuple<
 template <template <ccl_coll_type, ccl::group_split_type, ccl::device_topology_type>
           class module_impl>
 using supported_device_modules = supported_topology_device_modules<module_impl, CCL_COLL_TYPE_LIST>;
+
+using supported_device_modules1 = std::array<int, 1>;
 } // namespace native
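
supported_device_modules above expands one module template across the whole CCL_COLL_TYPE_LIST and collects the instantiations in a tuple. A reduced, runnable sketch of that expansion technique (all names below are illustrative):

    #include <tuple>

    enum class coll { allreduce, reduce, reduce_scatter };

    // Instantiate a module template once per collective type and collect the
    // results in a tuple, as supported_topology_device_modules does above.
    template <template <coll> class module_impl, coll... types>
    using module_list = std::tuple<module_impl<types>...>;

    template <coll c>
    struct my_module {};

    using supported =
        module_list<my_module, coll::allreduce, coll::reduce, coll::reduce_scatter>;

    static_assert(std::tuple_size<supported>::value == 3, "one module per collective");

    int main() {
        return 0;
    }
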
diff --git a/src/common/comm/l0/topology/ring/ring_construction_utils.hpp b/src/common/comm/l0/topology/ring/ring_construction_utils.hpp
index 4373120cd..612a62306 100644
--- a/src/common/comm/l0/topology/ring/ring_construction_utils.hpp
+++ b/src/common/comm/l0/topology/ring/ring_construction_utils.hpp
@@ -30,10 +30,10 @@
 #include "common/comm/l0/context/device_storage.hpp"
 
 /*REFACTORING*/
-#include "common/comm/l0/context/scaling_ctx/numa_ctx_impl.hpp"
-#include "common/comm/l0/context/scaling_ctx/scale_up_ctx_impl.hpp"
-#include "common/comm/l0/context/scaling_ctx/scale_out_ctx_impl.hpp"
-#include "common/comm/l0/context/scaling_ctx/ipc_ctx_impl.hpp"
+#include "common/comm/l0/context/scale/numa/numa_ctx_impl.hpp"
+#include "common/comm/l0/context/scale/scale_up/scale_up_ctx_impl.hpp"
+#include "common/comm/l0/context/scale/scale_out/scale_out_ctx_impl.hpp"
+#include "common/comm/l0/context/scale/ipc/ipc_ctx_impl.hpp"
 /*REFACTORING*/
 
 namespace native {
diff --git a/src/common/comm/single_device_communicator/single_device_communicator.cpp b/src/common/comm/single_device_communicator/single_device_communicator.cpp
index 69c32ceb0..a24f4a803 100644
--- a/src/common/comm/single_device_communicator/single_device_communicator.cpp
+++ b/src/common/comm/single_device_communicator/single_device_communicator.cpp
@@ -80,7 +80,7 @@ ccl::event single_device_communicator::barrier(const ccl::stream::impl_value_t&
                                                const ccl::vector_class<ccl::event>& deps) {
     // TODO what exactly we need to do with 'attr' here?
 
-    ccl_barrier_impl(comm_impl.get(), op_stream.get());
+    ccl_barrier_impl(comm_impl.get(), op_stream.get(), deps);
 
     // TODO what exactly we need to return here? ccl_barrier_impl() is void func
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(nullptr));
@@ -96,23 +96,6 @@ ccl::event single_device_communicator::allgatherv_base_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl_coll_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    using namespace ::native::detail;
-
-    std::vector<void*> bufs = { (void*)send_buf, recv_buf };
-    auto mode = check_assoc_device_memory(bufs, get_device(), get_context());
-
-    const ccl_stream* stream_handle = nullptr;
-
-    if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) {
-    }
-    else if (mode == usm_support_mode::need_conversion)
-#ifdef CCL_ENABLE_SYCL
-        stream_handle = stream.get();
-#else
-        throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                             " - USM convertation is not supported for such configuration");
-#endif
-
     return ccl::event(std::unique_ptr<ccl::event_impl>(
         new ccl::host_event_impl(ccl_allgatherv_impl(send_buf,
                                                      send_count,
@@ -121,7 +104,8 @@ ccl::event single_device_communicator::allgatherv_base_impl(
                                                      dtype,
                                                      attr,
                                                      comm_impl.get(),
-                                                     stream_handle))));
+                                                     stream.get(),
+                                                     deps))));
 }
 
 ccl::event single_device_communicator::allgatherv_impl(const void* send_buf,
@@ -166,25 +150,8 @@ ccl::event single_device_communicator::allreduce_impl(const void* send_buf,
                                                       const ccl::stream::impl_value_t& stream,
                                                       const ccl::allreduce_attr& attr,
                                                       const ccl::vector_class<ccl::event>& deps) {
-    using namespace ::native::detail;
-
-    std::vector<void*> bufs = { (void*)send_buf, recv_buf };
-    auto mode = check_assoc_device_memory(bufs, get_device(), get_context());
-
-    const ccl_stream* stream_handle = nullptr;
-
-    if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) {
-    }
-    else if (mode == usm_support_mode::need_conversion)
-#ifdef CCL_ENABLE_SYCL
-        stream_handle = stream.get();
-#else
-        throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                             " - USM convertation is not supported for such configuration");
-#endif
-
     return ccl::event(std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(ccl_allreduce_impl(
-        send_buf, recv_buf, count, dtype, reduction, attr, comm_impl.get(), stream_handle))));
+        send_buf, recv_buf, count, dtype, reduction, attr, comm_impl.get(), stream.get(), deps))));
 }
 
 /* alltoall */
@@ -195,25 +162,8 @@ ccl::event single_device_communicator::alltoall_impl(const void* send_buf,
                                                      const ccl::stream::impl_value_t& stream,
                                                      const ccl::alltoall_attr& attr,
                                                      const ccl::vector_class<ccl::event>& deps) {
-    using namespace ::native::detail;
-
-    std::vector<void*> bufs = { (void*)send_buf, recv_buf };
-    auto mode = check_assoc_device_memory(bufs, get_device(), get_context());
-
-    const ccl_stream* stream_handle = nullptr;
-
-    if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) {
-    }
-    else if (mode == usm_support_mode::need_conversion)
-#ifdef CCL_ENABLE_SYCL
-        stream_handle = stream.get();
-#else
-        throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                             " - USM convertation is not supported for such configuration");
-#endif
-
     return ccl::event(std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(ccl_alltoall_impl(
-        send_buf, recv_buf, count, dtype, attr, comm_impl.get(), stream_handle))));
+        send_buf, recv_buf, count, dtype, attr, comm_impl.get(), stream.get(), deps))));
 }
 
 ccl::event single_device_communicator::alltoall_impl(const ccl::vector_class<void*>& send_buf,
@@ -236,23 +186,6 @@ ccl::event single_device_communicator::alltoallv_impl(const void* send_buf,
                                                       const ccl::stream::impl_value_t& stream,
                                                       const ccl::alltoallv_attr& attr,
                                                       const ccl::vector_class<ccl::event>& deps) {
-    using namespace ::native::detail;
-
-    std::vector<void*> bufs = { (void*)send_buf, recv_buf };
-    auto mode = check_assoc_device_memory(bufs, get_device(), get_context());
-
-    const ccl_stream* stream_handle = nullptr;
-
-    if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) {
-    }
-    else if (mode == usm_support_mode::need_conversion)
-#ifdef CCL_ENABLE_SYCL
-        stream_handle = stream.get();
-#else
-        throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                             " - USM convertation is not supported for such configuration");
-#endif
-
     return ccl::event(std::unique_ptr<ccl::event_impl>(
         new ccl::host_event_impl(ccl_alltoallv_impl(send_buf,
                                                     send_counts.data(),
@@ -261,7 +194,8 @@ ccl::event single_device_communicator::alltoallv_impl(const void* send_buf,
                                                     dtype,
                                                     attr,
                                                     comm_impl.get(),
-                                                    stream_handle))));
+                                                    stream.get(),
+                                                    deps))));
 }
 ccl::event single_device_communicator::alltoallv_impl(const ccl::vector_class<void*>& send_buf,
                                                       const ccl::vector_class<size_t>& send_counts,
@@ -283,25 +217,8 @@ ccl::event single_device_communicator::broadcast_impl(void* buf,
                                                       const ccl::stream::impl_value_t& stream,
                                                       const ccl::broadcast_attr& attr,
                                                       const ccl::vector_class<ccl::event>& deps) {
-    using namespace ::native::detail;
-
-    std::vector<void*> bufs = { buf };
-    auto mode = check_assoc_device_memory(bufs, get_device(), get_context());
-
-    const ccl_stream* stream_handle = nullptr;
-
-    if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) {
-    }
-    else if (mode == usm_support_mode::need_conversion)
-#ifdef CCL_ENABLE_SYCL
-        stream_handle = stream.get();
-#else
-        throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                             " - USM convertation is not supported for such configuration");
-#endif
-
     return ccl::event(std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(
-        ccl_broadcast_impl(buf, count, dtype, root, attr, comm_impl.get(), stream_handle))));
+        ccl_broadcast_impl(buf, count, dtype, root, attr, comm_impl.get(), stream.get(), deps))));
 }
 
 /* reduce */
@@ -314,25 +231,17 @@ ccl::event single_device_communicator::reduce_impl(const void* send_buf,
                                                    const ccl::stream::impl_value_t& stream,
                                                    const ccl::reduce_attr& attr,
                                                    const ccl::vector_class<ccl::event>& deps) {
-    using namespace ::native::detail;
-
-    std::vector<void*> bufs = { (void*)send_buf, recv_buf };
-    auto mode = check_assoc_device_memory(bufs, get_device(), get_context());
-
-    const ccl_stream* stream_handle = nullptr;
-
-    if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) {
-    }
-    else if (mode == usm_support_mode::need_conversion)
-#ifdef CCL_ENABLE_SYCL
-        stream_handle = stream.get();
-#else
-        throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                             " - USM convertation is not supported for such configuration");
-#endif
-
-    return ccl::event(std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(ccl_reduce_impl(
-        send_buf, recv_buf, count, dtype, reduction, root, attr, comm_impl.get(), stream_handle))));
+    return ccl::event(
+        std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(ccl_reduce_impl(send_buf,
+                                                                                  recv_buf,
+                                                                                  count,
+                                                                                  dtype,
+                                                                                  reduction,
+                                                                                  root,
+                                                                                  attr,
+                                                                                  comm_impl.get(),
+                                                                                  stream.get(),
+                                                                                  deps))));
 }
 
 /* reduce_scatter */
@@ -345,26 +254,16 @@ ccl::event single_device_communicator::reduce_scatter_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::reduce_scatter_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    using namespace ::native::detail;
-
-    std::vector<void*> bufs = { (void*)send_buf, recv_buf };
-    auto mode = check_assoc_device_memory(bufs, get_device(), get_context());
-
-    const ccl_stream* stream_handle = nullptr;
-
-    if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) {
-    }
-    else if (mode == usm_support_mode::need_conversion)
-#ifdef CCL_ENABLE_SYCL
-        stream_handle = stream.get();
-#else
-        throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                             " - USM convertation is not supported for such configuration");
-#endif
-
-    return ccl::event(std::unique_ptr<
-                      ccl::event_impl>(new ccl::host_event_impl(ccl_reduce_scatter_impl(
-        send_buf, recv_buf, recv_count, dtype, reduction, attr, comm_impl.get(), stream_handle))));
+    return ccl::event(std::unique_ptr<ccl::event_impl>(
+        new ccl::host_event_impl(ccl_reduce_scatter_impl(send_buf,
+                                                         recv_buf,
+                                                         recv_count,
+                                                         dtype,
+                                                         reduction,
+                                                         attr,
+                                                         comm_impl.get(),
+                                                         stream.get(),
+                                                         deps))));
 }
 
 /* sparse_allreduce */
@@ -383,25 +282,6 @@ ccl::event single_device_communicator::sparse_allreduce_impl(
     const ccl::stream::impl_value_t& stream,
     const ccl::sparse_allreduce_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    using namespace ::native::detail;
-
-    std::vector<void*> bufs = {
-        (void*)send_ind_buf, (void*)send_val_buf, recv_ind_buf, recv_val_buf
-    };
-    auto mode = check_assoc_device_memory(bufs, get_device(), get_context());
-
-    const ccl_stream* stream_handle = nullptr;
-
-    if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) {
-    }
-    else if (mode == usm_support_mode::need_conversion)
-#ifdef CCL_ENABLE_SYCL
-        stream_handle = stream.get();
-#else
-        throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                             " - USM convertation is not supported for such configuration");
-#endif
-
     return ccl::event(std::unique_ptr<ccl::event_impl>(
         new ccl::host_event_impl(ccl_sparse_allreduce_impl(send_ind_buf,
                                                            send_ind_count,
@@ -416,7 +296,8 @@ ccl::event single_device_communicator::sparse_allreduce_impl(
                                                            reduction,
                                                            attr,
                                                            comm_impl.get(),
-                                                           stream_handle))));
+                                                           stream.get(),
+                                                           deps))));
 }
 
 COMM_INTERFACE_COLL_INSTANTIATION(single_device_communicator);
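
Every hunk above deletes the same inlined block: a USM dispatch that left the stream handle null for directly usable memory (direct/shared) and, only under CCL_ENABLE_SYCL, substituted the SYCL stream when conversion was needed. Condensed into a standalone helper, the removed logic was roughly the following (the helper itself is hypothetical; the types, branches, and error text come from the deleted lines):

    // Hypothetical helper equivalent to the block deleted from each collective;
    // the collectives now pass stream.get() and deps straight to ccl_*_impl.
    const ccl_stream* resolve_stream_handle(native::detail::usm_support_mode mode,
                                            const ccl::stream::impl_value_t& stream) {
        const ccl_stream* stream_handle = nullptr;
        if (mode == native::detail::usm_support_mode::need_conversion) {
    #ifdef CCL_ENABLE_SYCL
            stream_handle = stream.get();
    #else
            throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
                                 " - USM conversion is not supported for such configuration");
    #endif
        }
        return stream_handle;
    }
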
diff --git a/src/common/comm/single_device_communicator/single_device_communicator_impl.hpp b/src/common/comm/single_device_communicator/single_device_communicator_impl.hpp
index 8431bd8dd..a57ca84c9 100644
--- a/src/common/comm/single_device_communicator/single_device_communicator_impl.hpp
+++ b/src/common/comm/single_device_communicator/single_device_communicator_impl.hpp
@@ -93,7 +93,9 @@ ccl::event single_device_communicator::allgatherv_impl(const buffer_type& send_b
                                            ccl::native_type_info<buffer_type>::dtype,
                                            attr,
                                            comm_impl.get(),
-                                           stream.get());
+                                           stream.get(),
+                                           deps,
+                                           true);
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
 template <class buffer_type>
@@ -143,7 +145,9 @@ ccl::event single_device_communicator::allreduce_impl(const buffer_type& send_bu
                                           reduction,
                                           attr,
                                           comm_impl.get(),
-                                          stream.get());
+                                          stream.get(),
+                                          deps,
+                                          true);
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
 
@@ -184,7 +188,9 @@ ccl::event single_device_communicator::alltoall_impl(const buffer_type& send_buf
                                          ccl::native_type_info<buffer_type>::dtype,
                                          attr,
                                          comm_impl.get(),
-                                         stream.get());
+                                         stream.get(),
+                                         deps,
+                                         true);
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
 
@@ -247,7 +253,9 @@ ccl::event single_device_communicator::alltoallv_impl(const buffer_type& send_bu
                                           ccl::native_type_info<buffer_type>::dtype,
                                           attr,
                                           comm_impl.get(),
-                                          stream.get());
+                                          stream.get(),
+                                          deps,
+                                          true);
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
 
@@ -289,7 +297,9 @@ ccl::event single_device_communicator::broadcast_impl(buffer_type& buf,
                                           root,
                                           attr,
                                           comm_impl.get(),
-                                          stream.get());
+                                          stream.get(),
+                                          deps,
+                                          true);
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
 
@@ -333,7 +343,9 @@ ccl::event single_device_communicator::reduce_impl(const buffer_type& send_buf,
                                        root,
                                        attr,
                                        comm_impl.get(),
-                                       stream_ptr);
+                                       stream_ptr,
+                                       deps,
+                                       true);
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
 
@@ -374,7 +386,9 @@ ccl::event single_device_communicator::reduce_scatter_impl(
                                                reduction,
                                                attr,
                                                comm_impl.get(),
-                                               stream_ptr);
+                                               stream_ptr,
+                                               deps,
+                                               true);
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
 
@@ -439,6 +453,8 @@ ccl::event single_device_communicator::sparse_allreduce_impl(
                                   reduction,
                                   attr,
                                   comm_impl.get(),
-                                  stream_ptr);
+                                  stream_ptr,
+                                  deps,
+                                  true);
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
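
Across these hunks the change is uniform: every collective *_impl now forwards the native stream, a list of dependency events (deps), and an extra trailing flag into the underlying call instead of a bare stream handle. A minimal sketch of the resulting call shape, with hypothetical type and function names standing in for the real entry points:

    #include <cstddef>
    #include <vector>

    // Hypothetical stand-ins for the library's event and stream types.
    struct event_t {};
    struct stream_t {};

    // The new call shape: the stream plus the events the collective must
    // wait on before starting, rather than a stream handle alone.
    event_t allreduce_sketch(const float* send_buf,
                             float* recv_buf,
                             std::size_t count,
                             stream_t* s,
                             const std::vector<event_t>& deps) {
        (void)send_buf; (void)recv_buf; (void)count; (void)s; (void)deps;
        return event_t{};
    }
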
diff --git a/src/common/comm/usm_visitor/allreduce_usm_visitor.hpp b/src/common/comm/usm_visitor/allreduce_usm_visitor.hpp
index ab16a1679..523abb41b 100644
--- a/src/common/comm/usm_visitor/allreduce_usm_visitor.hpp
+++ b/src/common/comm/usm_visitor/allreduce_usm_visitor.hpp
@@ -44,128 +44,11 @@ struct allreduce_usm_visitor {
                   ccl::to_string(dtype),
                   " , handle: ",
                   utils::enum_to_underlying(dtype));
+        req = get_self()->template allreduce_impl<uint8_t>((const uint8_t*)(const void*)send_buf,
+                                                           (uint8_t*)(void*)recv_buf,
+                                                           count,
+                                                           std::forward<Args>(args)...);
 
-        switch (dtype) {
-            case ccl::datatype::int8: {
-                using type = int8_t;
-                req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf),
-                                                                static_cast<type*>(recv_buf),
-                                                                count,
-                                                                std::forward<Args>(args)...);
-                processed = true;
-                break;
-            }
-            case ccl::datatype::uint8: {
-                using type = uint8_t;
-                req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf),
-                                                                static_cast<type*>(recv_buf),
-                                                                count,
-                                                                std::forward<Args>(args)...);
-                processed = true;
-                break;
-            }
-            case ccl::datatype::int16: {
-                using type = int16_t;
-                req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf),
-                                                                static_cast<type*>(recv_buf),
-                                                                count,
-                                                                std::forward<Args>(args)...);
-                processed = true;
-                break;
-            }
-            case ccl::datatype::uint16: {
-                using type = uint16_t;
-                req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf),
-                                                                static_cast<type*>(recv_buf),
-                                                                count,
-                                                                std::forward<Args>(args)...);
-                processed = true;
-                break;
-            }
-            case ccl::datatype::int32: {
-                using type = int32_t;
-                req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf),
-                                                                static_cast<type*>(recv_buf),
-                                                                count,
-                                                                std::forward<Args>(args)...);
-                processed = true;
-                break;
-            }
-            case ccl::datatype::uint32: {
-                using type = uint32_t;
-                req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf),
-                                                                static_cast<type*>(recv_buf),
-                                                                count,
-                                                                std::forward<Args>(args)...);
-                processed = true;
-                break;
-            }
-            case ccl::datatype::int64: {
-                using type = int64_t;
-                req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf),
-                                                                static_cast<type*>(recv_buf),
-                                                                count,
-                                                                std::forward<Args>(args)...);
-                processed = true;
-                break;
-            }
-            case ccl::datatype::uint64: {
-                using type = uint64_t;
-                req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf),
-                                                                static_cast<type*>(recv_buf),
-                                                                count,
-                                                                std::forward<Args>(args)...);
-                processed = true;
-                break;
-            }
-            case ccl::datatype::float16: {
-                using type = ccl::float16;
-                req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf),
-                                                                static_cast<type*>(recv_buf),
-                                                                count,
-                                                                std::forward<Args>(args)...);
-                processed = true;
-                break;
-            }
-            case ccl::datatype::float32: {
-                using type = float;
-                req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf),
-                                                                static_cast<type*>(recv_buf),
-                                                                count,
-                                                                std::forward<Args>(args)...);
-                processed = true;
-                break;
-            }
-            case ccl::datatype::float64: {
-                using type = double;
-                req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf),
-                                                                static_cast<type*>(recv_buf),
-                                                                count,
-                                                                std::forward<Args>(args)...);
-                processed = true;
-                break;
-            }
-            case ccl::datatype::bfloat16: {
-                using type = ccl::bfloat16;
-                req = get_self()->template allreduce_impl<type>(static_cast<const type*>(send_buf),
-                                                                static_cast<type*>(recv_buf),
-                                                                count,
-                                                                std::forward<Args>(args)...);
-                processed = true;
-                break;
-            }
-            default: {
-                CCL_THROW("unknown datatype ", dtype);
-                LOG_DEBUG("comm: ",
-                          /*get_self()->to_string(),*/
-                          " - no found visitor for datatype: ",
-                          ccl::to_string(dtype),
-                          " , handle: ",
-                          utils::enum_to_underlying(dtype),
-                          ", use RAW types");
-                break;
-            }
-        }
         return processed;
     }
 };
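
The per-datatype switch above became dead weight once the datatype started travelling alongside the buffers: a single uint8_t instantiation can service every type, because element size and semantics are recovered from the dtype argument rather than the template parameter. A standalone sketch of the byte-dispatch idea (names are illustrative, not the library API):

    #include <cstdint>
    #include <cstdio>

    // A dtype tag carried next to the raw bytes, as in the visitor's args.
    enum class dt { int32, float32 };

    static std::size_t dt_size(dt d) {
        return d == dt::int32 ? sizeof(std::int32_t) : sizeof(float);
    }

    // One byte-typed entry point replaces a dozen per-type instantiations.
    void allreduce_bytes(const std::uint8_t* send, std::uint8_t* recv,
                         std::size_t count, dt d) {
        std::printf("reduce %zu elements, %zu bytes\n", count, count * dt_size(d));
        (void)send; (void)recv;
    }

    int main() {
        float in[4] = { 1, 2, 3, 4 }, out[4] = {};
        allreduce_bytes(reinterpret_cast<const std::uint8_t*>(in),
                        reinterpret_cast<std::uint8_t*>(out), 4, dt::float32);
    }
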
diff --git a/src/common/datatype/datatype.cpp b/src/common/datatype/datatype.cpp
index c311870aa..849b363ca 100644
--- a/src/common/datatype/datatype.cpp
+++ b/src/common/datatype/datatype.cpp
@@ -90,18 +90,18 @@ ccl_datatype_storage::ccl_datatype_storage() {
                                                   : 0;
 
         CCL_ASSERT(size > 0, "Unexpected data type size: ", size, ", for idx: ", idx);
-        name_str = (idx == ccl::datatype::int8)       ? "INT8"
-                   : (idx == ccl::datatype::uint8)    ? "UINT8"
-                   : (idx == ccl::datatype::int16)    ? "INT16"
-                   : (idx == ccl::datatype::uint16)   ? "UINT16"
-                   : (idx == ccl::datatype::int32)    ? "INT32"
-                   : (idx == ccl::datatype::uint32)   ? "UINT32"
-                   : (idx == ccl::datatype::int64)    ? "INT64"
-                   : (idx == ccl::datatype::uint64)   ? "UINT64"
-                   : (idx == ccl::datatype::float16)  ? "FP16"
-                   : (idx == ccl::datatype::float32)  ? "FP32"
-                   : (idx == ccl::datatype::float64)  ? "FP64"
-                   : (idx == ccl::datatype::bfloat16) ? "BF16"
+        name_str = (idx == ccl::datatype::int8)       ? "int8"
+                   : (idx == ccl::datatype::uint8)    ? "uint8"
+                   : (idx == ccl::datatype::int16)    ? "int16"
+                   : (idx == ccl::datatype::uint16)   ? "uint16"
+                   : (idx == ccl::datatype::int32)    ? "int32"
+                   : (idx == ccl::datatype::uint32)   ? "uint32"
+                   : (idx == ccl::datatype::int64)    ? "int64"
+                   : (idx == ccl::datatype::uint64)   ? "uint64"
+                   : (idx == ccl::datatype::float16)  ? "float16"
+                   : (idx == ccl::datatype::float32)  ? "float32"
+                   : (idx == ccl::datatype::float64)  ? "float64"
+                   : (idx == ccl::datatype::bfloat16) ? "bfloat16"
                                                       : 0;
 
         create_internal(predefined_table, idx, size, name_str);
diff --git a/src/common/env/env.cpp b/src/common/env/env.cpp
index f612aeb5d..e9ffa8f88 100644
--- a/src/common/env/env.cpp
+++ b/src/common/env/env.cpp
@@ -13,8 +13,10 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
+#include <climits>
 #include <dlfcn.h>
 #include <iterator>
+#include <memory>
 #include <sstream>
 #include <unistd.h>
 
@@ -44,6 +46,12 @@ std::map<ccl_staging_buffer, std::string> env_data::staging_buffer_names = {
     std::make_pair(ccl_staging_usm, "usm")
 };
 
+std::map<atl_mnic_t, std::string> env_data::mnic_type_names = {
+    std::make_pair(ATL_MNIC_NONE, "none"),
+    std::make_pair(ATL_MNIC_LOCAL, "local"),
+    std::make_pair(ATL_MNIC_GLOBAL, "global")
+};
+
 env_data::env_data()
         : was_printed(false),
 
@@ -58,8 +66,13 @@ env_data::env_data()
 
           atl_transport(ccl_atl_mpi),
           enable_shm(0),
-          sync_coll(0),
-          extra_ep(0),
+          enable_rma(0),
+          enable_device_buf(0),
+          enable_sync_coll(0),
+          enable_extra_ep(0),
+
+          mnic_type(ATL_MNIC_NONE),
+          mnic_count(CCL_ENV_SIZET_NOT_SPECIFIED),
 
           enable_unordered_coll(0),
 
@@ -69,14 +82,13 @@ env_data::env_data()
           fusion_check_urgent(1),
           fusion_cycle_ms(0.2),
 
-          enable_rma(0),
           priority_mode(ccl_priority_none),
           spin_count(100),
           yield_type(ccl_yield_pause),
           max_short_size(0),
           bcast_part_count(CCL_ENV_SIZET_NOT_SPECIFIED),
           cache_key_type(ccl_cache_key_match_id),
-          enable_cache_flush(1),
+          enable_cache_flush(0),
           enable_strict_order(0),
           staging_buffer(ccl_staging_usm),
 
@@ -95,6 +107,7 @@ env_data::env_data()
 
           enable_comm_kernels(0),
           comm_kernels_path(),
+          comm_kernels_debug(0),
           gpu_thread_count(CCL_ENV_SIZET_NOT_SPECIFIED),
 
           bf16_impl_type(ccl_bf16_no_compiler_support),
@@ -123,8 +136,8 @@ void env_data::parse() {
 
     if (fw_type == ccl_framework_horovod) {
         worker_wait = 1;
-        sync_coll = 1;
-        extra_ep = 1;
+        enable_sync_coll = 1;
+        enable_extra_ep = 1;
         yield_type = ccl_yield_sched_yield;
     }
 
@@ -135,8 +148,16 @@ void env_data::parse() {
 
     env_2_atl_transport();
     env_2_type(CCL_ATL_SHM, enable_shm);
-    env_2_type(CCL_ATL_SYNC_COLL, sync_coll);
-    env_2_type(CCL_ATL_EXTRA_EP, extra_ep);
+    env_2_type(CCL_ATL_RMA, enable_rma);
+    env_2_type(CCL_ATL_DEVICE_BUF, enable_device_buf);
+    env_2_type(CCL_ATL_SYNC_COLL, enable_sync_coll);
+    env_2_type(CCL_ATL_EXTRA_EP, enable_extra_ep);
+
+    env_2_enum(CCL_MNIC, mnic_type_names, mnic_type);
+    env_2_type(CCL_MNIC_COUNT, mnic_count);
+    if (mnic_count == CCL_ENV_SIZET_NOT_SPECIFIED) {
+        mnic_count = worker_count;
+    }
 
     env_2_type(CCL_ALLGATHERV, allgatherv_algo_raw);
     env_2_type(CCL_ALLREDUCE, allreduce_algo_raw);
@@ -176,7 +197,6 @@ void env_data::parse() {
     if (worker_wait)
         spin_count = 1000;
 
-    env_2_type(CCL_RMA, enable_rma);
     env_2_enum(CCL_PRIORITY, priority_mode_names, priority_mode);
     env_2_type(CCL_SPIN_COUNT, spin_count);
     env_2_enum(CCL_YIELD, ccl_yield_type_names, yield_type);
@@ -219,29 +239,7 @@ void env_data::parse() {
             CCL_THROW_IF_NOT(!ccl_root.empty(), "incorrect comm kernels path, CCL_ROOT not found!");
             comm_kernels_path = ccl_root + "/lib/kernels/";
         }
-
-        // TODO remove IPC workaround knobs
-        if (!getenv("DisableStatelessToStatefulOptimization")) {
-            setenv("DisableStatelessToStatefulOptimization", "1", 1);
-            LOG_WARN(
-                "environment variable 'DisableStatelessToStatefulOptimization' is not set, will be used DisableStatelessToStatefulOptimization=1");
-        }
-        if (!getenv("CFESingleSliceDispatchCCSMode")) {
-            setenv("CFESingleSliceDispatchCCSMode", "1", 1);
-            LOG_WARN(
-                "environment variable 'CFESingleSliceDispatchCCSMode' is not set, will be used CFESingleSliceDispatchCCSMode=1");
-        }
-        if (!getenv("OverrideStatelessMocsIndex")) {
-            setenv("OverrideStatelessMocsIndex", "2", 1);
-            LOG_WARN(
-                "environment variable 'OverrideStatelessMocsIndex' is not set, will be used OverrideStatelessMocsIndex=2");
-        }
-
-        if (!getenv("CCL_KVS_GET_TIMEOUT")) {
-            setenv("CCL_KVS_GET_TIMEOUT", "10", 1);
-            LOG_WARN(
-                "environment variable 'CCL_KVS_GET_TIMEOUT' is not set, will be used CCL_KVS_GET_TIMEOUT=10");
-        }
+        env_2_type(CCL_COMM_KERNELS_DEBUG, comm_kernels_debug);
     }
     env_2_type(CCL_GPU_THREAD_COUNT, gpu_thread_count);
 
@@ -278,8 +276,25 @@ void env_data::print(int rank) {
     else
         was_printed = true;
 
-    auto& global_data = ccl::global_data::get();
+    if (rank == 0) {
+        auto version = utils::get_library_version();
+        LOG_INFO("library version: ", version.full);
+        LOG_INFO("specification version: ", ONECCL_SPEC_VERSION);
+#ifdef CCL_ENABLE_SYCL
+        LOG_INFO("compute backend: ", version.cl_backend_name);
+#endif /* CCL_ENABLE_SYCL */
 
+#ifdef ENABLE_DEBUG
+        const char* build_mode = "debug";
+#else /* ENABLE_DEBUG */
+        const char* build_mode = "release";
+#endif /* ENABLE_DEBUG */
+        LOG_INFO("build mode: ", build_mode);
+        LOG_INFO("C compiler: ", CCL_C_COMPILER);
+        LOG_INFO("C++ compiler: ", CCL_CXX_COMPILER);
+    }
+
+    auto& global_data = ccl::global_data::get();
     auto local_proc_idx = global_data.executor->get_local_proc_idx();
     auto local_proc_count = global_data.executor->get_local_proc_count();
 
@@ -311,8 +326,13 @@ void env_data::print(int rank) {
 
     LOG_INFO(CCL_ATL_TRANSPORT, ": ", str_by_enum(atl_transport_names, atl_transport));
     LOG_INFO(CCL_ATL_SHM, ": ", enable_shm);
-    LOG_DEBUG(CCL_ATL_SYNC_COLL, ": ", sync_coll);
-    LOG_DEBUG(CCL_ATL_EXTRA_EP, ": ", extra_ep);
+    LOG_INFO(CCL_ATL_RMA, ": ", enable_rma);
+    LOG_INFO(CCL_ATL_DEVICE_BUF, ": ", enable_device_buf);
+    LOG_DEBUG(CCL_ATL_SYNC_COLL, ": ", enable_sync_coll);
+    LOG_DEBUG(CCL_ATL_EXTRA_EP, ": ", enable_extra_ep);
+
+    LOG_INFO(CCL_MNIC, ": ", str_by_enum(mnic_type_names, mnic_type));
+    LOG_INFO(CCL_MNIC_COUNT, ": ", mnic_count);
 
     LOG_INFO(CCL_ALLGATHERV,
              ": ",
@@ -349,7 +369,6 @@ void env_data::print(int rank) {
     LOG_INFO(CCL_FUSION_CHECK_URGENT, ": ", fusion_check_urgent);
     LOG_INFO(CCL_FUSION_CYCLE_MS, ": ", fusion_cycle_ms);
 
-    LOG_INFO(CCL_RMA, ": ", enable_rma);
     LOG_INFO(CCL_PRIORITY, ": ", str_by_enum(priority_mode_names, priority_mode));
     LOG_INFO(CCL_SPIN_COUNT, ": ", spin_count);
     LOG_INFO(CCL_YIELD, ": ", str_by_enum(ccl_yield_type_names, yield_type));
@@ -384,33 +403,21 @@ void env_data::print(int rank) {
                  : CCL_ENV_STR_NOT_SPECIFIED);
     LOG_INFO(CCL_ALLTOALL_SCATTER_PLAIN, ": ", alltoall_scatter_plain);
 
+#ifdef CCL_ENABLE_SYCL
     LOG_INFO(CCL_COMM_KERNELS, ": ", enable_comm_kernels);
     LOG_INFO(CCL_COMM_KERNELS_PATH,
              ": ",
              (!comm_kernels_path.empty()) ? comm_kernels_path : CCL_ENV_STR_NOT_SPECIFIED);
+    LOG_INFO(CCL_COMM_KERNELS_DEBUG, ": ", comm_kernels_debug);
     LOG_INFO(CCL_GPU_THREAD_COUNT,
              ": ",
              (gpu_thread_count != CCL_ENV_SIZET_NOT_SPECIFIED) ? std::to_string(gpu_thread_count)
                                                                : CCL_ENV_STR_NOT_SPECIFIED);
+#endif /* CCL_ENABLE_SYCL */
 
     LOG_INFO(CCL_BF16, ": ", str_by_enum(bf16_impl_names, bf16_impl_type));
     LOG_INFO(CCL_FP16, ": ", str_by_enum(fp16_impl_names, fp16_impl_type));
 
-#ifdef ENABLE_DEBUG
-    const char* build_mode = "debug";
-#else
-    const char* build_mode = "release";
-#endif
-    LOG_INFO("build mode: ", build_mode);
-
-    LOG_INFO("C compiler: ", CCL_C_COMPILER);
-    LOG_INFO("C++ compiler: ", CCL_CXX_COMPILER);
-
-    auto version = utils::get_library_version();
-    LOG_INFO("library version: ", version.full);
-
-    LOG_INFO("specification version: ", ONECCL_SPEC_VERSION);
-
     char* ccl_root = getenv("CCL_ROOT");
     LOG_INFO("CCL_ROOT: ", (ccl_root) ? ccl_root : CCL_ENV_STR_NOT_SPECIFIED);
 
@@ -430,6 +437,9 @@ void env_data::print(int rank) {
 void env_data::set_internal_env() {
     auto attr = ccl_executor::generate_atl_attr(*this);
     atl_wrapper::set_internal_env(attr);
+    if (log_level >= ccl_log_level::info) {
+        setenv("I_MPI_DEBUG", "4", 0);
+    }
 }
 
 int env_data::env_2_worker_affinity_auto(size_t local_proc_idx, size_t workers_per_process) {
@@ -487,35 +497,59 @@ int env_data::env_2_worker_affinity_auto(size_t local_proc_idx, size_t workers_p
     return 1;
 }
 
+int env_data::parse_core_id(const std::string& core_id_str, size_t& result) {
+    char* end_ptr;
+    const char* core_id_str_ptr = core_id_str.c_str();
+
+    errno = 0;
+    auto core_id = std::strtol(core_id_str_ptr, &end_ptr, 10);
+
+    if ((errno == ERANGE && (core_id == LONG_MAX || core_id == LONG_MIN)) ||
+        (errno != 0 && core_id == 0)) {
+        LOG_ERROR("core id value is invalid in string: ", core_id_str);
+        return 0;
+    }
+    if (end_ptr == core_id_str_ptr) {
+        LOG_ERROR("no digits were found in string: ", core_id_str);
+        return 0;
+    }
+    if (core_id < 0) {
+        LOG_ERROR(
+            "core id cannot be less than zero but got ", core_id, " in string: ", core_id_str);
+        return 0;
+    }
+    result = core_id;
+    return 1;
+}
+
 int env_data::env_2_worker_affinity(size_t local_proc_idx, size_t local_proc_count) {
     CCL_THROW_IF_NOT(local_proc_count > 0);
 
-    int read_env = 0;
-    size_t w_idx, read_count = 0;
-    char* affinity_copy = nullptr;
+    size_t idx;
+    std::unique_ptr<char> affinity_copy;
     char* affinity_to_parse = getenv(CCL_WORKER_AFFINITY);
-    char* proc_id_str;
+    char* core_range_str;
     char* tmp;
-    size_t proccessor_count;
+    size_t system_core_count;
 
     size_t affinity_size = local_proc_count * worker_count;
-    worker_affinity.assign(affinity_size, 0);
 
     if (!affinity_to_parse || (strlen(affinity_to_parse) == 0) ||
         (strcmp(affinity_to_parse, "auto") == 0)) {
+        worker_affinity.assign(affinity_size, 0);
         if (std::getenv(I_MPI_AVAILABLE_CORES_ENV)) {
             /* generate auto affinity based on IMPI process pinning */
             return env_2_worker_affinity_auto(local_proc_idx, worker_count);
         }
         else {
             /* generate auto affinity as last N cores */
-            proccessor_count = sysconf(_SC_NPROCESSORS_ONLN);
-            for (w_idx = 0; w_idx < affinity_size; w_idx++) {
-                if (w_idx < proccessor_count) {
-                    worker_affinity[w_idx] = proccessor_count - w_idx - 1;
+            system_core_count = sysconf(_SC_NPROCESSORS_ONLN);
+            for (idx = 0; idx < affinity_size; idx++) {
+                if (idx < system_core_count) {
+                    worker_affinity[idx] = system_core_count - idx - 1;
                 }
                 else {
-                    worker_affinity[w_idx] = worker_affinity[w_idx % proccessor_count];
+                    worker_affinity[idx] = worker_affinity[idx % system_core_count];
                 }
             }
             return 1;
@@ -524,47 +558,58 @@ int env_data::env_2_worker_affinity(size_t local_proc_idx, size_t local_proc_cou
 
     /* create copy of original buffer because it will be modified in strsep */
     size_t affinity_len = strlen(affinity_to_parse);
-    affinity_copy = static_cast<char*>(CCL_CALLOC(affinity_len + 1, "affinity_copy"));
-    CCL_MEMCPY(affinity_copy, affinity_to_parse, affinity_len);
-    tmp = affinity_copy;
-
-    for (w_idx = 0; w_idx < affinity_size; w_idx++) {
-        proc_id_str = strsep(&tmp, ",");
-        if (proc_id_str != NULL) {
-            if (atoi(proc_id_str) < 0) {
-                LOG_ERROR(
-                    "unexpected proc_id ", proc_id_str, ", affinity string ", affinity_to_parse);
-                read_env = 0;
-                CCL_FREE(affinity_copy);
-                return read_env;
-            }
-            worker_affinity[w_idx] = std::strtoul(proc_id_str, nullptr, 10);
-            read_count++;
+    affinity_copy =
+        std::unique_ptr<char>(static_cast<char*>(CCL_CALLOC(affinity_len + 1, "affinity_copy")));
+    CCL_MEMCPY(affinity_copy.get(), affinity_to_parse, affinity_len);
+    tmp = affinity_copy.get();
+
+    while (tmp) {
+        core_range_str = strsep(&tmp, ",");
+        if (!core_range_str) {
+            break;
         }
-        else {
-            LOG_ERROR("unexpected end of affinity string, expected ",
-                      affinity_size,
-                      " numbers, read ",
-                      read_count,
-                      ", affinity string ",
-                      affinity_to_parse);
-            read_env = 0;
-            CCL_FREE(affinity_copy);
-            return read_env;
+
+        auto core_range = tokenize<std::vector<std::string>>(std::string(core_range_str), '-');
+
+        if ((core_range.size() != 2) && (core_range.size() != 1)) {
+            LOG_ERROR(
+                "unexpected format in affinity: ",
+                affinity_to_parse,
+                ", specify core range using <first_core>-<last_core> or single core using <core>");
+            return 0;
+        }
+
+        if (core_range.size() == 1) {
+            /* to unify logic below */
+            core_range.push_back(*core_range.begin());
+        }
+
+        CCL_ASSERT(core_range.size() == 2, "unexpected number of cores in range");
+
+        size_t first_core, last_core;
+        if (!parse_core_id(core_range[0], first_core) || !parse_core_id(core_range[1], last_core)) {
+            return 0;
+        }
+
+        if (first_core > last_core) {
+            LOG_ERROR("unexpected first and last cores in range: ",
+                      core_range_str,
+                      ", first core should be less or equal to last core");
+            return 0;
+        }
+
+        for (idx = first_core; idx <= last_core; idx++) {
+            worker_affinity.push_back(idx);
         }
     }
-    if (read_count < affinity_size) {
-        LOG_ERROR(
-            "unexpected number of processors (specify 1 logical processor per 1 worker thread), affinity string ",
-            affinity_to_parse);
-        read_env = 0;
-        CCL_FREE(affinity_copy);
-        return read_env;
-    }
-    read_env = 1;
 
-    CCL_FREE(affinity_copy);
-    return read_env;
+    if (worker_affinity.size() < affinity_size) {
+        LOG_ERROR("unexpected number of cores in affinity: ",
+                  affinity_to_parse,
+                  ", specify 1 core per 1 worker thread");
+        return 0;
+    }
+    return 1;
 }
 
 void env_data::env_2_atl_transport() {
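
The rewritten parser accepts both single cores and inclusive ranges, so CCL_WORKER_AFFINITY=0-3,8,10-11 expands to seven cores. A self-contained sketch of the same strtol validation and range expansion, simplified to drop the CCL logging and allocator plumbing (the split-on-delimiter helper here is local, not the library's tokenize):

    #include <cerrno>
    #include <cstdlib>
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    // Parse one non-negative core id with the same strtol error checks
    // as env_data::parse_core_id: range error, no digits, negative value.
    static bool parse_core_id(const std::string& s, std::size_t& result) {
        char* end = nullptr;
        errno = 0;
        long v = std::strtol(s.c_str(), &end, 10);
        if (errno == ERANGE || end == s.c_str() || v < 0)
            return false;
        result = static_cast<std::size_t>(v);
        return true;
    }

    // Expand "0-3,8,10-11" into { 0,1,2,3,8,10,11 }.
    static bool parse_affinity(const std::string& str, std::vector<std::size_t>& cores) {
        std::stringstream ss(str);
        std::string item;
        while (std::getline(ss, item, ',')) {
            std::size_t dash = item.find('-');
            std::string first = (dash == std::string::npos) ? item : item.substr(0, dash);
            std::string last = (dash == std::string::npos) ? item : item.substr(dash + 1);
            std::size_t lo, hi;
            if (!parse_core_id(first, lo) || !parse_core_id(last, hi) || lo > hi)
                return false;
            for (std::size_t c = lo; c <= hi; c++)
                cores.push_back(c);
        }
        return true;
    }

    int main() {
        std::vector<std::size_t> cores;
        if (parse_affinity("0-3,8,10-11", cores))
            for (std::size_t c : cores)
                std::cout << c << ' ';
    }
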
diff --git a/src/common/env/env.hpp b/src/common/env/env.hpp
index 102000187..00e708de2 100644
--- a/src/common/env/env.hpp
+++ b/src/common/env/env.hpp
@@ -49,9 +49,14 @@ constexpr const char* I_MPI_AVAILABLE_CORES_DELIMS = ",x";
 
 constexpr const char* CCL_ATL_TRANSPORT = "CCL_ATL_TRANSPORT";
 constexpr const char* CCL_ATL_SHM = "CCL_ATL_SHM";
+constexpr const char* CCL_ATL_RMA = "CCL_ATL_RMA";
+constexpr const char* CCL_ATL_DEVICE_BUF = "CCL_ATL_DEVICE_BUF";
 constexpr const char* CCL_ATL_SYNC_COLL = "CCL_ATL_SYNC_COLL";
 constexpr const char* CCL_ATL_EXTRA_EP = "CCL_ATL_EXTRA_EP";
 
+constexpr const char* CCL_MNIC = "CCL_MNIC";
+constexpr const char* CCL_MNIC_COUNT = "CCL_MNIC_COUNT";
+
 constexpr const char* CCL_ALLGATHERV = "CCL_ALLGATHERV";
 constexpr const char* CCL_ALLREDUCE = "CCL_ALLREDUCE";
 constexpr const char* CCL_ALLTOALL = "CCL_ALLTOALL";
@@ -69,7 +74,6 @@ constexpr const char* CCL_FUSION_COUNT_THRESHOLD = "CCL_FUSION_COUNT_THRESHOLD";
 constexpr const char* CCL_FUSION_CHECK_URGENT = "CCL_FUSION_CHECK_URGENT";
 constexpr const char* CCL_FUSION_CYCLE_MS = "CCL_FUSION_CYCLE_MS";
 
-constexpr const char* CCL_RMA = "CCL_RMA";
 constexpr const char* CCL_PRIORITY = "CCL_PRIORITY";
 constexpr const char* CCL_SPIN_COUNT = "CCL_SPIN_COUNT";
 constexpr const char* CCL_YIELD = "CCL_YIELD";
@@ -95,32 +99,17 @@ constexpr const char* CCL_ALLTOALL_SCATTER_PLAIN = "CCL_ALLTOALL_SCATTER_PLAIN";
 
 constexpr const char* CCL_COMM_KERNELS = "CCL_COMM_KERNELS";
 constexpr const char* CCL_COMM_KERNELS_PATH = "CCL_COMM_KERNELS_PATH";
+constexpr const char* CCL_COMM_KERNELS_DEBUG = "CCL_COMM_KERNELS_DEBUG";
 constexpr const char* CCL_GPU_THREAD_COUNT = "CCL_GPU_THREAD_COUNT";
 
 constexpr const char* CCL_BF16 = "CCL_BF16";
 constexpr const char* CCL_FP16 = "CCL_FP16";
 
-enum ccl_priority_mode {
-    ccl_priority_none,
-    ccl_priority_direct,
-    ccl_priority_lifo,
-
-    ccl_priority_last_value
-};
-
-enum ccl_atl_transport {
-    ccl_atl_ofi,
-    ccl_atl_mpi,
-
-    ccl_atl_last_value
-};
+enum ccl_priority_mode { ccl_priority_none, ccl_priority_direct, ccl_priority_lifo };
 
-enum ccl_staging_buffer {
-    ccl_staging_regular,
-    ccl_staging_usm,
+enum ccl_atl_transport { ccl_atl_ofi, ccl_atl_mpi };
 
-    ccl_staging_last_value
-};
+enum ccl_staging_buffer { ccl_staging_regular, ccl_staging_usm };
 
 namespace ccl {
 
@@ -154,8 +143,13 @@ class env_data {
 
     ccl_atl_transport atl_transport;
     int enable_shm;
-    int sync_coll;
-    int extra_ep;
+    int enable_rma;
+    int enable_device_buf;
+    int enable_sync_coll;
+    int enable_extra_ep;
+
+    atl_mnic_t mnic_type;
+    ssize_t mnic_count;
 
     /*
        parsing logic can be quite complex
@@ -179,7 +173,6 @@ class env_data {
     int fusion_check_urgent;
     float fusion_cycle_ms;
 
-    int enable_rma;
     ccl_priority_mode priority_mode;
     size_t spin_count;
     ccl_yield_type yield_type;
@@ -205,6 +198,7 @@ class env_data {
 
     int enable_comm_kernels;
     std::string comm_kernels_path;
+    int comm_kernels_debug;
     ssize_t gpu_thread_count;
 
     ccl_bf16_impl_type bf16_impl_type;
@@ -280,12 +274,14 @@ class env_data {
     static std::map<ccl_priority_mode, std::string> priority_mode_names;
     static std::map<ccl_atl_transport, std::string> atl_transport_names;
     static std::map<ccl_staging_buffer, std::string> staging_buffer_names;
+    static std::map<atl_mnic_t, std::string> mnic_type_names;
 
     int env_2_worker_affinity(size_t local_proc_idx, size_t local_proc_count);
     void env_2_atl_transport();
 
 private:
     int env_2_worker_affinity_auto(size_t local_proc_idx, size_t workers_per_process);
+    int parse_core_id(const std::string& core_id_str, size_t& result);
 };
 
 } /* namespace ccl */
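
Read alongside the env.cpp changes above, the new knobs compose as follows: CCL_ATL_RMA and CCL_ATL_DEVICE_BUF toggle the corresponding ATL capabilities, CCL_MNIC selects a multi-NIC mode (none, local, or global), and CCL_MNIC_COUNT caps the number of NICs used, defaulting to the worker count when left unset.
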
diff --git a/src/common/event/ccl_event.cpp b/src/common/event/ccl_event.cpp
index af63d787b..855536d07 100644
--- a/src/common/event/ccl_event.cpp
+++ b/src/common/event/ccl_event.cpp
@@ -23,39 +23,6 @@ ccl_event::ccl_event(event_native_t& event, const ccl::library_version& version)
           command_type_val(),
           command_execution_status_val() {}
 
-ccl_event::ccl_event(event_native_handle_t event,
-                     event_native_context_t context,
-                     const ccl::library_version& version)
-        : version(version),
-          command_type_val(),
-          command_execution_status_val() {
-#ifdef CCL_ENABLE_SYCL
-    native_event = event_native_t{ event, context };
-#else
-    //TODO
-    throw;
-#endif
-}
-
-void ccl_event::build_from_params() {
-    if (!creation_is_postponed) {
-        throw ccl::exception("error");
-    }
-#ifdef CCL_ENABLE_SYCL
-    /* TODO unavailbale??
-    event_native_t event_candidate{native_context};
-    std::swap(event_candidate, native_event); //TODO USE attributes fro sycl queue construction
-    */
-
-    throw ccl::exception("build_from_attr is not availbale for sycl::event");
-#else
-
-    //TODO use attributes
-
-#endif
-    creation_is_postponed = false;
-}
-
 //Export Attributes
 typename ccl_event::version_traits_t::type ccl_event::set_attribute_value(
     typename version_traits_t::type val,
@@ -75,11 +42,6 @@ typename ccl_event::native_handle_traits_t::return_type& ccl_event::get_attribut
     return native_event;
 }
 
-typename ccl_event::context_traits_t::return_type& ccl_event::get_attribute_value(
-    const context_traits_t& id) {
-    return native_context;
-}
-
 typename ccl_event::command_type_traits_t::type ccl_event::set_attribute_value(
     typename command_type_traits_t::type val,
     const command_type_traits_t& t) {
diff --git a/src/common/event/ccl_event.hpp b/src/common/event/ccl_event.hpp
index 9185e8912..281fc2263 100644
--- a/src/common/event/ccl_event.hpp
+++ b/src/common/event/ccl_event.hpp
@@ -33,17 +33,11 @@ class alignas(CACHELINE_SIZE) ccl_event {
     using event_native_handle_t = typename ccl::unified_event_type::handle_t;
     using event_native_t = typename ccl::unified_event_type::ccl_native_t;
 
-    using event_native_context_handle_t = typename ccl::unified_context_type::handle_t;
-    using event_native_context_t = typename ccl::unified_context_type::ccl_native_t;
-
     ccl_event() = delete;
     ccl_event(const ccl_event& other) = delete;
     ccl_event& operator=(const ccl_event& other) = delete;
 
     ccl_event(event_native_t& event, const ccl::library_version& version);
-    ccl_event(event_native_handle_t event,
-              event_native_context_t context,
-              const ccl::library_version& version);
     ~ccl_event() = default;
 
     //Export Attributes
@@ -61,10 +55,6 @@ class alignas(CACHELINE_SIZE) ccl_event {
     typename native_handle_traits_t::return_type& get_attribute_value(
         const native_handle_traits_t& id);
 
-    using context_traits_t =
-        ccl::detail::ccl_api_type_attr_traits<ccl::event_attr_id, ccl::event_attr_id::context>;
-    typename context_traits_t::return_type& get_attribute_value(const context_traits_t& id);
-
     using command_type_traits_t =
         ccl::detail::ccl_api_type_attr_traits<ccl::event_attr_id, ccl::event_attr_id::command_type>;
     typename command_type_traits_t::return_type set_attribute_value(
@@ -89,7 +79,6 @@ class alignas(CACHELINE_SIZE) ccl_event {
 private:
     const ccl::library_version version;
     event_native_t native_event;
-    event_native_context_t native_context;
     bool creation_is_postponed{ false };
 
     typename command_type_traits_t::return_type command_type_val;
diff --git a/src/common/framework/framework.hpp b/src/common/framework/framework.hpp
index 7dddd4cda..8b00d88ea 100644
--- a/src/common/framework/framework.hpp
+++ b/src/common/framework/framework.hpp
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <map>
+#include <string>
 
 typedef int (*ccl_horovod_init_function)(const int*, int);
 extern ccl_horovod_init_function horovod_init_function;
diff --git a/src/common/request/request.cpp b/src/common/request/request.cpp
index b460fd3c9..d34f46fa6 100644
--- a/src/common/request/request.cpp
+++ b/src/common/request/request.cpp
@@ -27,7 +27,7 @@ ccl_request::~ccl_request() {
     auto counter = completion_counter.load(std::memory_order_acquire);
     LOG_DEBUG("delete req ", this, " with counter ", counter);
     if (counter != 0 && !ccl::global_data::get().is_ft_enabled) {
-        LOG_ERROR("unexpected completion_counter ", counter);
+        LOG_WARN("unexpected completion_counter ", counter);
     }
 }
 
diff --git a/src/common/utils/tuple.hpp b/src/common/utils/tuple.hpp
index 327c274ca..2159324bc 100644
--- a/src/common/utils/tuple.hpp
+++ b/src/common/utils/tuple.hpp
@@ -122,12 +122,12 @@ void ccl_tuple_for_each_args(specific_tuple&& t, functor&& f, args_t&&... args)
 }
 
 template <typename specific_tuple, size_t cur_index, typename functor, class... FunctionArgs>
-void ccl_tuple_for_each_indexed_impl(functor,
+void ccl_tuple_for_each_indexed_impl(functor&,
                                      std::true_type tuple_finished,
                                      const FunctionArgs&... args) {}
 
 template <typename specific_tuple, size_t cur_index, typename functor, class... FunctionArgs>
-void ccl_tuple_for_each_indexed_impl(functor f,
+void ccl_tuple_for_each_indexed_impl(functor& f,
                                      std::false_type tuple_not_finished,
                                      const FunctionArgs&... args) {
     using tuple_element_t = typename std::tuple_element<cur_index, specific_tuple>::type;
@@ -144,7 +144,7 @@ void ccl_tuple_for_each_indexed_impl(functor f,
 }
 
 template <typename specific_tuple, typename functor, class... FunctionArgs>
-void ccl_tuple_for_each_indexed(functor f, const FunctionArgs&... args) {
+void ccl_tuple_for_each_indexed(functor& f, const FunctionArgs&... args) {
     constexpr std::size_t tuple_size =
         std::tuple_size<typename std::remove_reference<specific_tuple>::type>::value;
     static_assert(tuple_size != 0, "Nothing to do, tuple is empty");
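
Taking the functor by reference here is a behavioral fix, not a style one: ccl_tuple_for_each_indexed recurses once per tuple element, and a by-value functor would hand each recursion step a fresh copy, discarding any state the previous step accumulated. A compact illustration of why the reference matters (a simplified for-each, not the CCL implementation):

    #include <iostream>
    #include <tuple>
    #include <utility>

    // Apply f to each element; f is taken by reference so state persists
    // across elements instead of mutating a per-step copy.
    template <typename Tuple, typename F, std::size_t... Is>
    void for_each_impl(const Tuple& t, F& f, std::index_sequence<Is...>) {
        (f(std::get<Is>(t)), ...);
    }

    template <typename Tuple, typename F>
    void for_each(const Tuple& t, F& f) {
        for_each_impl(t, f, std::make_index_sequence<std::tuple_size<Tuple>::value>{});
    }

    struct counter {
        int calls = 0;
        template <typename T>
        void operator()(const T&) {
            ++calls;
        }
    };

    int main() {
        counter c;
        auto t = std::make_tuple(1, 2.0, 'x');
        for_each(t, c);
        std::cout << c.calls << '\n'; // 3: state survived because f was a reference
    }
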
diff --git a/src/common/utils/utils.hpp b/src/common/utils/utils.hpp
index 4be5d3c3d..ab3412570 100644
--- a/src/common/utils/utils.hpp
+++ b/src/common/utils/utils.hpp
@@ -57,8 +57,10 @@
 #endif
 
 #define CACHELINE_SIZE 64
-#define ONE_MB         1048576
-#define TWO_MB         2097152
+
+#define CCL_REG_MSG_ALIGNMENT   (4096)
+#define CCL_LARGE_MSG_ALIGNMENT (2 * 1024 * 1024)
+#define CCL_LARGE_MSG_THRESHOLD (1 * 1024 * 1024)
 
 #define CCL_MEMCPY(dest, src, n) std::copy((char*)(src), (char*)(src) + (n), (char*)(dest))
 
@@ -103,7 +105,10 @@
 
 #define CCL_MALLOC_WRAPPER(size, name) \
     ({ \
-        void* ptr = CCL_MEMALIGN_IMPL(size, (size < TWO_MB) ? CACHELINE_SIZE : TWO_MB); \
+        size_t alignment = CCL_REG_MSG_ALIGNMENT; \
+        if (size >= CCL_LARGE_MSG_THRESHOLD) \
+            alignment = CCL_LARGE_MSG_ALIGNMENT; \
+        void* ptr = CCL_MEMALIGN_IMPL(size, alignment); \
         CCL_THROW_IF_NOT(ptr, "CCL cannot allocate bytes: ", size, ", out of memory, ", name); \
         ptr; \
     })
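
The fixed ONE_MB/TWO_MB constants give way to an explicit policy: 4 KiB alignment for ordinary buffers and 2 MiB alignment once a message reaches 1 MiB, which lines up allocations with page and huge-page boundaries. A sketch of the same policy over posix_memalign (assuming CCL_MEMALIGN_IMPL behaves like an aligned allocator):

    #include <cstdio>
    #include <cstdlib>

    // Mirror of the macro's policy: page-align small buffers,
    // 2 MiB-align buffers at or above the 1 MiB threshold.
    static void* ccl_like_alloc(std::size_t size) {
        const std::size_t reg_align = 4096;
        const std::size_t large_align = 2 * 1024 * 1024;
        const std::size_t large_threshold = 1 * 1024 * 1024;
        std::size_t alignment = (size >= large_threshold) ? large_align : reg_align;
        void* ptr = nullptr;
        if (posix_memalign(&ptr, alignment, size) != 0)
            return nullptr;
        return ptr;
    }

    int main() {
        void* p = ccl_like_alloc(2 * 1024 * 1024);
        std::printf("%p\n", p);
        std::free(p);
    }
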
diff --git a/src/comp/bf16/bf16.cpp b/src/comp/bf16/bf16.cpp
index 3f46792cc..116e8ec14 100644
--- a/src/comp/bf16/bf16.cpp
+++ b/src/comp/bf16/bf16.cpp
@@ -54,7 +54,7 @@ void ccl_bf16_reduce(const void* in_buf,
 void ccl_convert_fp32_to_bf16(const void* src, void* dst) {
 #ifdef CCL_BF16_AVX512BF_COMPILER
     if (ccl::global_data::env().bf16_impl_type == ccl_bf16_avx512bf) {
-        _mm256_storeu_si256((__m256i*)(dst), _mm512_cvtneps_pbh(_mm512_loadu_ps(src)));
+        _mm256_storeu_si256((__m256i*)(dst), (__m256i)_mm512_cvtneps_pbh(_mm512_loadu_ps(src)));
     }
     else
 #endif
@@ -120,15 +120,15 @@ void ccl_bf16_reduce(const void* in_buf,
                      void* inout_buf,
                      size_t* out_cnt,
                      ccl::reduction reduction_op) {
-    CCL_FATAL("BF16 reduction is requested but CCL was compiled w/o BF16 support");
+    CCL_FATAL("BF16 reduction was requested but CCL was compiled w/o BF16 support");
 }
 
 void ccl_convert_fp32_to_bf16_arrays(void* fp32_buf, void* bf16_buf, size_t count) {
-    CCL_FATAL("BF16 reduction is requested but CCL was compiled w/o BF16 support");
+    CCL_FATAL("FP32->BF16 conversion was requested but CCL was compiled w/o BF16 support");
 }
 
 void ccl_convert_bf16_to_fp32_arrays(void* bf16_buf, float* fp32_buf, size_t count) {
-    CCL_FATAL("BF16 reduction is requested but CCL was compiled w/o BF16 support");
+    CCL_FATAL("BF16->FP32 conversion was requested but CCL was compiled w/o BF16 support");
 }
 
 #endif /* CCL_BF16_COMPILER */
diff --git a/src/comp/bf16/bf16.hpp b/src/comp/bf16/bf16.hpp
index 229841d28..18d6c797b 100644
--- a/src/comp/bf16/bf16.hpp
+++ b/src/comp/bf16/bf16.hpp
@@ -26,13 +26,13 @@ __attribute__((target("avx512bw,avx512vl")))
 void ccl_bf16_reduce(const void* in_buf, size_t in_cnt,
                      void* inout_buf, size_t* out_cnt,
                      ccl::reduction reduction_op);
-#else
+#else /* CCL_BF16_TARGET_ATTRIBUTES */
 void ccl_bf16_reduce(const void* in_buf,
                      size_t in_cnt,
                      void* inout_buf,
                      size_t* out_cnt,
                      ccl::reduction reduction_op);
-#endif
+#endif /* CCL_BF16_TARGET_ATTRIBUTES */
 
 void ccl_convert_fp32_to_bf16_arrays(void*, void*, size_t);
 void ccl_convert_bf16_to_fp32_arrays(void*, float*, size_t);
@@ -46,7 +46,7 @@ void ccl_convert_fp32_to_bf16(const void* src, void* dst)
 #else
 void ccl_convert_fp32_to_bf16(const void* src, void* dst) __attribute__((target("avx512bw")));
 #endif
-#endif
+#endif /* CCL_BF16_TARGET_ATTRIBUTES */
 
 #ifdef CCL_BF16_TARGET_ATTRIBUTES
 #ifdef CCL_BF16_AVX512BF_COMPILER
@@ -55,6 +55,6 @@ void ccl_convert_bf16_to_fp32(const void* src, void* dst)
 #else
 void ccl_convert_bf16_to_fp32(const void* src, void* dst) __attribute__((target("avx512bw")));
 #endif
-#endif
+#endif /* CCL_BF16_TARGET_ATTRIBUTES */
 
 #endif /* CCL_BF16_COMPILER */
diff --git a/src/comp/bf16/bf16_intrisics.hpp b/src/comp/bf16/bf16_intrisics.hpp
index 4455113c8..e452aab9c 100644
--- a/src/comp/bf16/bf16_intrisics.hpp
+++ b/src/comp/bf16/bf16_intrisics.hpp
@@ -71,7 +71,7 @@ BF16_INLINE_TARGET_ATTRIBUTE_BW void ccl_fp32_store_as_bf16_avx512f(const void*
 
 #ifdef CCL_BF16_AVX512BF_COMPILER
 BF16_INLINE_TARGET_ATTRIBUTE void ccl_fp32_store_as_bf16_avx512bf(const void* src, void* dst) {
-    _mm256_storeu_si256((__m256i*)(dst), _mm512_cvtneps_pbh(_mm512_loadu_ps(src)));
+    _mm256_storeu_si256((__m256i*)(dst), (__m256i)_mm512_cvtneps_pbh(_mm512_loadu_ps(src)));
 }
 #endif
 
@@ -96,9 +96,11 @@ BF16_INLINE_TARGET_ATTRIBUTE void ccl_fp32_store_as_bf16_avx512bf(const void* sr
         if (len == 0) \
             return; \
         uint16_t mask = ((uint16_t)0xFFFF) >> (CCL_BF16_IN_M256 - len); \
-        __m256i vbf16_out; \
-        ccl_bf16_reduce_inputs_##impl_type(in, inout, (void*)&vbf16_out, op); \
-        _mm256_mask_storeu_epi16(inout, (__mmask16)mask, vbf16_out); \
+        __m256i a = _mm256_maskz_loadu_epi16(mask, in); \
+        __m256i b = _mm256_maskz_loadu_epi16(mask, inout); \
+        __m256i res; \
+        ccl_bf16_reduce_inputs_##impl_type(&a, &b, &res, op); \
+        _mm256_mask_storeu_epi16(inout, (__mmask16)mask, res); \
     } \
 \
     BF16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_bf16_reduce_impl_##impl_type( \
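
The old tail path handed the raw in/inout pointers straight to the 256-bit reduce, which reads a full vector even when fewer than 16 elements remain; the masked _mm256_maskz_loadu_epi16 loads keep the final partial vector from ever touching memory past the buffer end. The same pattern in isolation, reduced to a masked copy (requires AVX512BW and AVX512VL; not the bf16 reduction itself):

    #include <immintrin.h>
    #include <cstdint>
    #include <cstdio>

    // Copy len (1..16) uint16 elements with masked 256-bit load/store:
    // lanes outside the mask are neither read nor written.
    __attribute__((target("avx512bw,avx512vl")))
    void copy_tail(const std::uint16_t* in, std::uint16_t* out, int len) {
        std::uint16_t mask = ((std::uint16_t)0xFFFF) >> (16 - len);
        __m256i v = _mm256_maskz_loadu_epi16((__mmask16)mask, in);
        _mm256_mask_storeu_epi16(out, (__mmask16)mask, v);
    }

    int main() {
        std::uint16_t in[5] = { 1, 2, 3, 4, 5 }, out[5] = {};
        copy_tail(in, out, 5);
        std::printf("%u %u\n", out[0], out[4]);
    }
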
diff --git a/src/comp/comp.cpp b/src/comp/comp.cpp
index f9eb9905a..8bee9f353 100644
--- a/src/comp/comp.cpp
+++ b/src/comp/comp.cpp
@@ -20,6 +20,11 @@
 #include "common/log/log.hpp"
 #include "common/global/global.hpp"
 #include "common/utils/enums.hpp"
+#include "sched/queue/queue.hpp"
+
+#ifdef CCL_ENABLE_SYCL
+#include <CL/sycl.hpp>
+#endif /* CCL_ENABLE_SYCL */
 
 #define CCL_REDUCE(type) \
     do { \
@@ -60,14 +65,14 @@ ccl::status ccl_comp_copy(const void* in_buf,
     return ccl::status::success;
 }
 
-ccl::status ccl_comp_reduce(const void* in_buf,
-                            size_t in_count,
-                            void* inout_buf,
-                            size_t* out_count,
-                            const ccl_datatype& dtype,
-                            ccl::reduction reduction,
-                            ccl::reduction_fn reduction_fn,
-                            const ccl::fn_context* context) {
+ccl::status ccl_comp_reduce_regular(const void* in_buf,
+                                    size_t in_count,
+                                    void* inout_buf,
+                                    size_t* out_count,
+                                    const ccl_datatype& dtype,
+                                    ccl::reduction reduction,
+                                    ccl::reduction_fn reduction_fn,
+                                    const ccl::fn_context* context) {
     if (reduction == ccl::reduction::custom) {
         CCL_THROW_IF_NOT(reduction_fn, "custom reduction requires user callback");
         reduction_fn(in_buf, in_count, inout_buf, out_count, dtype.idx(), context);
@@ -97,6 +102,76 @@ ccl::status ccl_comp_reduce(const void* in_buf,
     return ccl::status::success;
 }
 
+ccl::status ccl_comp_reduce(ccl_sched* sched,
+                            const void* in_buf,
+                            size_t in_count,
+                            void* inout_buf,
+                            size_t* out_count,
+                            const ccl_datatype& dtype,
+                            ccl::reduction reduction,
+                            ccl::reduction_fn reduction_fn,
+                            const ccl::fn_context* context) {
+#ifdef CCL_ENABLE_SYCL
+    ccl_stream* stream = (ccl_stream*)sched->coll_param.stream;
+
+    if (!stream) {
+        return ccl_comp_reduce_regular(
+            in_buf, in_count, inout_buf, out_count, dtype, reduction, reduction_fn, context);
+    }
+
+    sycl::queue* q = stream->get_native_stream(sched->queue->get_idx());
+    CCL_THROW_IF_NOT(q, "null sycl queue");
+    auto in_ptr_type = sycl::get_pointer_type(in_buf, q->get_context());
+    auto inout_ptr_type = sycl::get_pointer_type(inout_buf, q->get_context());
+
+    LOG_DEBUG("in_ptr_type: ",
+              native::detail::usm_to_string(in_ptr_type),
+              ", inout_ptr_type: ",
+              native::detail::usm_to_string(inout_ptr_type),
+              ", native_stream: ",
+              stream->to_string(),
+              ", in_count: ",
+              in_count);
+
+    if ((in_ptr_type != sycl::usm::alloc::device) && (inout_ptr_type != sycl::usm::alloc::device)) {
+        return ccl_comp_reduce_regular(
+            in_buf, in_count, inout_buf, out_count, dtype, reduction, reduction_fn, context);
+    }
+
+    void* host_in_buf = (void*)in_buf;
+    void* host_inout_buf = inout_buf;
+    size_t bytes = in_count * dtype.size();
+
+    if (in_ptr_type == sycl::usm::alloc::device) {
+        host_in_buf = CCL_MALLOC(bytes, "host_in_buf");
+        q->memcpy(host_in_buf, in_buf, bytes).wait();
+    }
+
+    if (inout_ptr_type == sycl::usm::alloc::device) {
+        host_inout_buf = CCL_MALLOC(bytes, "host_inout_buf");
+        q->memcpy(host_inout_buf, inout_buf, bytes).wait();
+    }
+
+    ccl_comp_reduce_regular(
+        host_in_buf, in_count, host_inout_buf, out_count, dtype, reduction, reduction_fn, context);
+
+    if (host_in_buf != in_buf) {
+        CCL_FREE(host_in_buf);
+    }
+
+    if (host_inout_buf != inout_buf) {
+        q->memcpy(inout_buf, host_inout_buf, bytes).wait();
+        CCL_FREE(host_inout_buf);
+    }
+
+    return ccl::status::success;
+
+#else /* CCL_ENABLE_SYCL */
+    return ccl_comp_reduce_regular(
+        in_buf, in_count, inout_buf, out_count, dtype, reduction, reduction_fn, context);
+#endif /* CCL_ENABLE_SYCL */
+}
+
 ccl::status ccl_comp_batch_reduce(const void* in_buf,
                                   const std::vector<size_t>& offsets,
                                   size_t in_count,
@@ -118,28 +193,28 @@ ccl::status ccl_comp_batch_reduce(const void* in_buf,
         for (size_t i = 1; i < offsets.size(); i++) {
             ccl_convert_bf16_to_fp32_arrays(
                 (char*)in_buf + dtype.size() * offsets[i], tmp, in_count);
-            ccl_comp_reduce(tmp,
-                            in_count,
-                            acc,
-                            out_count,
-                            ccl::global_data::get().dtypes->get(ccl::datatype::float32),
-                            reduction,
-                            reduction_fn,
-                            context);
+            ccl_comp_reduce_regular(tmp,
+                                    in_count,
+                                    acc,
+                                    out_count,
+                                    ccl::global_data::get().dtypes->get(ccl::datatype::float32),
+                                    reduction,
+                                    reduction_fn,
+                                    context);
         }
 
         ccl_convert_fp32_to_bf16_arrays(acc, inout_buf, in_count);
     }
     else {
         for (size_t i = 1; i < offsets.size(); i++) {
-            ccl_comp_reduce((char*)in_buf + dtype.size() * offsets[i],
-                            in_count,
-                            inout_buf,
-                            out_count,
-                            dtype,
-                            reduction,
-                            reduction_fn,
-                            context);
+            ccl_comp_reduce_regular((char*)in_buf + dtype.size() * offsets[i],
+                                    in_count,
+                                    inout_buf,
+                                    out_count,
+                                    dtype,
+                                    reduction,
+                                    reduction_fn,
+                                    context);
         }
     }
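
ccl_comp_reduce now branches on the USM kind of each pointer: host-accessible buffers go straight to ccl_comp_reduce_regular, while device allocations are staged through temporary host copies around the host-side reduction. A condensed sketch of that staging pattern (assuming a valid SYCL queue, a device-USM inout buffer, and a plain sum standing in for the dtype-dispatched reduction):

    #include <CL/sycl.hpp>
    #include <algorithm>
    #include <vector>

    // Stage device-USM buffers through the host, reduce there, copy back.
    void reduce_on_host(sycl::queue& q, const float* in_buf, float* inout_buf,
                        std::size_t count) {
        auto kind = sycl::get_pointer_type(in_buf, q.get_context());
        std::vector<float> in(count), inout(count);
        if (kind == sycl::usm::alloc::device)
            q.memcpy(in.data(), in_buf, count * sizeof(float)).wait();
        else
            std::copy(in_buf, in_buf + count, in.begin());
        q.memcpy(inout.data(), inout_buf, count * sizeof(float)).wait();
        for (std::size_t i = 0; i < count; i++)
            inout[i] += in[i]; // stand-in for the real reduction
        q.memcpy(inout_buf, inout.data(), count * sizeof(float)).wait();
    }
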
 
diff --git a/src/comp/comp.hpp b/src/comp/comp.hpp
index 10170717d..fd9124d51 100644
--- a/src/comp/comp.hpp
+++ b/src/comp/comp.hpp
@@ -16,14 +16,16 @@
 #pragma once
 
 #include "common/datatype/datatype.hpp"
-#include "oneapi/ccl/types.hpp"
 #include "internal_types.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "sched/sched.hpp"
 
 ccl::status ccl_comp_copy(const void* in_buf,
                           void* out_buf,
                           size_t count,
                           const ccl_datatype& dtype);
-ccl::status ccl_comp_reduce(const void* in_buf,
+ccl::status ccl_comp_reduce(ccl_sched* sched,
+                            const void* in_buf,
                             size_t in_count,
                             void* inout_buf,
                             size_t* out_count,
diff --git a/src/comp/fp16/fp16.cpp b/src/comp/fp16/fp16.cpp
index 74ccf0cea..f9ac7641f 100644
--- a/src/comp/fp16/fp16.cpp
+++ b/src/comp/fp16/fp16.cpp
@@ -65,7 +65,15 @@ void ccl_fp16_reduce(const void* in_buf,
                      void* inout_buf,
                      size_t* out_cnt,
                      ccl::reduction op) {
-    CCL_FATAL("FP16 reduction is requested but CCL was compiled w/o FP16 support");
+    CCL_FATAL("FP16 reduction was requested but CCL was compiled w/o FP16 support");
+}
+
+void ccl_convert_fp32_to_fp16(const void* src, void* dst) {
+    CCL_FATAL("FP32->FP16 conversion was requested but CCL was compiled w/o FP16 support");
+}
+
+void ccl_convert_fp16_to_fp32(const void* src, void* dst) {
+    CCL_FATAL("FP16->FP32 conversion was requested but CCL was compiled w/o FP16 support");
 }
 
 #endif /* CCL_FP16_COMPILER */
diff --git a/src/comp/fp16/fp16.hpp b/src/comp/fp16/fp16.hpp
index e5cd37d9b..62e66ac3d 100644
--- a/src/comp/fp16/fp16.hpp
+++ b/src/comp/fp16/fp16.hpp
@@ -18,22 +18,19 @@
 #include "oneapi/ccl/types.hpp"
 
 #ifdef CCL_FP16_TARGET_ATTRIBUTES
-__attribute__((target("avx512f,f16c"))) void ccl_fp16_reduce(const void* in_buf,
-                                                             size_t in_cnt,
-                                                             void* inout_buf,
-                                                             size_t* out_cnt,
-                                                             ccl::reduction reduction_op);
+__attribute__((target("avx512bw,avx512vl,f16c"))) void ccl_fp16_reduce(const void* in_buf,
+                                                                       size_t in_cnt,
+                                                                       void* inout_buf,
+                                                                       size_t* out_cnt,
+                                                                       ccl::reduction reduction_op);
+__attribute__((target("f16c"))) void ccl_convert_fp32_to_fp16(const void* src, void* dst);
+__attribute__((target("f16c"))) void ccl_convert_fp16_to_fp32(const void* src, void* dst);
 #else /* CCL_FP16_TARGET_ATTRIBUTES */
 void ccl_fp16_reduce(const void* in_buf,
                      size_t in_cnt,
                      void* inout_buf,
                      size_t* out_cnt,
                      ccl::reduction reduction_op);
+void ccl_convert_fp32_to_fp16(const void* src, void* dst);
+void ccl_convert_fp16_to_fp32(const void* src, void* dst);
 #endif /* CCL_FP16_TARGET_ATTRIBUTES */
-
-#ifdef CCL_FP16_COMPILER
-#ifdef CCL_FP16_TARGET_ATTRIBUTES
-void ccl_convert_fp32_to_fp16(const void* src, void* dst) __attribute__((target("f16c")));
-void ccl_convert_fp16_to_fp32(const void* src, void* dst) __attribute__((target("f16c")));
-#endif /* CCL_FP16_TARGET_ATTRIBUTES */
-#endif /* CCL_FP16_COMPILER */
diff --git a/src/comp/fp16/fp16_intrisics.hpp b/src/comp/fp16/fp16_intrisics.hpp
index 3b6ad8faf..4b2f4b28e 100644
--- a/src/comp/fp16/fp16_intrisics.hpp
+++ b/src/comp/fp16/fp16_intrisics.hpp
@@ -19,6 +19,7 @@
 
 #include <immintrin.h>
 #include <inttypes.h>
+#include <string.h>
 
 #include "common/global/global.hpp"
 #include "comp/fp16/fp16_utils.hpp"
@@ -28,7 +29,7 @@
 #define CCL_FP16_STEP_256 8
 
 #ifdef CCL_FP16_TARGET_ATTRIBUTES
-#define FP16_ALL_ATTRS                    "f16c,avx512f"
+#define FP16_ALL_ATTRS                    "f16c,avx512f,avx512bw,avx512vl"
 #define FP16_TARGET_ATTRIBUTE_F16C        __attribute__((target("f16c")))
 #define FP16_TARGET_ATTRIBUTE_AVX512      __attribute__((target("avx512f")))
 #define FP16_TARGET_ATTRIBUTE_ALL         __attribute__((target(FP16_ALL_ATTRS)))
@@ -91,6 +92,21 @@ FP16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_fp16_reduce_inputs_256(
     _mm_storeu_si128((__m128i*)(res), _mm256_cvtps_ph(vfp32_out, 0));
 }
 
+FP16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_fp16_reduce_tile_256(const void* in,
+                                                               void* inout,
+                                                               uint8_t len,
+                                                               ccl_fp16_reduction_func_ptr_256 op) {
+    if (len == 0)
+        return;
+    uint16_t a[CCL_FP16_STEP_256];
+    uint16_t b[CCL_FP16_STEP_256];
+    uint16_t res[CCL_FP16_STEP_256];
+    memcpy(a, in, len * sizeof(uint16_t));
+    memcpy(b, inout, len * sizeof(uint16_t));
+    ccl_fp16_reduce_inputs_256(a, b, res, op);
+    memcpy(inout, res, len * sizeof(uint16_t));
+}
+
 FP16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_fp16_reduce_inputs_512(
     const void* a,
     const void* b,
@@ -103,24 +119,26 @@ FP16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_fp16_reduce_inputs_512(
     _mm256_storeu_si256((__m256i*)(res), _mm512_cvtps_ph(vfp32_out, 0));
 }
 
+FP16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_fp16_reduce_tile_512(const void* in,
+                                                               void* inout,
+                                                               uint8_t len,
+                                                               ccl_fp16_reduction_func_ptr_512 op) {
+    if (len == 0)
+        return;
+    uint16_t mask = ((uint16_t)0xFFFF) >> (CCL_FP16_STEP_512 - len);
+    __m256i a = _mm256_maskz_loadu_epi16(mask, in);
+    __m256i b = _mm256_maskz_loadu_epi16(mask, inout);
+    __m256i res;
+    ccl_fp16_reduce_inputs_512(&a, &b, &res, op);
+    _mm256_mask_storeu_epi16(inout, (__mmask16)mask, res);
+}
+
 #define CCL_FP16_DEFINE_REDUCE_FUNC(VLEN) \
 \
     FP16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_fp16_reduce_main_##VLEN( \
         const void* in, const void* inout, ccl_fp16_reduction_func_ptr_##VLEN op) { \
         ccl_fp16_reduce_inputs_##VLEN(in, inout, (void*)inout, op); \
     } \
-\
-    FP16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_fp16_reduce_tile_##VLEN( \
-        const void* in, void* inout, uint8_t len, ccl_fp16_reduction_func_ptr_##VLEN op) { \
-        if (len == 0) \
-            return; \
-        uint16_t fp16_res[CCL_FP16_STEP_##VLEN]; \
-        ccl_fp16_reduce_inputs_##VLEN(in, inout, fp16_res, op); \
-        uint16_t* inout_ptr = (uint16_t*)inout; \
-        for (int i = 0; i < len; i++) { \
-            inout_ptr[i] = fp16_res[i]; \
-        } \
-    } \
 \
     FP16_INLINE_TARGET_ATTRIBUTE_ALL void ccl_fp16_reduce_impl_##VLEN( \
         const void* in_buf, \
diff --git a/src/comp/fp16/fp16_utils.hpp b/src/comp/fp16/fp16_utils.hpp
index d24270fd7..315060c80 100644
--- a/src/comp/fp16/fp16_utils.hpp
+++ b/src/comp/fp16/fp16_utils.hpp
@@ -48,10 +48,13 @@ __attribute__((__always_inline__)) inline std::set<ccl_fp16_impl_type> ccl_fp16_
 
     /* AVX512 capabilities for FP16 implementation */
     /* CPUID.(EAX=07H, ECX=0):EBX.AVX512F  [bit 16] */
+    /* CPUID.(EAX=07H, ECX=0):EBX.AVX512BW [bit 30] */
+    /* CPUID.(EAX=07H, ECX=0):EBX.AVX512VL [bit 31] */
     __asm__ __volatile__("cpuid"
                          : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
                          : "a"(7), "c"(0));
-    is_avx512f_enabled = ((reg[1] & (1 << 16)) >> 16);
+    is_avx512f_enabled =
+        ((reg[1] & (1 << 16)) >> 16) & ((reg[1] & (1 << 30)) >> 30) & ((reg[1] & (1 << 31)) >> 31);
 
     if (is_avx512f_enabled)
         result.insert(ccl_fp16_avx512f);
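
The FP16 path now requires AVX512F, AVX512BW and AVX512VL together: bits 16, 30 and 31 of EBX from CPUID leaf 7, subleaf 0. An equivalent check through the compiler's <cpuid.h> helper (a sketch; the library itself uses the inline asm above):

    #include <cpuid.h>
    #include <cstdio>

    // AVX512F (bit 16), AVX512BW (bit 30), AVX512VL (bit 31)
    // in EBX of CPUID leaf 7, subleaf 0.
    bool has_avx512_f_bw_vl() {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
            return false;
        const unsigned int need = (1u << 16) | (1u << 30) | (1u << 31);
        return (ebx & need) == need;
    }

    int main() {
        std::printf("avx512 f+bw+vl: %d\n", has_avx512_f_bw_vl() ? 1 : 0);
    }
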
diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp
index e48540ebb..2d86bfe6e 100644
--- a/src/exec/exec.cpp
+++ b/src/exec/exec.cpp
@@ -42,18 +42,22 @@ size_t ccl_executor::calculate_atl_ep_count(size_t worker_count) {
 
 atl_attr_t ccl_executor::generate_atl_attr(const ccl::env_data& env) {
     atl_attr_t attr;
-
-    attr.ep_count = calculate_atl_ep_count(env.worker_count);
-    attr.enable_shm = env.enable_shm;
+    attr.in.enable_shm = env.enable_shm;
     /*
         TODO:
         executor may be destroyed before cached rma-based schedule made memory deregistration
         need to refactor global objects dependencies
         don't use ring_rma till that
     */
-    attr.enable_rma = 0; // env.enable_rma;
-    attr.sync_coll = env.sync_coll;
-    attr.extra_ep = env.extra_ep;
+    attr.in.enable_rma = 0; // env.enable_rma;
+    attr.in.enable_device_buf = env.enable_device_buf;
+    attr.in.enable_sync_coll = env.enable_sync_coll;
+    attr.in.enable_extra_ep = env.enable_extra_ep;
+    attr.in.ep_count = calculate_atl_ep_count(env.worker_count);
+    attr.in.mnic_type = env.mnic_type;
+    attr.in.mnic_count = env.mnic_count;
+
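+    /* clear the output section; it is filled in by the transport layer during initialization */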
+    memset(&attr.out, 0, sizeof(attr.out));
 
     return attr;
 }
@@ -147,9 +151,13 @@ ccl_executor::~ccl_executor() {
             }
             else
                 LOG_DEBUG("stopped worker # ", idx);
+        }
 
-            workers[idx].reset();
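+        /* let the worker drain its remaining local work (e.g. fusion bookkeeping) before releasing it */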
+        while (!workers[idx]->can_reset()) {
+            ccl_yield(ccl::global_data::env().yield_type);
         }
+
+        workers[idx].reset();
     }
 }
 
diff --git a/src/exec/thread/base_thread.hpp b/src/exec/thread/base_thread.hpp
index 097ee8a22..85e4c7f65 100644
--- a/src/exec/thread/base_thread.hpp
+++ b/src/exec/thread/base_thread.hpp
@@ -47,6 +47,10 @@ class ccl_base_thread {
     ccl::status start(int affinity);
     ccl::status stop();
 
+    virtual bool can_reset() {
+        return true;
+    }
+
     size_t get_idx() {
         return idx;
     }
diff --git a/src/exec/thread/service_worker.cpp b/src/exec/thread/service_worker.cpp
index 5e35c9bd3..4d6aae70d 100644
--- a/src/exec/thread/service_worker.cpp
+++ b/src/exec/thread/service_worker.cpp
@@ -21,7 +21,21 @@ ccl_service_worker::ccl_service_worker(size_t idx,
         : ccl_worker(idx, std::move(data_queue)),
           fusion_manager(fusion_manager) {}
 
+ccl_service_worker::~ccl_service_worker() {
+    fusion_manager.reset();
+}
+
 ccl::status ccl_service_worker::do_work(size_t& processed_count) {
     fusion_manager.execute();
     return ccl_worker::do_work(processed_count);
 }
+
+bool ccl_service_worker::can_reset() {
+    /* skip ATL progress since ATL may already be destroyed */
+    /* perform local processing only */
+    process_atl = false;
+
+    size_t processed_count;
+    do_work(processed_count);
+    return fusion_manager.can_reset();
+}
diff --git a/src/exec/thread/service_worker.hpp b/src/exec/thread/service_worker.hpp
index 3006e470c..a9267df29 100644
--- a/src/exec/thread/service_worker.hpp
+++ b/src/exec/thread/service_worker.hpp
@@ -24,9 +24,11 @@ class ccl_service_worker : public ccl_worker {
     ccl_service_worker(size_t idx,
                        std::unique_ptr<ccl_sched_queue> data_queue,
                        ccl_fusion_manager& fusion_manager);
-    ~ccl_service_worker() = default;
+    ~ccl_service_worker();
 
-    ccl::status do_work(size_t& processed_count);
+    ccl::status do_work(size_t& processed_count) override;
+
+    bool can_reset() override;
 
 private:
     ccl_fusion_manager& fusion_manager;
diff --git a/src/exec/thread/worker.cpp b/src/exec/thread/worker.cpp
index c90ef9aaa..25eea9ea1 100644
--- a/src/exec/thread/worker.cpp
+++ b/src/exec/thread/worker.cpp
@@ -28,6 +28,7 @@ ccl_worker::ccl_worker(size_t idx, std::unique_ptr<ccl_sched_queue> queue)
         : ccl_base_thread(idx, ccl_worker_func),
           should_lock(false),
           is_locked(false),
+          process_atl(true),
           strict_sched_queue(std::unique_ptr<ccl_strict_sched_queue>(new ccl_strict_sched_queue())),
           sched_queue(std::move(queue)) {}
 
@@ -167,11 +168,13 @@ ccl::status ccl_worker::process_sched_bin(ccl_sched_bin* bin, size_t& completed_
 
     /* ensure communication progress */
 
-    for (size_t sched_idx = 0; sched_idx < 1 /*bin_size*/; sched_idx++) {
-        ccl_sched* sched = bin->get(sched_idx);
-        ccl_comm* comm = sched->coll_param.comm;
-        atl_status_t atl_status = comm->atl->atl_ep_poll(bin->get_atl_ep());
-        CCL_THROW_IF_NOT(atl_status == ATL_STATUS_SUCCESS, "bad status ", atl_status);
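+    /* atl progress can be disabled during executor shutdown, see ccl_service_worker::can_reset */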
+    if (process_atl) {
+        for (size_t sched_idx = 0; sched_idx < 1; sched_idx++) {
+            ccl_sched* sched = bin->get(sched_idx);
+            ccl_comm* comm = sched->coll_param.comm;
+            atl_status_t atl_status = comm->atl->atl_ep_poll(bin->get_atl_ep());
+            CCL_THROW_IF_NOT(atl_status == ATL_STATUS_SUCCESS, "bad status ", atl_status);
+        }
     }
 
     //    if (ccl::global_data::get().is_ft_enabled) {
diff --git a/src/exec/thread/worker.hpp b/src/exec/thread/worker.hpp
index 79c96532d..168f691ae 100644
--- a/src/exec/thread/worker.hpp
+++ b/src/exec/thread/worker.hpp
@@ -55,6 +55,7 @@ class ccl_worker : public ccl_base_thread {
 
     std::atomic<bool> should_lock;
     std::atomic<bool> is_locked;
+    bool process_atl;
 
     void update_wait_condition(ccl_base_thread::wait_data::update_type type, size_t delta);
 
diff --git a/src/fusion/fusion.cpp b/src/fusion/fusion.cpp
index 669b24085..7385f9982 100644
--- a/src/fusion/fusion.cpp
+++ b/src/fusion/fusion.cpp
@@ -123,8 +123,7 @@ ccl_fusion_manager::~ccl_fusion_manager() {
              ", overlapped_exec_calls ",
              stat_overlapped_exec_calls);
 
-    while (!tracked_scheds.empty())
-        check_tracked_scheds(true);
+    reset();
 
     CCL_ASSERT(postponed_queue.empty() && exec_queue.empty() && tracked_scheds.empty(),
                "queues are not empty, ",
@@ -135,8 +134,24 @@ ccl_fusion_manager::~ccl_fusion_manager() {
                tracked_scheds.size());
 }
 
+bool ccl_fusion_manager::can_reset() {
+    check_tracked_scheds(true);
+    return tracked_scheds.empty();
+}
+
+void ccl_fusion_manager::reset() {
+    while (!tracked_scheds.empty())
+        check_tracked_scheds(true);
+}
+
 bool ccl_fusion_manager::can_fuse(ccl_master_sched* sched) {
+    if (atl_wrapper::attr.out.enable_device_buf) {
+        /* TODO: implement fusion with D2D copies */
+        return false;
+    }
+
     size_t bytes = sched->coll_param.count * sched->coll_param.dtype.size();
+
     if (bytes >= bytes_threshold) {
         LOG_DEBUG("can't fuse due to size ", bytes, ", max ", bytes_threshold);
         return false;
@@ -147,6 +162,11 @@ bool ccl_fusion_manager::can_fuse(ccl_master_sched* sched) {
         return false;
     }
 
+    if (sched->coll_param.deps.size()) {
+        LOG_DEBUG("can't fuse due to deps size ", sched->coll_param.deps.size());
+        return false;
+    }
+
     if (sched->coll_attr.prologue_fn || sched->coll_attr.epilogue_fn ||
         sched->coll_attr.reduction_fn || sched->coll_attr.synchronous) {
         LOG_DEBUG("can't fuse due to unexpected fields in coll_attr");
@@ -165,8 +185,11 @@ bool ccl_fusion_manager::add(ccl_master_sched* sched) {
     CCL_THROW_IF_NOT(sched->is_completed(), "incorrect completion counter");
     sched->set_counter(1);
 
-    std::lock_guard<ccl_fusion_lock_t> lock{ guard };
-    postponed_queue.push_back(sched);
+    {
+        std::lock_guard<ccl_fusion_lock_t> lock{ guard };
+        postponed_queue.push_back(sched);
+    }
+
     return true;
 }
 
@@ -223,6 +246,7 @@ ccl_master_sched* ccl_fusion_manager::build_sched() {
                 coll_param.dtype = dtype;
                 coll_param.reduction = reduction;
                 coll_param.comm = comm;
+                coll_param.stream = nullptr;
                 sched = new ccl_master_sched(coll_param);
                 sched->internal_type = ccl_sched_internal_fusion;
             } break;
@@ -269,7 +293,11 @@ ccl_master_sched* ccl_fusion_manager::build_sched() {
 
     CCL_THROW_IF_NOT(sched);
 
-    tracked_scheds.push_back(sched);
+    {
+        std::lock_guard<ccl_fusion_lock_t> lock{ guard };
+        tracked_scheds.push_back(sched);
+    }
+
     sched->coll_attr.priority = max_priority;
     sched->coll_attr.to_cache = use_cache;
 
@@ -312,9 +340,10 @@ ccl_master_sched* ccl_fusion_manager::build_sched() {
             size_t global_copy_idx = idx * copies_per_part + copy_idx;
 #ifdef CCL_ENABLE_SYCL
             if (stream && stream->is_sycl_device_stream())
-                entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>(
+                entry_factory::make_entry<sycl_copy_entry>(
                     part_scheds[idx].get(),
-                    ccl_buffer(&(exec_queue[global_copy_idx]->coll_param.sycl_send_buf),
+                    copy_direction::d2h,
+                    ccl_buffer(&(exec_queue[global_copy_idx]->coll_param.device_send_buf),
                                exec_queue[global_copy_idx]->coll_param.count * dtype_size,
                                ccl_buffer_type::INDIRECT),
                     ccl_buffer(fusion_buf, buf_cache.get_buf_size(), offset),
@@ -349,10 +378,11 @@ ccl_master_sched* ccl_fusion_manager::build_sched() {
             size_t global_copy_idx = idx * copies_per_part + copy_idx;
 #ifdef CCL_ENABLE_SYCL
             if (stream && stream->is_sycl_device_stream())
-                entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>(
+                entry_factory::make_entry<sycl_copy_entry>(
                     part_scheds[idx].get(),
+                    copy_direction::h2d,
                     ccl_buffer(fusion_buf, buf_cache.get_buf_size(), offset),
-                    ccl_buffer(&(exec_queue[global_copy_idx]->coll_param.sycl_recv_buf),
+                    ccl_buffer(&(exec_queue[global_copy_idx]->coll_param.device_recv_buf),
                                exec_queue[global_copy_idx]->coll_param.count * dtype_size,
                                ccl_buffer_type::INDIRECT),
                     exec_queue[global_copy_idx]->coll_param.count,
@@ -369,6 +399,8 @@ ccl_master_sched* ccl_fusion_manager::build_sched() {
                     exec_queue[global_copy_idx]->coll_param.count,
                     dtype);
 
+            part_scheds[idx]->add_barrier();
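+            /* the barrier guarantees the copy-out entries complete before the user request is marked done */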
+
             offset += exec_queue[global_copy_idx]->coll_param.count * dtype_size;
             entry_factory::make_entry<function_entry>(
                 part_scheds[idx].get(), complete_user_request, exec_queue[global_copy_idx]);
@@ -377,11 +409,12 @@ ccl_master_sched* ccl_fusion_manager::build_sched() {
         }
     }
 
+    sched->sync_partial_scheds();
+
     if (use_cache) {
         part_scheds[0]->set_finalize_fn(release_fusion_buf_for_cached_sched, fusion_buf);
     }
     else {
-        sched->sync_partial_scheds();
         entry_factory::make_entry<function_entry>(
             part_scheds[0].get(), release_fusion_buf, fusion_buf);
     }
@@ -412,6 +445,7 @@ void ccl_fusion_manager::execute() {
             }
         }
     }
+
     /* separate block to reduce lock scope */
     {
         std::lock_guard<ccl_fusion_lock_t> lock{ guard };
@@ -490,6 +524,7 @@ void ccl_fusion_manager::clear_exec_queue() {
 }
 
 void ccl_fusion_manager::check_tracked_scheds(bool force_release) {
+    std::lock_guard<ccl_fusion_lock_t> lock{ guard };
     for (auto it = tracked_scheds.begin(); it != tracked_scheds.end();) {
         ccl_master_sched* sched = *it;
         if (sched->is_completed() && (!sched->coll_attr.to_cache || force_release)) {
diff --git a/src/fusion/fusion.hpp b/src/fusion/fusion.hpp
index ed5dccb24..ff102ee5b 100644
--- a/src/fusion/fusion.hpp
+++ b/src/fusion/fusion.hpp
@@ -60,6 +60,8 @@ class ccl_fusion_manager {
     ccl_fusion_manager(const ccl_fusion_manager& other) = delete;
     ccl_fusion_manager& operator=(const ccl_fusion_manager& other) = delete;
 
+    void reset();
+    bool can_reset();
     bool can_fuse(ccl_master_sched* sched);
     bool add(ccl_master_sched* sched);
     void execute();
diff --git a/src/hwloc/hwloc_wrapper.c b/src/hwloc/hwloc_wrapper.c
new file mode 100644
index 000000000..5b0601743
--- /dev/null
+++ b/src/hwloc/hwloc_wrapper.c
@@ -0,0 +1,93 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "hwloc_wrapper.h"
+
+static hwloc_info_t hwloc_info = { .initialized = 0 };
+
+hwloc_status_t hwloc_init() {
+    hwloc_status_t ret = HWLOC_SUCCESS;
+
+    hwloc_info.initialized = 0;
+    hwloc_info.bindset = hwloc_bitmap_alloc();
+
+    if (hwloc_topology_init(&hwloc_info.topology) < 0) {
+        printf("hwloc_topology_init failed (%s)\n", strerror(errno));
+        goto err;
+    }
+
+    hwloc_topology_set_io_types_filter(hwloc_info.topology, HWLOC_TYPE_FILTER_KEEP_ALL);
+
+    if (hwloc_topology_load(hwloc_info.topology) < 0) {
+        printf("hwloc_topology_load failed (%s)\n", strerror(errno));
+        goto err;
+    }
+
+    if (hwloc_get_proc_cpubind(
+            hwloc_info.topology, getpid(), hwloc_info.bindset, HWLOC_CPUBIND_PROCESS) < 0) {
+        printf("hwloc_get_proc_cpubind failed (%s)\n", strerror(errno));
+        goto err;
+    }
+
+    hwloc_info.initialized = 1;
+
+    return ret;
+
+err:
+    return HWLOC_FAILURE;
+}
+
+hwloc_status_t hwloc_finalize() {
+    hwloc_status_t ret = HWLOC_SUCCESS;
+
+    hwloc_topology_destroy(hwloc_info.topology);
+    hwloc_bitmap_free(hwloc_info.bindset);
+    hwloc_info.initialized = 0;
+
+    return ret;
+}
+
+int hwloc_is_initialized() {
+    return hwloc_info.initialized;
+}
+
+static hwloc_obj_t hwloc_get_first_non_io_obj_by_pci(int domain, int bus, int dev, int func) {
+    hwloc_obj_t io_device = hwloc_get_pcidev_by_busid(hwloc_info.topology, domain, bus, dev, func);
+    HWLOC_ASSERT(io_device,
+                 "failed to get PCI device with domain %d, bus %d, dev %d, func %d",
+                 domain,
+                 bus,
+                 dev,
+                 func);
+    hwloc_obj_t first_non_io = hwloc_get_non_io_ancestor_obj(hwloc_info.topology, io_device);
+    HWLOC_ASSERT(first_non_io, "failed to get ancestor of PCI device");
+    return first_non_io;
+}
+
+int hwloc_is_dev_close_by_pci(int domain, int bus, int dev, int func) {
+    int is_close = 0;
+
+    if (!hwloc_is_initialized())
+        return is_close;
+
+    hwloc_obj_t first_non_io = hwloc_get_first_non_io_obj_by_pci(domain, bus, dev, func);
+
+    /* determine if PCI device is "close" to process by checking if process's affinity is included
+     * in PCI device's affinity or if PCI device's affinity is included in process's affinity */
+    is_close = (hwloc_bitmap_isincluded(hwloc_info.bindset, first_non_io->cpuset) ||
+                hwloc_bitmap_isincluded(first_non_io->cpuset, hwloc_info.bindset));
+
+    return is_close;
+}
diff --git a/src/hwloc/hwloc_wrapper.h b/src/hwloc/hwloc_wrapper.h
new file mode 100644
index 000000000..7b7ff7b9d
--- /dev/null
+++ b/src/hwloc/hwloc_wrapper.h
@@ -0,0 +1,73 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#ifndef HWLOC_WRAPPER_H
+#define HWLOC_WRAPPER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "hwloc.h"
+#include <sys/syscall.h>
+
+#define GETTID() syscall(SYS_gettid)
+
+#define HWLOC_ASSERT(cond, fmt, ...) \
+    do { \
+        if (!(cond)) { \
+            fprintf(stderr, \
+                    "(%ld): %s:%s:%d: ASSERT '%s' FAILED: " fmt "\n", \
+                    GETTID(), \
+                    __FILE__, \
+                    __FUNCTION__, \
+                    __LINE__, \
+                    #cond, \
+                    ##__VA_ARGS__); \
+            fflush(stderr); \
+        } \
+    } while (0)
+
+typedef enum { HWLOC_SUCCESS, HWLOC_FAILURE, HWLOC_UNSUPPORTED } hwloc_status_t;
+
+static inline const char* hwloc_status_to_str(hwloc_status_t status) {
+    switch (status) {
+        case HWLOC_SUCCESS: return "SUCCESS";
+        case HWLOC_FAILURE: return "FAILURE";
+        case HWLOC_UNSUPPORTED: return "UNSUPPORTED";
+        default: return "UNKNOWN";
+    }
+}
+
+typedef struct {
+    hwloc_topology_t topology;
+    hwloc_cpuset_t bindset;
+    int initialized;
+} hwloc_info_t;
+
+hwloc_status_t hwloc_init();
+hwloc_status_t hwloc_finalize();
+int hwloc_is_initialized();
+
+/*
+ * return true if pci device is close to this process
+ */
+int hwloc_is_dev_close_by_pci(int domain, int bus, int dev, int func);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* HWLOC_WRAPPER_H */
diff --git a/src/kernels/a2a_helpers.h b/src/kernels/a2a_helpers.h
new file mode 100644
index 000000000..10c44e398
--- /dev/null
+++ b/src/kernels/a2a_helpers.h
@@ -0,0 +1,38 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "common.h"
+
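+/* per-rank descriptor exchanged between ranks by the all-to-all GPU kernels:
+   the peer's receive buffer plus the sync flags that gate the transfer */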
+#define DEFINE_A2A_COMM_DATA(NAME, T) \
+    typedef struct __attribute__((packed)) a2a_gpu_comm_data_##NAME { \
+        __global T* recv_buf; \
+        __global sync_flag_type* ready_to_receive_flag; \
+        __global sync_flag_type* data_sent_flag; \
+    } a2a_gpu_comm_data_##NAME;
+
+DEFINE_A2A_COMM_DATA(int8, int8_t)
+DEFINE_A2A_COMM_DATA(uint8, uint8_t)
+DEFINE_A2A_COMM_DATA(int16, int16_t)
+DEFINE_A2A_COMM_DATA(uint16, uint16_t)
+DEFINE_A2A_COMM_DATA(int32, int32_t)
+DEFINE_A2A_COMM_DATA(uint32, uint32_t)
+DEFINE_A2A_COMM_DATA(int64, int64_t)
+DEFINE_A2A_COMM_DATA(uint64, uint64_t)
+//DEFINE_A2A_COMM_DATA(float16, half)
+DEFINE_A2A_COMM_DATA(float32, float)
+DEFINE_A2A_COMM_DATA(float64, double)
+DEFINE_A2A_COMM_DATA(bfloat16, uint16_t)
diff --git a/src/kernels/common.h b/src/kernels/common.h
new file mode 100644
index 000000000..493e2c5c1
--- /dev/null
+++ b/src/kernels/common.h
@@ -0,0 +1,287 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#ifdef HOST_CTX
+
+#define __global
+using namespace ccl;
+#include <cstdint>
+
+#ifdef ENABLE_KERNEL_ATOMICS
+// type for sync flags for atomics support
+typedef atomic_int sync_flag_type;
+#else
+// default type for sync flags
+typedef volatile int sync_flag_type;
+#endif /* ENABLE_KERNEL_ATOMICS */
+
+#else /* HOST_CTX */
+
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+#include "lp.h"
+
+#define FORMAT_int8_t  "%hhd"
+#define FORMAT_int16_t "%d"
+#define FORMAT_int32_t "%d"
+#define FORMAT_int64_t "%ld"
+
+#define FORMAT_uint8_t  "%hhu"
+#define FORMAT_uint16_t "%u"
+#define FORMAT_uint32_t "%u"
+#define FORMAT_uint64_t "%lu"
+
+#define FORMAT_float  "%f"
+#define FORMAT_double "%f"
+
+#define FORMAT_ushort "%u"
+#define FORMAT_half   "%f"
+
+#define FORMAT_4(format) #format ", " #format ", " #format ", " #format
+#define FORMAT_char4     FORMAT_4(% hhd)
+#define FORMAT_uchar4    FORMAT_4(% hhu)
+#define FORMAT_short4    FORMAT_4(% d)
+#define FORMAT_ushort4   FORMAT_4(% u)
+#define FORMAT_int4      FORMAT_4(% d)
+#define FORMAT_uint4     FORMAT_4(% u)
+#define FORMAT_long4     FORMAT_4(% ld)
+#define FORMAT_ulong4    FORMAT_4(% lu)
+#define FORMAT_float4    FORMAT_4(% f)
+#define FORMAT_double4   FORMAT_4(% f)
+
+#define ELEMENTS_1(X) X
+#define ELEMENTS_4(X) (X)[0], (X)[1], (X)[2], (X)[3]
+
+// define aliases for OpenCL types
+typedef char int8_t;
+typedef uchar uint8_t;
+typedef short int16_t;
+typedef ushort uint16_t;
+typedef int int32_t;
+typedef uint uint32_t;
+typedef long int64_t;
+typedef ulong uint64_t;
+typedef half float16_t;
+typedef float float32_t;
+typedef double float64_t;
+typedef ushort bfloat16;
+
+#define DEFINE_SUM_OP(T) \
+    T __sum_##T(T lhs, T rhs) { \
+        return lhs + rhs; \
+    }
+
+#define DEFINE_PROD_OP(T) \
+    T __prod_##T(T lhs, T rhs) { \
+        return lhs * rhs; \
+    }
+
+#define DEFINE_MIN_OP(T) \
+    T __min_##T(T lhs, T rhs) { \
+        return min(lhs, rhs); \
+    }
+
+#define DEFINE_MAX_OP(T) \
+    T __max_##T(T lhs, T rhs) { \
+        return max(lhs, rhs); \
+    }
+
+#ifdef ENABLE_KERNEL_DEBUG
+#define DEBUG_BLOCK(block) block
+#else
+#define DEBUG_BLOCK(block)
+#endif
+
+#ifdef ENABLE_KERNEL_DEBUG
+#define LOG_INPUT_DATA_START(rank)    printf("kernel %d, wait income data\n", rank)
+#define LOG_INPUT_DATA_END(rank)      printf("kernel %d, received data\n", rank)
+#define LOG_OUTGOING_DATA_START(rank) printf("kernel %d, wait signal to send\n", rank)
+#define LOG_OUTGOING_DATA_END(rank)   printf("kernel %d, received signal to send\n", rank)
+#define LOG_SEND_PROGRESS(rank, thread_id, flag, desired) \
+    printf("kernel %d.%d, send %d/%d\n", rank, thread_id, flag, desired)
+#define LOG_BARRIER_PASSED(rank, thread_id) \
+    printf("kernel %d.%d barrier passed\n", rank, thread_id);
+#define LOG_IN_BARRIER(rank, thread_id, flag, desired) \
+    printf("kernel %d.%d barrier %d/%d\n", rank, thread_id, flag, desired);
+#else /* ENABLE_KERNEL_DEBUG */
+#define LOG_INPUT_DATA_START(rank)
+#define LOG_INPUT_DATA_END(rank)
+#define LOG_OUTGOING_DATA_START(rank)
+#define LOG_OUTGOING_DATA_END(rank)
+#define LOG_BARRIER_PASSED(rank, thread_id)
+#define LOG_IN_BARRIER(rank, thread_id, flag, desired)
+#endif /* ENABLE_KERNEL_DEBUG */
+
+#define SWAP_VARIABLES(var1, var2, type) \
+    do { \
+        type tmp; \
+        tmp = var1; \
+        var1 = var2; \
+        var2 = tmp; \
+    } while (0);
+
+int get_left_rank(int rank, int comm_size) {
+    return rank == 0 ? comm_size - 1 : rank - 1;
+}
+
+int get_right_rank(int rank, int comm_size) {
+    return rank == (comm_size - 1) ? 0 : rank + 1;
+}
+
+#ifdef ENABLE_KERNEL_ATOMICS
+
+// type for sync flags for atomics support
+typedef atomic_int sync_flag_type;
+
+#define PUT_READY_TO_RECEIVE(_sync_flag) \
+    if (thread_id == 0) { \
+        atomic_fetch_add_explicit( \
+            _sync_flag, 1, memory_order_seq_cst, memory_scope_all_svm_devices); \
+    }
+
+#define I_SENT(_sync_flag) \
+    if (thread_id == 0) { \
+        atomic_fetch_add_explicit( \
+            _sync_flag, 1, memory_order_seq_cst, memory_scope_all_svm_devices); \
+    }
+
+#define WAIT_INPUT_DATA(_sync_flag, _desired) \
+    if (thread_id == 0) { \
+        LOG_INPUT_DATA_START(my_rank); \
+        while (1) { \
+            int _old_value = atomic_load_explicit( \
+                _sync_flag, memory_order_seq_cst, memory_scope_all_svm_devices); \
+            if (_old_value == _desired) { \
+                LOG_INPUT_DATA_END(my_rank); \
+                ++_desired; \
+                break; \
+            } \
+        } \
+    }
+
+#define WAIT_SIGNAL_TO_SEND(_sync_flag, _desired) \
+    if (thread_id == 0) { \
+        LOG_OUTGOING_DATA_START(my_rank); \
+        while (_desired != atomic_load_explicit( \
+                               _sync_flag, memory_order_seq_cst, memory_scope_all_svm_devices)) { \
+        } \
+        LOG_OUTGOING_DATA_END(my_rank); \
+        ++_desired; \
+    }
+
+#define SET_PROXY_SIZE(_sync_flag, size) \
+    if (thread_id == 0) { \
+        atomic_store_explicit( \
+            _sync_flag, size, memory_order_seq_cst, memory_scope_all_svm_devices); \
+    }
+
+#define GET_PROXY_SIZE(_sync_flag, size) \
+    size = atomic_load_explicit(_sync_flag, memory_order_seq_cst, memory_scope_all_svm_devices);
+
+#else /* ENABLE_KERNEL_ATOMICS */
+
+// default type for sync flags
+typedef volatile int sync_flag_type;
+
+#define PUT_READY_TO_RECEIVE(_sync_flag) \
+    if (thread_id == 0) { \
+        (*_sync_flag)++; \
+    }
+
+#define I_SENT(_sync_flag) \
+    if (thread_id == 0) { \
+        (*_sync_flag)++; \
+    }
+
+#define WAIT_INPUT_DATA(_sync_flag, _desired) \
+    if (thread_id == 0) { \
+        LOG_INPUT_DATA_START(my_rank); \
+        while (1) { \
+            if (*_sync_flag == _desired) { \
+                LOG_INPUT_DATA_END(my_rank); \
+                ++_desired; \
+                break; \
+            } \
+        } \
+    }
+
+#define WAIT_SIGNAL_TO_SEND(_sync_flag, _desired) \
+    if (thread_id == 0) { \
+        LOG_OUTGOING_DATA_START(my_rank); \
+        while (_desired != *_sync_flag) { \
+        }; \
+        LOG_OUTGOING_DATA_END(my_rank); \
+        ++_desired; \
+    }
+
+#define SET_PROXY_SIZE(_sync_flag, size) \
+    if (thread_id == 0) { \
+        *_sync_flag = size; \
+    }
+
+#define GET_PROXY_SIZE(_sync_flag, size) size = *_sync_flag;
+
+#endif /* ENABLE_KERNEL_ATOMICS */
+
+/*
+#define KERNEL_BARRIER(_barrier_flag, _desired, _increment)                         \
+    do                                                                              \
+    {                                                                               \
+        int _barrier_value = atomic_add(_barrier_flag, 0);                          \
+        atomic_inc(_barrier_flag);                                                  \
+        int _old_value = _barrier_value;                                            \
+        while(1)                                                                    \
+        {                                                                           \
+            / *thread that last reached the barrier will reset it                    \
+              other threads may expect to receive _desired value while it can be 0  \
+              check if received value is less than initially received* /             \
+            if(_old_value == _desired || _old_value < _barrier_value)               \
+            {                                                                       \
+                BARRIER_PASSED(my_rank, thread_id);                                 \
+                break;                                                              \
+            }                                                                       \
+            IN_BARRIER(my_rank, thread_id, _old_value, _desired);                   \
+            _old_value = atomic_add(_barrier_flag, 0);                \
+        }                                                                           \
+    } while (0);
+*/
+
+/* for A2A */
+/*#define WAIT_INPUT_DATA(_sync_flag, _desired) \
+    if (local_thread_id == 0) { \
+        LOG_INPUT_DATA_START(rank_id); \
+        while (1) { \
+            int _old_value = atomic_cmpxchg(_sync_flag, _desired, _desired); \
+            if (_old_value == _desired) { \
+                LOG_INPUT_DATA_END(rank_id); \
+                _desired += 1 + comm_size; \
+                break; \
+            } \
+        } \
+    }
+
+#define WAIT_SIGNAL_TO_SEND(_sync_flag, _desired) \
+    if (local_thread_id == 0) { \
+        LOG_OUTGOING_DATA_START(rank_id); \
+        while (_desired != atomic_cmpxchg(_sync_flag, _desired, _desired)) { \
+        }; \
+        LOG_OUTGOING_DATA_END(rank_id); \
+        _desired += comm_size; \
+    }*/
+
+#endif /* HOST_CTX */
diff --git a/src/kernels/event_declaration.h b/src/kernels/event_declaration.h
new file mode 100644
index 000000000..b468d8a76
--- /dev/null
+++ b/src/kernels/event_declaration.h
@@ -0,0 +1,41 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#ifdef HOST_CTX
+#define __global
+
+#include <memory>
+using namespace ccl;
+
+template <class native_type>
+struct shared_event_traits {};
+
+#else
+typedef ushort bfloat16;
+#endif
+
+typedef struct __attribute__((packed)) shared_event_float {
+    __global int* produced_bytes;
+    __global float* mem_chunk;
+} shared_event_float;
+
+#ifdef HOST_CTX
+
+template <>
+struct shared_event_traits<float> {
+    using impl_t = shared_event_float;
+};
+
+#endif
diff --git a/src/kernels/lp.h b/src/kernels/lp.h
new file mode 100644
index 000000000..e28ea1d13
--- /dev/null
+++ b/src/kernels/lp.h
@@ -0,0 +1,162 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef CCL_BF16_GPU_TRUNCATE
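+/* truncation-based conversion: a bfloat16 value is the upper 16 bits of the corresponding fp32 value */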
+float __bf16_to_fp32(ushort V) {
+    uint temp = convert_uint(V) << 16;
+    return as_float(temp);
+}
+
+ushort __fp32_to_bf16(float V) {
+    ushort2 temp = as_ushort2(V);
+    return temp.s1;
+}
+#else /* CCL_BF16_GPU_TRUNCATE */
+#include "rne.h"
+#endif /* CCL_BF16_GPU_TRUNCATE */
+
+#define DEFINE_BF16SUM_OP(T) \
+    T __bf16_sum_##T(T lhs, T rhs) { \
+        return __fp32_to_bf16(__bf16_to_fp32(lhs) + __bf16_to_fp32(rhs)); \
+    }
+
+#define DEFINE_BF16PROD_OP(T) \
+    T __bf16_prod_##T(T lhs, T rhs) { \
+        return __fp32_to_bf16(__bf16_to_fp32(lhs) * __bf16_to_fp32(rhs)); \
+    }
+
+#define DEFINE_BF16MIN_OP(T) \
+    T __bf16_min_##T(T lhs, T rhs) { \
+        return __fp32_to_bf16(min(__bf16_to_fp32(lhs), __bf16_to_fp32(rhs))); \
+    }
+
+#define DEFINE_BF16MAX_OP(T) \
+    T __bf16_max_##T(T lhs, T rhs) { \
+        return __fp32_to_bf16(max(__bf16_to_fp32(lhs), __bf16_to_fp32(rhs))); \
+    }
+
+#ifdef CCL_FP16_GPU_TRUNCATE
+/*
+Truncation routines for converting fp32 <-> fp16
+
+fp16 has 1 sign bit, 5 exponent bits and 10 significand bits with exponent
+offset 15 - https://en.wikipedia.org/wiki/Half-precision_floating-point_format
+
+For fp16 -> fp32
+
+The sign & significand bits are unchanged, but the exponent must be properly
+re-offset (i.e. convert the fp16 offset -> fp32 offset). Care must also be taken
+to saturate the fp32 result if the fp16 result is saturated. Denormals must be
+flushed to 0.
+
+For fp32 -> fp16
+
+Similar to fp16 -> fp32 except that the exponent must be checked for saturation
+since the range of the exponent is significantly smaller than that of fp32.
+*/
+float __fp16_to_fp32(half V) {
+    uint ans_bits = 0;
+    uint exp_bits = as_ushort(V) & 0x7C00;
+    uint significand_bits = as_ushort(V) & 0x03FF;
+    if (exp_bits == 0x7C00) {
+        ans_bits = ((as_ushort(V) & 0x8000) << 16) | 0x7F800000 | (significand_bits << 13);
+    }
+    else if (exp_bits == 0x0000) {
+        if (significand_bits != 0x00000000) {
+            ans_bits = ((as_ushort(V) & 0x8000) << 16);
+        }
+        else {
+            ans_bits = ((as_ushort(V) & 0x8000) << 16) | (significand_bits << 13);
+        }
+    }
+    else {
+        ans_bits = ((as_ushort(V) & 0x8000) << 16) | ((exp_bits + 0x1C000) << 13) |
+                   (significand_bits << 13);
+    }
+    return as_float(ans_bits);
+}
+
+half __fp32_to_fp16(float V) {
+    ushort ans;
+    uint exp_bits = (as_uint(V) & 0x7F800000);
+    uint significand_bits = (as_uint(V) & 0x007FFFFF);
+    if (exp_bits == 0x00000000) {
+        ans = (as_uint(V) & 0x80000000) >> 16;
+    }
+    else if (exp_bits == 0x7F800000) {
+        if (significand_bits != 0) {
+            ans = ((as_uint(V) & 0x80000000) >> 16) | 0x00007C01;
+        }
+        else {
+            ans = ((as_uint(V) & 0x80000000) >> 16) | 0x00007C00;
+        }
+    }
+    else if (exp_bits < 0x38800000) {
+        /* magnitude below fp16 normal range: flush to signed zero */
+        ans = (as_uint(V) & 0x80000000) >> 16;
+    }
+    else if (exp_bits > 0x47000000) {
+        /* magnitude above fp16 range: saturate to signed infinity */
+        ans = ((as_uint(V) & 0x80000000) >> 16) | 0x7C00;
+    }
+    else {
+        ans = ((as_uint(V) & 0x80000000) >> 16) |
+              ((((as_uint(V) & 0x7F800000) >> 23) - 112) << 10) | ((as_uint(V) & 0x007FFFFF) >> 13);
+    }
+    return as_half(ans);
+}
+
+#define DEFINE_FP16SUM_OP(T) \
+    T __sum_##T(T lhs, T rhs) { \
+        return __fp32_to_fp16(__fp16_to_fp32(lhs) + __fp16_to_fp32(rhs)); \
+    }
+
+#define DEFINE_FP16PROD_OP(T) \
+    T __prod_##T(T lhs, T rhs) { \
+        return __fp32_to_fp16(__fp16_to_fp32(lhs) * __fp16_to_fp32(rhs)); \
+    }
+
+#define DEFINE_FP16MIN_OP(T) \
+    T __min_##T(T lhs, T rhs) { \
+        return __fp32_to_fp16(min(__fp16_to_fp32(lhs), __fp16_to_fp32(rhs))); \
+    }
+
+#define DEFINE_FP16MAX_OP(T) \
+    T __max_##T(T lhs, T rhs) { \
+        return __fp32_to_fp16(max(__fp16_to_fp32(lhs), __fp16_to_fp32(rhs))); \
+    }
+#else /* CCL_FP16_GPU_TRUNCATE */
+#define DEFINE_FP16SUM_OP(T) \
+    T __sum_##T(T lhs, T rhs) { \
+        return lhs + rhs; \
+    }
+
+#define DEFINE_FP16PROD_OP(T) \
+    T __prod_##T(T lhs, T rhs) { \
+        return lhs * rhs; \
+    }
+
+#define DEFINE_FP16MIN_OP(T) \
+    T __min_##T(T lhs, T rhs) { \
+        return min(lhs, rhs); \
+    }
+
+#define DEFINE_FP16MAX_OP(T) \
+    T __max_##T(T lhs, T rhs) { \
+        return max(lhs, rhs); \
+    }
+#endif /* CCL_FP16_GPU_TRUNCATE */
diff --git a/src/kernels/rne.h b/src/kernels/rne.h
new file mode 100644
index 000000000..47ca9bf78
--- /dev/null
+++ b/src/kernels/rne.h
@@ -0,0 +1,51 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#ifndef RNE_H
+#define RNE_H
+
+// bf <--> float conversion
+//    bf : no igc type for bf yet. Use short as *opaque* type for it.
+//
+// float -> bf conversion builtins (rte rounding mode)
+short __builtin_IB_ftobf_1(float a) __attribute__((const));
+short2 __builtin_IB_ftobf_2(float2 a) __attribute__((const));
+short4 __builtin_IB_ftobf_4(float4 a) __attribute__((const));
+short8 __builtin_IB_ftobf_8(float8 a) __attribute__((const));
+short16 __builtin_IB_ftobf_16(float16 a) __attribute__((const));
+
+// bf -> float conversion builtins (precise conversion)
+float __builtin_IB_bftof_1(short a) __attribute__((const));
+float2 __builtin_IB_bftof_2(short2 a) __attribute__((const));
+float4 __builtin_IB_bftof_4(short4 a) __attribute__((const));
+float8 __builtin_IB_bftof_8(short8 a) __attribute__((const));
+float16 __builtin_IB_bftof_16(short16 a) __attribute__((const));
+
+// 2 floats --> packed 2 bf (rte rounding mode)
+int __builtin_IB_2fto2bf_1(float a, float b) __attribute__((const));
+int2 __builtin_IB_2fto2bf_2(float2 a, float2 b) __attribute__((const));
+int4 __builtin_IB_2fto2bf_4(float4 a, float4 b) __attribute__((const));
+int8 __builtin_IB_2fto2bf_8(float8 a, float8 b) __attribute__((const));
+int16 __builtin_IB_2fto2bf_16(float16 a, float16 b) __attribute__((const));
+
+float __bf16_to_fp32(ushort V) {
+    return __builtin_IB_bftof_1(as_short(V));
+}
+
+ushort __fp32_to_bf16(float V) {
+    return as_ushort(__builtin_IB_ftobf_1(V));
+}
+
+#endif /* RNE_H */
diff --git a/src/kernels/shared.h b/src/kernels/shared.h
new file mode 100644
index 000000000..3dce51e12
--- /dev/null
+++ b/src/kernels/shared.h
@@ -0,0 +1,71 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#ifndef SHARED_H
+#define SHARED_H
+
+// Defines values and functions shared between host and device
+// The VEC_SIZE constants are defined as macros since their values are used by the kernel code at compile time
+
+// Allgatherv
+
+#define RING_ALLGATHERV_VEC_SIZE 1
+
+// Allreduce
+#define RING_ALLREDUCE_VEC_SIZE 1
+
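+// ceil-divide the (vectorized) element count across ranks, e.g. 10 elements on 4 ranks -> segments of 3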
+static inline size_t ring_allreduce_get_segment_size(size_t elems_count, size_t comm_size) {
+    elems_count /= RING_ALLREDUCE_VEC_SIZE;
+    return (elems_count + comm_size - 1) / comm_size;
+}
+
+static inline size_t ring_allreduce_get_tmp_buffer_size(size_t elems_count, size_t comm_size) {
+    // The algorithm uses at most 2 * segment_size elements of tmp_buffer to store
+    // temporary data
+    return 2 * ring_allreduce_get_segment_size(elems_count, comm_size);
+}
+
+// Bcast
+
+#define RING_BCAST_VEC_SIZE 1
+
+// Reduce
+
+#define RING_REDUCE_VEC_SIZE 1
+
+static inline size_t ring_reduce_tmp_buffer_size(size_t elems_count, size_t comm_size) {
+    (void)comm_size;
+    return elems_count;
+}
+
+// Reduce-scatter
+
+#define RING_REDUCE_SCATTER_VEC_SIZE 1
+
+static inline size_t ring_reduce_scatter_get_segment_size(size_t recv_count, size_t comm_size) {
+    (void)comm_size; // C disallows unnamed parameters in a function definition, so name it and suppress the warning
+    // The segment size is simply the recv_count parameter adjusted for the vector size
+    recv_count /= RING_REDUCE_SCATTER_VEC_SIZE;
+    return recv_count;
+}
+
+static inline size_t ring_reduce_scatter_tmp_buffer_size(size_t elems_count, size_t comm_size) {
+    // The algorithm uses at most 2 * segment_size elements of tmp_buffer to store
+    // temporary data
+    return 2 * ring_reduce_scatter_get_segment_size(elems_count, comm_size);
+}
+
+#endif /* SHARED_H */
diff --git a/src/native_device_api/interop_utils.cpp b/src/native_device_api/interop_utils.cpp
index 350494f2a..f76230dee 100644
--- a/src/native_device_api/interop_utils.cpp
+++ b/src/native_device_api/interop_utils.cpp
@@ -138,10 +138,10 @@ assoc_result check_assoc_device_memory(const void* mem,
 
 #ifdef CCL_ENABLE_SYCL
 
-    cl::sycl::usm::alloc pointer_type = cl::sycl::get_pointer_type(mem, ctx);
+    sycl::usm::alloc pointer_type = sycl::get_pointer_type(mem, ctx);
 
     using usm_truth_table =
-        std::array<usm_support_mode, utils::enum_to_underlying(cl::sycl::usm::alloc::unknown) + 1>;
+        std::array<usm_support_mode, utils::enum_to_underlying(sycl::usm::alloc::unknown) + 1>;
 
     constexpr int platform_config_count = 4; /*host, cpu, gpu, accel*/
     constexpr std::array<usm_truth_table, platform_config_count> usm_target_table{ {
@@ -177,8 +177,8 @@ assoc_result check_assoc_device_memory(const void* mem,
 
     if (std::get<assoc_result_index::SUPPORT_MODE>(ret) == usm_support_mode::prohibited) {
         std::stringstream ss;
-        ss << "Incompatible USM type requested: " << usm_to_string(pointer_type)
-           << ", for ccl_device: " << std::to_string(platform_type_index);
+        ss << "incompatible usm type requested: " << usm_to_string(pointer_type)
+           << " for device: " << std::to_string(platform_type_index);
         std::get<assoc_result_index::ERROR_CAUSE>(ret) = ss.str();
     }
 #else
diff --git a/src/native_device_api/l0/context.cpp b/src/native_device_api/l0/context.cpp
index b0f03d419..d2f38b85f 100644
--- a/src/native_device_api/l0/context.cpp
+++ b/src/native_device_api/l0/context.cpp
@@ -17,6 +17,7 @@
 #include "oneapi/ccl/native_device_api/l0/context.hpp"
 #include "oneapi/ccl/native_device_api/l0/base_impl.hpp"
 #include "oneapi/ccl/native_device_api/l0/device.hpp"
+#include "oneapi/ccl/native_device_api/l0/event_pool.hpp"
 #include "oneapi/ccl/native_device_api/l0/primitives_impl.hpp"
 #include "oneapi/ccl/native_device_api/l0/driver.hpp"
 #include "oneapi/ccl/native_device_api/l0/platform.hpp"
@@ -27,6 +28,8 @@ namespace native {
 ccl_context::ccl_context(handle_t h, owner_ptr_t&& platform)
         : base(h, std::move(platform), std::weak_ptr<ccl_context>{}) {}
 
+ccl_context::~ccl_context() {}
+
 CCL_BE_API const ze_host_mem_alloc_desc_t& ccl_context::get_default_host_alloc_desc() {
     static const ze_host_mem_alloc_desc_t common{
         .stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
@@ -57,6 +60,52 @@ CCL_BE_API void ccl_context::host_free_memory(void* mem_handle) {
     }
 }
 
+CCL_BE_API ccl_context::ccl_event_pool_ptr ccl_context::create_event_pool(
+    std::initializer_list<ccl_device*> devices,
+    const ze_event_pool_desc_t& descr) {
+    if (!pool_holder) {
+        pool_holder.reset(new ccl_event_pool_holder);
+    }
+
+    ze_event_pool_handle_t pool = nullptr;
+
+    std::vector<ccl_device::handle_t> device_handles;
+    device_handles.reserve(devices.size()); /* reserve, not construct with size: that would prepend null handles */
+    for (ccl_device* d : devices) {
+        device_handles.push_back(d->get());
+    }
+    ze_result_t status =
+        zeEventPoolCreate(get(),
+                          &descr,
+                          devices.size(),
+                          (device_handles.empty() ? nullptr : device_handles.data()),
+                          &pool);
+    if (status != ZE_RESULT_SUCCESS) {
+        CCL_THROW("zeEventPoolCreate, error: " + native::to_string(status));
+    }
+
+    std::shared_ptr<ccl_event_pool> pool_ptr =
+        std::make_shared<ccl_event_pool>(descr, pool, pool_holder, get_ptr());
+    return pool_holder->emplace(devices, pool_ptr);
+}
+
+CCL_BE_API std::vector<std::shared_ptr<ccl_event_pool>> ccl_context::get_shared_event_pool(
+    std::initializer_list<ccl_device*> devices) {
+    std::vector<std::shared_ptr<ccl_event_pool>> ret;
+    if (pool_holder) {
+        ret = pool_holder->get_event_pool_storage(devices);
+    }
+    return ret;
+}
+
+CCL_BE_API std::vector<std::shared_ptr<ccl_event_pool>> ccl_context::get_shared_event_pool(
+    std::initializer_list<ccl_device*> devices) const {
+    std::vector<std::shared_ptr<ccl_event_pool>> ret;
+    if (pool_holder) {
+        ret = pool_holder->get_event_pool_storage(devices);
+    }
+    return ret;
+}
+
 CCL_BE_API std::string ccl_context::to_string() const {
     std::stringstream ss;
     ss << handle;
diff --git a/src/native_device_api/l0/device.cpp b/src/native_device_api/l0/device.cpp
index 93cd6a6b0..201ce63e9 100644
--- a/src/native_device_api/l0/device.cpp
+++ b/src/native_device_api/l0/device.cpp
@@ -396,11 +396,8 @@ CCL_BE_API void* ccl_device::device_alloc_memory(size_t bytes_count,
         ctx = get_default_context();
     }
 
-    ze_result_t
-        ret = //zeDriverAllocSharedMem(get_owner()->handle, handle, flags, ordinal, ZE_HOST_MEM_ALLOC_FLAG_DEFAULT, bytes_count, alignment, &out_ptr);
-        //zeDriverAllocHostMem(get_owner()->handle, ZE_HOST_MEM_ALLOC_FLAG_DEFAULT, bytes_count, alignment, &out_ptr);
-        zeMemAllocDevice(
-            ctx->get(), &mem_descr, /*&host_descr, */ bytes_count, alignment, handle, &out_ptr);
+    ze_result_t ret = zeMemAllocDevice(
+        ctx->get(), &mem_descr, /*&host_descr, */ bytes_count, alignment, handle, &out_ptr);
     if (ret != ZE_RESULT_SUCCESS) {
         CCL_THROW("cannot allocate memory, error: " + std::to_string(ret));
     }
@@ -516,10 +513,14 @@ void CCL_BE_API ccl_device::on_delete(ze_ipc_mem_handle_t& ipc_mem_handle,
     */
 
     //todo thread safety
-    for (auto ipc_it = ipc_storage.begin(); ipc_it != ipc_storage.end(); ++ipc_it) {
-        if (!strncmp(ipc_it->second->handle.data, ipc_mem_handle.data, ZE_MAX_IPC_HANDLE_SIZE)) {
-            ipc_storage.erase(ipc_it);
+    for (auto ipc_it = ipc_storage.begin(); ipc_it != ipc_storage.end();) {
+        if (ipc_it->second) {
+            if (!memcmp(ipc_it->second->handle.data, ipc_mem_handle.data, ZE_MAX_IPC_HANDLE_SIZE)) {
+                ipc_it = ipc_storage.erase(ipc_it);
+                continue;
+            }
         }
+        ++ipc_it;
     }
 }
 
@@ -530,7 +531,7 @@ CCL_BE_API ccl_device::device_ipc_memory ccl_device::get_ipc_memory(
     //, this,
     // ", expected device: ", ipc_handle.get_owner());
 
-    ze_ipc_memory_flag_t flag = ZE_IPC_MEMORY_FLAG_TBD;
+    ze_ipc_memory_flags_t flag = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_UNCACHED;
     ip_memory_elem_t ipc_memory{};
 
     if (!ctx) {
@@ -558,7 +559,7 @@ CCL_BE_API std::shared_ptr<ccl_device::device_ipc_memory> ccl_device::restore_sh
     std::shared_ptr<device_ipc_memory_handle>&& ipc_handle,
     std::shared_ptr<ccl_context> ctx) {
     assert(ipc_handle->get_owner().lock().get() == this && "IPC handle doesn't belong to device: ");
-    ze_ipc_memory_flag_t flag = ZE_IPC_MEMORY_FLAG_TBD;
+    ze_ipc_memory_flags_t flag = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_UNCACHED;
     ip_memory_elem_t ipc_memory{};
 
     if (!ctx) {
diff --git a/src/native_device_api/l0/event_pool.cpp b/src/native_device_api/l0/event_pool.cpp
new file mode 100644
index 000000000..c86c065f3
--- /dev/null
+++ b/src/native_device_api/l0/event_pool.cpp
@@ -0,0 +1,167 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "oneapi/ccl/native_device_api/export_api.hpp"
+#include "oneapi/ccl/native_device_api/l0/context.hpp"
+#include "oneapi/ccl/native_device_api/l0/base_impl.hpp"
+#include "oneapi/ccl/native_device_api/l0/device.hpp"
+#include "oneapi/ccl/native_device_api/l0/event_pool.hpp"
+#include "oneapi/ccl/native_device_api/l0/primitives_impl.hpp"
+#include "oneapi/ccl/native_device_api/l0/driver.hpp"
+#include "oneapi/ccl/native_device_api/l0/platform.hpp"
+#include "common/log/log.hpp"
+
+namespace native {
+
+// event pool
+ccl_event_pool::ccl_event_pool(const ze_event_pool_desc_t& descr,
+                               handle_t h,
+                               owner_ptr_t&& holder,
+                               context_ptr_t&& ctx)
+        : base(h, std::move(holder), std::move(ctx)),
+          pool_description(descr),
+          allocated_event_count(0) {}
+
+ccl_event_pool::~ccl_event_pool() {
+    CCL_ASSERT(allocated_event_count.load() == 0,
+               "there are in-use event objects during ccl_event_pool destruction");
+}
+
+const ze_event_pool_desc_t& ccl_event_pool::get_pool_description() const {
+    return pool_description;
+}
+
+size_t ccl_event_pool::get_allocated_events() const {
+    return allocated_event_count.load();
+}
+
+ccl_event_pool::event_ptr ccl_event_pool::create_event(const ze_event_desc_t& descr) {
+    ze_event_handle_t event_handle;
+    ze_result_t ret = zeEventCreate(get(), &descr, &event_handle);
+    if (ret != ZE_RESULT_SUCCESS) {
+        CCL_THROW("cannot execute zeEventCreate, error: " + native::to_string(ret));
+    }
+    event_ptr event_ret(new event(event_handle, get_ptr(), get_ctx()));
+    allocated_event_count.fetch_add(1);
+    return event_ret;
+}
+
+void ccl_event_pool::on_delete(ze_event_handle_t event_handle, ze_context_handle_t& ctx) {
+    (void)ctx;
+    ze_result_t ret = zeEventDestroy(event_handle);
+    if (ret != ZE_RESULT_SUCCESS) {
+        CCL_THROW("cannot execute zeEventDestroy, error: " + native::to_string(ret));
+    }
+
+    allocated_event_count.fetch_sub(1);
+}
+
+// Thread safe array
+CCL_BE_API event_pool_array_t::context_array_accessor event_pool_array_t::access() {
+    return context_array_accessor(m, event_pools);
+}
+
+CCL_BE_API event_pool_array_t::const_context_array_accessor event_pool_array_t::access() const {
+    return const_context_array_accessor(m, event_pools);
+}
+
+// Thread safe context storage holder
+ze_event_pool_handle_t ccl_event_pool_holder::get() {
+    return nullptr;
+}
+
+std::shared_ptr<ccl_event_pool> ccl_event_pool_holder::emplace(
+    const std::initializer_list<ccl_device*>& devices,
+    std::shared_ptr<ccl_event_pool> pool) {
+    std::unique_lock<std::mutex> lock(m); //TODO use shared lock
+
+    if (devices.size() != 0) {
+        for (ccl_device* d : devices) {
+            event_pool_array_t& cont = contexts_pool[d];
+            auto acc = cont.access();
+            acc.get().push_back(pool);
+        }
+    }
+    else {
+        event_pool_array_t& cont = contexts_pool[nullptr];
+        auto acc = cont.access();
+        acc.get().push_back(pool);
+    }
+    return pool;
+}
+
+CCL_BE_API std::vector<std::shared_ptr<ccl_event_pool>>
+ccl_event_pool_holder::get_event_pool_storage(std::initializer_list<ccl_device*> devices) {
+    return static_cast<const ccl_event_pool_holder*>(this)->get_event_pool_storage(devices);
+}
+
+CCL_BE_API std::vector<std::shared_ptr<ccl_event_pool>>
+ccl_event_pool_holder::get_event_pool_storage(std::initializer_list<ccl_device*> devices) const {
+    using pool_array = std::vector<std::shared_ptr<ccl_event_pool>>;
+    pool_array shared_pool;
+
+    std::unique_lock<std::mutex> lock(m); //TODO use simple shared lock
+
+    if (devices.size() == 0) {
+        auto it = contexts_pool.find(nullptr);
+        if (it != contexts_pool.end()) {
+            shared_pool = it->second.access().get();
+        }
+    }
+    else {
+        for (ccl_device* d : devices) {
+            auto it = contexts_pool.find(d);
+            if (it == contexts_pool.end()) {
+                CCL_THROW("cannot find event_pool for device: " + d->to_string() +
+                          "\nTotal contexts_pool count: " + std::to_string(contexts_pool.size()));
+            }
+
+            auto acc = it->second.access();
+            auto& event_pools = acc.get();
+
+            //find common pools for devices
+            if (shared_pool.empty()) {
+                // copy
+                shared_pool = event_pools;
+                continue;
+            }
+
+            //find intersection (std::set_intersection requires sorted ranges)
+            pool_array sorted_pools(event_pools);
+            std::sort(sorted_pools.begin(), sorted_pools.end());
+            std::sort(shared_pool.begin(), shared_pool.end());
+            pool_array intersected;
+            std::set_intersection(sorted_pools.begin(),
+                                  sorted_pools.end(),
+                                  shared_pool.begin(),
+                                  shared_pool.end(),
+                                  std::back_inserter(intersected));
+            shared_pool.swap(intersected);
+
+            // nothing to do
+            if (shared_pool.empty()) {
+                break;
+            }
+        }
+    }
+    return shared_pool;
+}
+
+void ccl_event_pool_holder::on_delete(ze_event_pool_handle_t pool_handle,
+                                      ze_context_handle_t& ctx) {
+    (void)ctx;
+    ze_result_t ret = zeEventPoolDestroy(pool_handle);
+    if (ret != ZE_RESULT_SUCCESS) {
+        CCL_THROW("cannot execute zeEventPoolDestroy, error: " + native::to_string(ret));
+    }
+}
+} // namespace native
diff --git a/src/native_device_api/l0/primitives.cpp b/src/native_device_api/l0/primitives.cpp
index c9a7a09c4..3387ed48e 100644
--- a/src/native_device_api/l0/primitives.cpp
+++ b/src/native_device_api/l0/primitives.cpp
@@ -30,6 +30,27 @@
 #include "common/log/log.hpp"
 
 namespace native {
+
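+/* block until the event is signaled or the timeout (in nanoseconds) expires; returns true when signaled */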
+bool event::wait(uint64_t nanosec) const {
+    ze_result_t ret = zeEventHostSynchronize(handle, nanosec);
+    if (ret != ZE_RESULT_SUCCESS && ret != ZE_RESULT_NOT_READY) {
+        CCL_THROW("zeEventHostSynchronize, error: " + native::to_string(ret));
+    }
+    return ret == ZE_RESULT_SUCCESS;
+}
+
+ze_result_t event::status() const {
+    return zeEventQueryStatus(handle);
+}
+
+void event::signal() {
+    ze_result_t ret = zeEventHostSignal(handle);
+    if (ret != ZE_RESULT_SUCCESS) {
+        CCL_THROW("zeEventHostSignal, error: " + native::to_string(ret));
+    }
+}
+
 namespace detail {
 CCL_BE_API void copy_memory_sync_unsafe(void* dst,
                                         const void* src,
@@ -46,7 +67,7 @@ CCL_BE_API void copy_memory_sync_unsafe(void* dst,
     ze_command_queue_desc_t queue_description = device->get_default_queue_desc();
     //queue_description.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;   TODO may be &= for flags???
 
-    auto queue = device->create_cmd_queue(ctx, queue_description);
+    auto& queue = device->get_cmd_queue(queue_description, ctx);
 
     //create list
     ze_command_list_desc_t list_description = device->get_default_list_desc();
@@ -96,6 +117,28 @@ CCL_BE_API void copy_memory_sync_unsafe(void* dst,
     memcpy(dst, src, size);
 }
 
+/*
+
+event<ccl_device, ccl_context>
+copy_memory_async_unsafe(void* dst,
+                             const void* src,
+                             size_t size,
+                             std::weak_ptr<ccl_device> device_weak,
+                             std::shared_ptr<ccl_context> ctx,
+                             queue<ccl_device, ccl_context>& q);
+event<ccl_device, ccl_context>
+copy_memory_async_unsafe(void* dst,
+                             const void* src,
+                             size_t size,
+                             std::weak_ptr<ccl_context> ctx_weak,
+                             std::shared_ptr<ccl_context> ctx,
+                             queue<ccl_device, ccl_context>& q)
+{
+    (void)q;
+    copy_memory_sync_unsafe(dst, src, size, ctx_weak, ctx);
+    event<ccl_device, ccl_context> e(h, get_ptr(), ctx);
+}
+*/
 } // namespace detail
 
 std::string get_build_log_string(const ze_module_build_log_handle_t& build_log) {
diff --git a/src/parallelizer/parallelizer.cpp b/src/parallelizer/parallelizer.cpp
index 7361df9a6..afb0b5bd9 100644
--- a/src/parallelizer/parallelizer.cpp
+++ b/src/parallelizer/parallelizer.cpp
@@ -14,6 +14,7 @@
  limitations under the License.
 */
 #include <algorithm>
+#include <numeric>
 
 #include "coll/selection/selection.hpp"
 #include "common/global/global.hpp"
@@ -82,6 +83,159 @@ ccl::status ccl_parallelizer_prologue_get_dtype(const void* ctx, void* field_ptr
 }
 
 ccl::status ccl_parallelizer::process(ccl_master_sched* sched) {
+    process_base(sched);
+
+#ifdef CCL_ENABLE_SYCL
+    ccl_coll_param& coll_param = sched->coll_param;
+    if (coll_param.stream && coll_param.stream->is_sycl_device_stream() &&
+        (coll_param.device_send_buf || coll_param.device_recv_buf)) {
+        process_pre_post_copies(sched);
+    }
+#endif /* CCL_ENABLE_SYCL */
+
+    /* should be the last call in the sequence of process_* calls
+       because it sets dependencies for all partial schedules
+       which already should be filled */
+    process_deps(sched);
+
+    return ccl::status::success;
+}
+
+ccl::status ccl_parallelizer::process_deps(ccl_master_sched* sched) {
+    auto& part_scheds = sched->partial_scheds;
+    ccl_sched* deps_sched = part_scheds[0].get();
+    size_t part_count = part_scheds.size();
+
+    for (size_t idx = 0; idx < part_count; idx++) {
+        part_scheds[idx]->set_add_mode(ccl_sched_add_front);
+    }
+    sched->sync_partial_scheds();
+
+    entry_factory::make_entry<deps_entry>(deps_sched);
+    deps_sched->add_barrier();
+
+    return ccl::status::success;
+}
+
+#ifdef CCL_ENABLE_SYCL
+ccl::status ccl_parallelizer::process_pre_post_copies(ccl_master_sched* sched) {
+    auto& part_scheds = sched->partial_scheds;
+    ccl_sched* copy_sched = part_scheds[0].get();
+    size_t part_count = part_scheds.size();
+
+    ccl_coll_param& coll_param = sched->coll_param;
+    ccl_comm* comm = coll_param.comm;
+    int comm_size = comm->size();
+    int my_rank = comm->rank();
+
+    const ccl_datatype& dtype = coll_param.dtype;
+    size_t dtype_size = dtype.size();
+
+    ccl_coll_type coll_type = coll_param.ctype;
+
+    size_t d2h_bytes = 0, h2d_bytes = 0;
+    size_t d2h_count = 0, h2d_count = 0;
+
+    void* device_in_buf = nullptr;
+    void* device_out_buf = nullptr;
+    void* host_in_buf = nullptr;
+    void* host_out_buf = nullptr;
+
+    size_t device_in_buf_offset = 0;
+
+    switch (coll_type) {
+        case ccl_coll_bcast:
+            if (my_rank == coll_param.root)
+                d2h_count = coll_param.count;
+            else
+                d2h_count = 0;
+            h2d_count = coll_param.count;
+            break;
+
+        case ccl_coll_reduce:
+            d2h_count = coll_param.count;
+            if (my_rank == coll_param.root)
+                h2d_count = coll_param.count;
+            else
+                h2d_count = 0;
+            break;
+
+        case ccl_coll_reduce_scatter:
+            d2h_count = coll_param.count * comm_size;
+            h2d_count = coll_param.count;
+            break;
+
+        case ccl_coll_allreduce: d2h_count = h2d_count = coll_param.count; break;
+
+        case ccl_coll_allgatherv:
+            if (coll_param.device_send_buf == coll_param.device_recv_buf) {
+                device_in_buf_offset = std::accumulate(
+                    coll_param.recv_counts, coll_param.recv_counts + my_rank, size_t(0));
+                LOG_TRACE("device_in_buf_offset = ", device_in_buf_offset);
+            }
+            d2h_count = coll_param.send_count;
+            h2d_count = std::accumulate(
+                coll_param.recv_counts, coll_param.recv_counts + comm_size, size_t(0));
+            break;
+
+        case ccl_coll_alltoall: d2h_count = h2d_count = coll_param.count * comm_size; break;
+        case ccl_coll_alltoallv:
+            d2h_count = std::accumulate(
+                coll_param.send_counts, coll_param.send_counts + comm_size, size_t(0));
+            h2d_count = std::accumulate(
+                coll_param.recv_counts, coll_param.recv_counts + comm_size, size_t(0));
+            break;
+
+        default: CCL_FATAL("unexpected coll_type ", coll_type); break;
+    }
+
+    device_in_buf = &(coll_param.device_send_buf);
+    host_in_buf = (void*)coll_param.send_buf;
+    d2h_bytes = d2h_count * dtype_size;
+
+    host_out_buf = coll_param.recv_buf;
+    device_out_buf = &(coll_param.device_recv_buf);
+    h2d_bytes = h2d_count * dtype_size;
+
+    if (d2h_bytes) {
+        for (size_t idx = 0; idx < part_count; idx++) {
+            part_scheds[idx]->set_add_mode(ccl_sched_add_front);
+        }
+        sched->sync_partial_scheds();
+
+        entry_factory::make_entry<sycl_copy_entry>(
+            copy_sched,
+            copy_direction::d2h,
+            ccl_buffer(device_in_buf, d2h_bytes, ccl_buffer_type::INDIRECT),
+            ccl_buffer(host_in_buf, d2h_bytes),
+            d2h_count,
+            dtype,
+            coll_param.stream,
+            device_in_buf_offset);
+    }
+
+    if (h2d_bytes) {
+        for (size_t idx = 0; idx < part_count; idx++) {
+            part_scheds[idx]->set_add_mode(ccl_sched_add_back);
+        }
+        sched->sync_partial_scheds();
+
+        entry_factory::make_entry<sycl_copy_entry>(
+            copy_sched,
+            copy_direction::h2d,
+            ccl_buffer(host_out_buf, h2d_bytes),
+            ccl_buffer(device_out_buf, h2d_bytes, ccl_buffer_type::INDIRECT),
+            h2d_count,
+            dtype,
+            coll_param.stream);
+        part_scheds[0]->add_barrier();
+    }
+
+    return ccl::status::success;
+}
+#endif /* CCL_ENABLE_SYCL */
+
+ccl::status ccl_parallelizer::process_base(ccl_master_sched* sched) {
     /* TODO: split on per-collective classes */
 
     CCL_ASSERT(sched);
@@ -238,6 +392,7 @@ ccl::status ccl_parallelizer::process(ccl_master_sched* sched) {
         ccl_coll_param part_coll_param{};
         part_coll_param.ctype = sched->coll_param.ctype;
         part_coll_param.dtype = sched->coll_param.dtype;
+        part_coll_param.stream = sched->coll_param.stream;
         part_coll_param.comm = comm;
         sched->add_partial_sched(part_coll_param);
     }
@@ -273,10 +428,10 @@ ccl::status ccl_parallelizer::process(ccl_master_sched* sched) {
             if (coll_type == ccl_coll_alltoallv) {
                 CCL_ASSERT(coll_param.send_counts);
                 CCL_ASSERT(coll_param.recv_counts);
-                for (idx = 0; idx < comm_size; idx++) {
-                    a2av_send_count += coll_param.send_counts[idx];
-                    a2av_recv_count += coll_param.recv_counts[idx];
-                }
+                a2av_send_count = std::accumulate(
+                    coll_param.send_counts, coll_param.send_counts + comm_size, size_t(0));
+                a2av_recv_count = std::accumulate(
+                    coll_param.recv_counts, coll_param.recv_counts + comm_size, size_t(0));
             }
             else {
                 a2av_send_count = coll_param.count * comm_size;
@@ -292,19 +447,16 @@ ccl::status ccl_parallelizer::process(ccl_master_sched* sched) {
             offsets[0] = 0;
             if (ag_algo == ccl_coll_allgatherv_direct || ag_algo == ccl_coll_allgatherv_naive ||
                 ag_algo == ccl_coll_allgatherv_ring) {
-                for (idx = 0; idx < comm_size; idx++)
-                    ag_recv_count += recv_counts[idx];
-                ag_recv_bytes = ag_recv_count * dtype_size;
             }
             else {
-                ag_recv_count = counts[0];
                 for (idx = 1; idx < comm_size; idx++) {
                     counts[idx] = recv_counts[idx];
                     offsets[idx] = offsets[idx - 1] + counts[idx - 1] * dtype_size;
-                    ag_recv_count += counts[idx];
                 }
-                ag_recv_bytes = ag_recv_count * dtype_size;
             }
+            ag_recv_count = std::accumulate(
+                coll_param.recv_counts, coll_param.recv_counts + comm_size, size_t(0));
+            ag_recv_bytes = ag_recv_count * dtype_size;
             break;
         default: CCL_FATAL("unexpected coll_type ", coll_type); break;
     }
@@ -322,71 +474,23 @@ ccl::status ccl_parallelizer::process(ccl_master_sched* sched) {
             sched->sync_partial_scheds();
             break;
         case ccl_coll_bcast:
-#ifdef CCL_ENABLE_SYCL
-            /* convert sycl buffer */
-            if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
-                if (comm->rank() == coll_param.root) {
-                    entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>(
-                        part_scheds[0].get(),
-                        ccl_buffer(&(coll_param.sycl_buf),
-                                   coll_param.count * dtype_size,
-                                   ccl_buffer_type::INDIRECT),
-                        ccl_buffer(coll_param.buf, coll_param.count * dtype_size),
-                        coll_param.count,
-                        dtype,
-                        coll_param.stream);
-                }
-                sched->sync_partial_scheds();
-            }
-#endif /* CCL_ENABLE_SYCL */
             for (idx = 0; idx < part_count; idx++) {
                 ccl_coll_entry_param param{};
                 param.ctype = ccl_coll_bcast;
-                param.buf = ccl_buffer(&(coll_param.buf),
-                                       coll_param.count * dtype_size,
-                                       offsets[idx],
-                                       ccl_buffer_type::INDIRECT);
+                param.recv_buf = ccl_buffer(&(coll_param.recv_buf),
+                                            coll_param.count * dtype_size,
+                                            offsets[idx],
+                                            ccl_buffer_type::INDIRECT);
                 param.count = counts[idx];
                 param.dtype = dtype;
                 param.root = coll_param.root;
                 param.comm = comm;
                 coll_entry_helper::add_coll_entry<ccl_coll_bcast>(part_scheds[idx].get(), param);
             }
-#ifdef CCL_ENABLE_SYCL
-            /* convert sycl buffer */
-            if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
-                sched->sync_partial_scheds();
-                entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>(
-                    part_scheds[0].get(),
-                    ccl_buffer(coll_param.buf, coll_param.count * dtype_size),
-                    ccl_buffer(&(coll_param.sycl_buf),
-                               coll_param.count * dtype_size,
-                               ccl_buffer_type::INDIRECT),
-                    coll_param.count,
-                    dtype,
-                    coll_param.stream);
-            }
-#endif /* CCL_ENABLE_SYCL */
             break;
 
         case ccl_coll_reduce:
             for (idx = 0; idx < part_count; idx++) {
-#ifdef CCL_ENABLE_SYCL
-                /* convert sycl buffer */
-                if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
-                    entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>(
-                        part_scheds[0].get(),
-                        ccl_buffer(&(coll_param.sycl_send_buf),
-                                   coll_param.count * dtype_size,
-                                   ccl_buffer_type::INDIRECT),
-                        ccl_buffer((void*)coll_param.send_buf, coll_param.count * dtype_size),
-                        coll_param.count,
-                        dtype,
-                        coll_param.stream);
-                    sched->sync_partial_scheds();
-                }
-#endif /* CCL_ENABLE_SYCL */
-
                 ccl_coll_entry_param param{};
                 param.ctype = ccl_coll_reduce;
                 param.send_buf = ccl_buffer(&(coll_param.send_buf),
@@ -404,44 +508,10 @@ ccl::status ccl_parallelizer::process(ccl_master_sched* sched) {
                 param.comm = comm;
                 coll_entry_helper::add_coll_entry<ccl_coll_reduce>(part_scheds[idx].get(), param);
             }
-#ifdef CCL_ENABLE_SYCL
-            /* convert sycl buffer */
-            if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
-                sched->sync_partial_scheds();
-                if (comm->rank() == coll_param.root) {
-                    entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>(
-                        part_scheds[0].get(),
-                        ccl_buffer(coll_param.recv_buf, coll_param.count * dtype_size),
-                        ccl_buffer(&(coll_param.sycl_recv_buf),
-                                   coll_param.count * dtype_size,
-                                   ccl_buffer_type::INDIRECT),
-                        coll_param.count,
-                        dtype,
-                        coll_param.stream);
-                }
-            }
-#endif /* CCL_ENABLE_SYCL */
             break;
 
         case ccl_coll_reduce_scatter:
             for (idx = 0; idx < part_count; idx++) {
-#ifdef CCL_ENABLE_SYCL
-                /* convert sycl buffer */
-                if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
-                    entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>(
-                        part_scheds[0].get(),
-                        ccl_buffer(&(coll_param.sycl_send_buf),
-                                   coll_param.count * comm_size * dtype_size,
-                                   ccl_buffer_type::INDIRECT),
-                        ccl_buffer((void*)coll_param.send_buf,
-                                   coll_param.count * comm_size * dtype_size),
-                        coll_param.count * comm_size,
-                        dtype,
-                        coll_param.stream);
-                    sched->sync_partial_scheds();
-                }
-#endif /* CCL_ENABLE_SYCL */
-
                 ccl_coll_entry_param param{};
                 param.ctype = ccl_coll_reduce_scatter;
 
@@ -463,42 +533,10 @@ ccl::status ccl_parallelizer::process(ccl_master_sched* sched) {
                 coll_entry_helper::add_coll_entry<ccl_coll_reduce_scatter>(part_scheds[idx].get(),
                                                                            param);
             }
-#ifdef CCL_ENABLE_SYCL
-            /* convert sycl buffer */
-            if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
-                sched->sync_partial_scheds();
-                entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>(
-                    part_scheds[0].get(),
-                    ccl_buffer(coll_param.recv_buf, coll_param.count * dtype_size),
-                    ccl_buffer(&(coll_param.sycl_recv_buf),
-                               coll_param.count * dtype_size,
-                               ccl_buffer_type::INDIRECT),
-                    coll_param.count,
-                    dtype,
-                    coll_param.stream);
-            }
-#endif /* CCL_ENABLE_SYCL */
             break;
 
         case ccl_coll_allreduce: {
             ccl_parallelizer_prologue_ctx* main_ctx = nullptr;
-
-#ifdef CCL_ENABLE_SYCL
-            /* convert sycl buffer */
-            if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
-                entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>(
-                    part_scheds[0].get(),
-                    ccl_buffer(&(coll_param.sycl_send_buf),
-                               coll_param.count * dtype_size,
-                               ccl_buffer_type::INDIRECT),
-                    ccl_buffer((void*)coll_param.send_buf, coll_param.count * dtype_size),
-                    coll_param.count,
-                    dtype,
-                    coll_param.stream);
-                sched->sync_partial_scheds();
-            }
-#endif /* CCL_ENABLE_SYCL */
-
             if (coll_attr->prologue_fn) {
                 part_ctxs.reserve(part_count);
 
@@ -622,49 +660,10 @@ ccl::status ccl_parallelizer::process(ccl_master_sched* sched) {
                         ccl_parallelizer_prologue_get_dtype, main_ctx, false);
                 }
             }
-
-#ifdef CCL_ENABLE_SYCL
-            /* convert sycl buffer */
-            if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
-                sched->sync_partial_scheds();
-                entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>(
-                    part_scheds[0].get(),
-                    ccl_buffer(coll_param.recv_buf, coll_param.count * dtype_size),
-                    ccl_buffer(&(coll_param.sycl_recv_buf),
-                               coll_param.count * dtype_size,
-                               ccl_buffer_type::INDIRECT),
-                    coll_param.count,
-                    dtype,
-                    coll_param.stream);
-            }
-#endif /* CCL_ENABLE_SYCL */
-
             break;
         }
+
         case ccl_coll_allgatherv: {
-#ifdef CCL_ENABLE_SYCL
-            /* convert sycl buffer */
-            if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
-                size_t sycl_buf_offset = 0;
-                if (coll_param.sycl_send_buf == coll_param.sycl_recv_buf) {
-                    for (int i = 0; i < my_rank; i++) {
-                        sycl_buf_offset += coll_param.recv_counts[i];
-                    }
-                    LOG_TRACE("sycl_buf_offset = ", sycl_buf_offset);
-                }
-                entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>(
-                    part_scheds[0].get(),
-                    ccl_buffer(&(coll_param.sycl_send_buf),
-                               coll_param.send_count * dtype_size,
-                               ccl_buffer_type::INDIRECT),
-                    ccl_buffer((void*)coll_param.send_buf, coll_param.send_count * dtype_size),
-                    coll_param.send_count,
-                    dtype,
-                    coll_param.stream,
-                    sycl_buf_offset);
-                sched->sync_partial_scheds();
-            }
-#endif /* CCL_ENABLE_SYCL */
             if (ag_algo == ccl_coll_allgatherv_direct || ag_algo == ccl_coll_allgatherv_naive ||
                 ag_algo == ccl_coll_allgatherv_ring) {
                 ccl_coll_entry_param param{};
@@ -776,7 +775,7 @@ ccl::status ccl_parallelizer::process(ccl_master_sched* sched) {
                     for (idx = 0; idx < comm_size; idx++) {
                         ccl_coll_entry_param param{};
                         param.ctype = ccl_coll_bcast;
-                        param.buf = ag_recv_bufs[idx];
+                        param.recv_buf = ag_recv_bufs[idx];
                         param.count = counts[idx];
                         param.dtype = dtype;
                         param.root = idx;
@@ -786,39 +785,11 @@ ccl::status ccl_parallelizer::process(ccl_master_sched* sched) {
                     }
                 }
             }
-#ifdef CCL_ENABLE_SYCL
-            /* convert sycl buffer */
-            if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
-                sched->sync_partial_scheds();
-                entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>(
-                    part_scheds[0].get(),
-                    ccl_buffer(coll_param.recv_buf, ag_recv_bytes),
-                    ccl_buffer(
-                        &(coll_param.sycl_recv_buf), ag_recv_bytes, ccl_buffer_type::INDIRECT),
-                    ag_recv_count,
-                    dtype,
-                    coll_param.stream);
-            }
-#endif /* CCL_ENABLE_SYCL */
             break;
         }
+
         case ccl_coll_alltoall:
         case ccl_coll_alltoallv: {
-#ifdef CCL_ENABLE_SYCL
-            /* convert sycl buffer */
-            if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
-                entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>(
-                    part_scheds[0].get(),
-                    ccl_buffer(
-                        &(coll_param.sycl_send_buf), a2av_send_bytes, ccl_buffer_type::INDIRECT),
-                    ccl_buffer((void*)coll_param.send_buf, a2av_send_bytes),
-                    a2av_send_count,
-                    dtype,
-                    coll_param.stream);
-                sched->sync_partial_scheds();
-            }
-#endif /* CCL_ENABLE_SYCL */
-
             if (a2a_algo == ccl_coll_alltoall_naive || a2av_algo == ccl_coll_alltoallv_naive) {
                 ccl_coll_build_naive_alltoallv(sched, part_scheds_vector, coll_param);
             }
@@ -852,22 +823,9 @@ ccl::status ccl_parallelizer::process(ccl_master_sched* sched) {
                                                                           param);
                 }
             }
-#ifdef CCL_ENABLE_SYCL
-            /* convert sycl buffer */
-            if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
-                sched->sync_partial_scheds();
-                entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>(
-                    part_scheds[0].get(),
-                    ccl_buffer(coll_param.recv_buf, a2av_recv_bytes),
-                    ccl_buffer(
-                        &(coll_param.sycl_recv_buf), a2av_recv_bytes, ccl_buffer_type::INDIRECT),
-                    a2av_recv_count,
-                    dtype,
-                    coll_param.stream);
-            }
-#endif /* CCL_ENABLE_SYCL */
             break;
         }
+
         case ccl_coll_sparse_allreduce: {
             ccl_parallelizer_sparse_callback_ctx* i_ctx =
                 (ccl_parallelizer_sparse_callback_ctx*)part_scheds[0]
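
process() is now a thin driver: process_base() builds the partial schedules, the optional SYCL pre/post copies are placed around them, and process_deps() goes last. The manual count-summing loops are expressed with std::accumulate; a small sketch of that idiom, assuming size_t element counts as in ccl_coll_param:

#include <cstddef>
#include <numeric>

// sums per-rank element counts; the size_t(0) initial value keeps the
// accumulation in size_t (a bare 0 literal would accumulate into int)
size_t total_count(const size_t* counts, int comm_size) {
    return std::accumulate(counts, counts + comm_size, size_t(0));
}
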
diff --git a/src/parallelizer/parallelizer.hpp b/src/parallelizer/parallelizer.hpp
index 008bccd42..6c3d30e44 100644
--- a/src/parallelizer/parallelizer.hpp
+++ b/src/parallelizer/parallelizer.hpp
@@ -34,5 +34,13 @@ class ccl_parallelizer {
     ccl::status process(ccl_master_sched* sched);
 
 private:
+    ccl::status process_deps(ccl_master_sched* sched);
+
+#ifdef CCL_ENABLE_SYCL
+    ccl::status process_pre_post_copies(ccl_master_sched* sched);
+#endif /* CCL_ENABLE_SYCL */
+
+    ccl::status process_base(ccl_master_sched* sched);
+
     size_t max_data_partition_count;
 };
diff --git a/src/sched/entry/coll/coll_entry.cpp b/src/sched/entry/coll/coll_entry.cpp
index 0fc91107c..7a97b75c0 100644
--- a/src/sched/entry/coll/coll_entry.cpp
+++ b/src/sched/entry/coll/coll_entry.cpp
@@ -24,8 +24,9 @@ void coll_entry::start() {
 
     if (!coll_sched) {
         ccl_coll_param coll_param{};
-        coll_param.comm = sched->coll_param.comm;
         coll_param.ctype = param.ctype;
+        coll_param.comm = sched->coll_param.comm;
+        coll_param.stream = sched->coll_param.stream;
         coll_sched.reset(new ccl_extra_sched(coll_param, sched->sched_id));
         coll_sched->set_op_id(coll_sched_op_id);
 
diff --git a/src/sched/entry/coll/coll_entry.hpp b/src/sched/entry/coll/coll_entry.hpp
index 6b8fdfa10..d3fa6bdb5 100644
--- a/src/sched/entry/coll/coll_entry.hpp
+++ b/src/sched/entry/coll/coll_entry.hpp
@@ -15,12 +15,13 @@
 */
 #pragma once
 
+#include "common/global/global.hpp"
+#include "comp/comp.hpp"
 #include "sched/entry/coll/coll_entry_param.hpp"
 #include "sched/entry/entry.hpp"
 
 class coll_entry : public sched_entry,
                    public postponed_fields<coll_entry,
-                                           ccl_sched_entry_field_buf,
                                            ccl_sched_entry_field_send_buf,
                                            ccl_sched_entry_field_recv_buf,
                                            ccl_sched_entry_field_cnt,
@@ -53,10 +54,6 @@ class coll_entry : public sched_entry,
         return class_name();
     }
 
-    ccl_buffer& get_field_ref(field_id_t<ccl_sched_entry_field_buf> id) {
-        return param.buf;
-    }
-
     ccl_buffer& get_field_ref(field_id_t<ccl_sched_entry_field_send_buf> id) {
         return param.send_buf;
     }
@@ -84,8 +81,6 @@ class coll_entry : public sched_entry,
                            ccl::global_data::get().dtypes->name(param.dtype),
                            ", coll_type ",
                            ccl_coll_type_to_str(param.ctype),
-                           ", buf ",
-                           param.buf,
                            ", send_buf ",
                            param.send_buf,
                            ", recv_buf ",
diff --git a/src/sched/entry/coll/coll_entry_helper.cpp b/src/sched/entry/coll/coll_entry_helper.cpp
index 426e3a1aa..344f73b92 100644
--- a/src/sched/entry/coll/coll_entry_helper.cpp
+++ b/src/sched/entry/coll/coll_entry_helper.cpp
@@ -71,7 +71,7 @@ ccl::status coll_entry_helper::build_schedule(ccl_sched* sched,
         }
         case ccl_coll_bcast: {
             res = ccl_coll_build_bcast(
-                sched, param.buf, param.count, param.dtype, param.root, param.comm);
+                sched, param.recv_buf, param.count, param.dtype, param.root, param.comm);
             break;
         }
         case ccl_coll_reduce: {
diff --git a/src/sched/entry/coll/coll_entry_param.hpp b/src/sched/entry/coll/coll_entry_param.hpp
index 64ce1d97d..431bf0997 100644
--- a/src/sched/entry/coll/coll_entry_param.hpp
+++ b/src/sched/entry/coll/coll_entry_param.hpp
@@ -19,7 +19,6 @@
 
 struct ccl_coll_entry_param {
     ccl_coll_type ctype;
-    ccl_buffer buf;
     ccl_buffer send_buf;
     ccl_buffer recv_buf;
     size_t count;
@@ -30,20 +29,22 @@ struct ccl_coll_entry_param {
     ccl::reduction reduction;
     int root;
     ccl_comm* comm;
+    ccl_stream* stream;
 
     ccl_coll_param to_coll_param() const {
         ccl_coll_param param;
         param.ctype = ctype;
-        param.buf = buf.get_ptr();
         param.send_buf = send_buf.get_ptr();
         param.recv_buf = recv_buf.get_ptr();
         param.count = count;
         param.send_count = send_count;
+        param.send_counts = send_counts;
         param.recv_counts = recv_counts;
         param.dtype = dtype;
         param.reduction = reduction;
         param.root = root;
         param.comm = comm;
+        param.stream = stream;
         return param;
     }
 };
diff --git a/src/sched/entry/coll/direct/alltoallv_entry.hpp b/src/sched/entry/coll/direct/alltoallv_entry.hpp
index 8e13f7a20..882a38e4b 100644
--- a/src/sched/entry/coll/direct/alltoallv_entry.hpp
+++ b/src/sched/entry/coll/direct/alltoallv_entry.hpp
@@ -15,6 +15,7 @@
 */
 #pragma once
 
+#include "common/comm/comm.hpp"
 #include "sched/entry/coll/direct/base_coll_entry.hpp"
 
 class alltoallv_entry : public base_coll_entry {
diff --git a/src/sched/entry/coll/direct/base_coll_entry.hpp b/src/sched/entry/coll/direct/base_coll_entry.hpp
index 7c8284e12..96648371c 100644
--- a/src/sched/entry/coll/direct/base_coll_entry.hpp
+++ b/src/sched/entry/coll/direct/base_coll_entry.hpp
@@ -16,6 +16,7 @@
 #pragma once
 
 #include "sched/entry/entry.hpp"
+#include "sched/queue/queue.hpp"
 
 class base_coll_entry : public sched_entry {
 public:
diff --git a/src/sched/entry/copy/copy_entry.hpp b/src/sched/entry/copy/copy_entry.hpp
new file mode 100644
index 000000000..d7a1e95e2
--- /dev/null
+++ b/src/sched/entry/copy/copy_entry.hpp
@@ -0,0 +1,162 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "sched/entry/copy/copy_helper.hpp"
+#include "sched/entry/entry.hpp"
+
+#ifdef CCL_ENABLE_SYCL
+#include <CL/sycl.hpp>
+#endif /* CCL_ENABLE_SYCL */
+
+class copy_entry : public sched_entry,
+                   public postponed_fields<copy_entry,
+                                           ccl_sched_entry_field_in_buf,
+                                           ccl_sched_entry_field_cnt,
+                                           ccl_sched_entry_field_dtype> {
+public:
+    static constexpr const char* class_name() noexcept {
+        return "COPY";
+    }
+
+    copy_entry() = delete;
+    copy_entry(ccl_sched* sched,
+               const ccl_buffer in_buf,
+               ccl_buffer out_buf,
+               size_t count,
+               const ccl_datatype& dtype,
+               size_t in_buf_offset = 0)
+            : sched_entry(sched),
+              in_buf(in_buf),
+              out_buf(out_buf),
+              count(count),
+              dtype(dtype),
+              in_buf_offset(in_buf_offset) {}
+
+    void start() override {
+        update_fields();
+
+#ifdef CCL_ENABLE_SYCL
+        ccl_stream* stream = (ccl_stream*)sched->coll_param.stream;
+
+        if (!stream) {
+            do_regular_copy();
+            return;
+        }
+
+        sycl::queue* q = stream->get_native_stream(sched->queue->get_idx());
+        CCL_THROW_IF_NOT(q, "null sycl queue");
+        auto in_ptr_type = sycl::get_pointer_type(in_buf.get_ptr(), q->get_context());
+        auto out_ptr_type = sycl::get_pointer_type(out_buf.get_ptr(), q->get_context());
+
+        LOG_DEBUG("in_ptr_type: ",
+                  native::detail::usm_to_string(in_ptr_type),
+                  ", out_ptr_type: ",
+                  native::detail::usm_to_string(out_ptr_type),
+                  ", native_stream: ",
+                  stream->to_string(),
+                  ", count: ",
+                  count);
+
+        if ((in_ptr_type != sycl::usm::alloc::device) &&
+            (out_ptr_type != sycl::usm::alloc::device)) {
+            do_regular_copy();
+            return;
+        }
+
+        /* the early return above guarantees at least one side is device USM */
+        copy_direction direction;
+
+        if ((in_ptr_type == sycl::usm::alloc::device) &&
+            (out_ptr_type == sycl::usm::alloc::device)) {
+            direction = copy_direction::d2d;
+        }
+        else if (in_ptr_type == sycl::usm::alloc::device) {
+            direction = copy_direction::d2h;
+        }
+        else {
+            /* host or shared source, device destination */
+            direction = copy_direction::h2d;
+        }
+
+        copier = sycl_copier(direction, in_buf, out_buf, count, dtype, in_buf_offset);
+        copier.set_queue(q);
+        ccl_tuple_for_each_indexed<ccl_sycl_buffer_one_dim_types>(copier);
+        status = ccl_sched_entry_status_started;
+#else /* CCL_ENABLE_SYCL */
+        do_regular_copy();
+#endif /* CCL_ENABLE_SYCL */
+    }
+
+    void update() override {
+#ifdef CCL_ENABLE_SYCL
+        if (copier.is_completed()) {
+            status = ccl_sched_entry_status_complete;
+        }
+#endif /* CCL_ENABLE_SYCL */
+    }
+
+    void do_regular_copy() {
+        size_t bytes = count * dtype.size();
+        auto comp_status =
+            ccl_comp_copy(in_buf.get_ptr(bytes), out_buf.get_ptr(bytes), count, dtype);
+        CCL_ASSERT(comp_status == ccl::status::success, "bad status ", comp_status);
+        status = ccl_sched_entry_status_complete;
+    }
+
+    const char* name() const override {
+        return class_name();
+    }
+
+    ccl_buffer& get_field_ref(field_id_t<ccl_sched_entry_field_in_buf> id) {
+        return in_buf;
+    }
+
+    size_t& get_field_ref(field_id_t<ccl_sched_entry_field_cnt> id) {
+        return count;
+    }
+
+    ccl_datatype& get_field_ref(field_id_t<ccl_sched_entry_field_dtype> id) {
+        return dtype;
+    }
+
+protected:
+    void dump_detail(std::stringstream& str) const override {
+        ccl_logger::format(str,
+                           "dt ",
+                           ccl::global_data::get().dtypes->name(dtype),
+                           ", count ",
+                           count,
+                           ", in_buf ",
+                           in_buf,
+                           ", out_buf ",
+                           out_buf,
+                           ", in_buf_offset ",
+                           in_buf_offset,
+                           "\n");
+    }
+
+private:
+    ccl_buffer in_buf;
+    ccl_buffer out_buf;
+    size_t count;
+    ccl_datatype dtype;
+    size_t in_buf_offset;
+
+#ifdef CCL_ENABLE_SYCL
+    sycl_copier copier;
+#endif /* CCL_ENABLE_SYCL */
+};
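
copy_entry chooses its path from the USM allocation kind of both pointers: when neither side is device memory it falls back to the plain host copy, otherwise it picks d2d/d2h/h2d and hands off to sycl_copier. A condensed sketch of that classification (detect_direction is an illustrative name; copy_direction comes from copy_helper.hpp):

#include <CL/sycl.hpp>
#include "sched/entry/copy/copy_helper.hpp"

// mirrors the dispatch in copy_entry::start(): at least one side is
// expected to be device USM before a SYCL copy is worthwhile
copy_direction detect_direction(sycl::queue& q, const void* src, const void* dst) {
    auto src_kind = sycl::get_pointer_type(src, q.get_context());
    auto dst_kind = sycl::get_pointer_type(dst, q.get_context());
    if (src_kind == sycl::usm::alloc::device && dst_kind == sycl::usm::alloc::device)
        return copy_direction::d2d;
    if (src_kind == sycl::usm::alloc::device)
        return copy_direction::d2h;
    return copy_direction::h2d; // host/shared source, device destination
}
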
diff --git a/src/sched/entry/sycl_entry_helper.cpp b/src/sched/entry/copy/copy_helper.cpp
similarity index 66%
rename from src/sched/entry/sycl_entry_helper.cpp
rename to src/sched/entry/copy/copy_helper.cpp
index d5beb2552..8854d22de 100644
--- a/src/sched/entry/sycl_entry_helper.cpp
+++ b/src/sched/entry/copy/copy_helper.cpp
@@ -13,10 +13,10 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "sched/entry/sycl_entry_helper.hpp"
+#include "sched/entry/copy/copy_helper.hpp"
 
-using sycl_copy_direction_str_enum =
-    utils::enum_to_str<utils::enum_to_underlying(sycl_copy_direction::h2d) + 1>;
-std::string to_string(sycl_copy_direction val) {
-    return sycl_copy_direction_str_enum({ "D2H", "H2D" }).choose(val, "UNKNOWN");
+using copy_direction_str_enum =
+    utils::enum_to_str<utils::enum_to_underlying(copy_direction::d2d) + 1>;
+std::string to_string(copy_direction val) {
+    return copy_direction_str_enum({ "D2H", "H2D", "D2D" }).choose(val, "UNKNOWN");
 }
diff --git a/src/sched/entry/sycl_entry_helper.hpp b/src/sched/entry/copy/copy_helper.hpp
similarity index 76%
rename from src/sched/entry/sycl_entry_helper.hpp
rename to src/sched/entry/copy/copy_helper.hpp
index 1a273198f..ec8a44e33 100644
--- a/src/sched/entry/sycl_entry_helper.hpp
+++ b/src/sched/entry/copy/copy_helper.hpp
@@ -22,20 +22,21 @@
 #include "common/utils/tuple.hpp"
 #include "oneapi/ccl/native_device_api/interop_utils.hpp"
 
-enum class sycl_copy_direction { d2h, h2d };
-
-std::string to_string(sycl_copy_direction val);
+enum class copy_direction { d2h, h2d, d2d };
+std::string to_string(copy_direction val);
 
 #ifdef CCL_ENABLE_SYCL
 
-template <sycl_copy_direction direction>
 struct sycl_copier {
-    sycl_copier(ccl_buffer in_buf,
+    sycl_copier() = default;
+    sycl_copier(copy_direction direction,
+                ccl_buffer in_buf,
                 ccl_buffer out_buf,
                 size_t count,
                 const ccl_datatype& dtype,
                 size_t in_buf_offset)
-            : in_buf(in_buf),
+            : direction(direction),
+              in_buf(in_buf),
               out_buf(out_buf),
               count(count),
               dtype(dtype),
@@ -68,8 +69,19 @@ struct sycl_copier {
             void* in_buf_ptr = in_buf.get_ptr(bytes);
             void* out_buf_ptr = out_buf.get_ptr(bytes);
 
-            void* void_device_ptr =
-                (direction == sycl_copy_direction::h2d) ? out_buf_ptr : in_buf_ptr;
+            size_t offset = in_buf_offset;
+
+            if (direction == copy_direction::d2d) {
+                e = q->submit([&](sycl::handler& h) {
+                    h.memcpy(out_buf_ptr,
+                             static_cast<typename specific_sycl_buffer::value_type*>(in_buf_ptr) +
+                                 offset,
+                             bytes);
+                });
+                return;
+            }
+
+            void* void_device_ptr = (direction == copy_direction::h2d) ? out_buf_ptr : in_buf_ptr;
 
             /*
               don't print this pointer through CCL logger
@@ -83,6 +95,7 @@ struct sycl_copier {
             auto device_ptr_type = sycl::get_pointer_type(device_ptr, q->get_context());
 
             CCL_THROW_IF_NOT((device_ptr_type == sycl::usm::alloc::device ||
+                              device_ptr_type == sycl::usm::alloc::shared ||
                               device_ptr_type == sycl::usm::alloc::unknown),
                              "unexpected USM type ",
                              native::detail::usm_to_string(device_ptr_type),
@@ -91,13 +104,13 @@ struct sycl_copier {
 
             specific_sycl_buffer* device_buf_ptr = nullptr;
 
-            if (device_ptr_type == sycl::usm::alloc::device) {
-                /* do nothing, provided device USM pointer can be used as is in copy kernel */
-            }
-            else {
+            if (device_ptr_type == sycl::usm::alloc::unknown) {
                 /* cast pointer into SYCL buffer */
                 device_buf_ptr = static_cast<specific_sycl_buffer*>(void_device_ptr);
             }
+            else {
+                /* do nothing, provided USM pointer can be used as is in copy kernel */
+            }
 
             LOG_DEBUG("count: ",
                       count,
@@ -120,20 +133,16 @@ struct sycl_copier {
                       ", device_ptr usm_type: ",
                       native::detail::usm_to_string(device_ptr_type));
 
-            size_t offset = in_buf_offset;
-
             if (device_buf_ptr) {
                 specific_sycl_buffer host_buf(
                     static_cast<typename specific_sycl_buffer::value_type*>(
-                        (direction == sycl_copy_direction::h2d) ? in_buf_ptr : out_buf_ptr),
+                        (direction == copy_direction::h2d) ? in_buf_ptr : out_buf_ptr),
                     count,
                     sycl::property::buffer::use_host_ptr{});
 
                 e = q->submit([&](sycl::handler& h) {
-                    auto& src_buf =
-                        (direction == sycl_copy_direction::h2d) ? host_buf : *device_buf_ptr;
-                    auto& dst_buf =
-                        (direction == sycl_copy_direction::h2d) ? *device_buf_ptr : host_buf;
+                    auto& src_buf = (direction == copy_direction::h2d) ? host_buf : *device_buf_ptr;
+                    auto& dst_buf = (direction == copy_direction::h2d) ? *device_buf_ptr : host_buf;
                     auto src_buf_acc =
                         src_buf.template get_access<sycl::access::mode::read>(h, count, offset);
                     auto dst_buf_acc = dst_buf.template get_access<sycl::access::mode::write>(h);
@@ -141,13 +150,12 @@ struct sycl_copier {
                 });
             }
             else {
-                e = q->memcpy(
-                    out_buf_ptr,
-                    static_cast<typename specific_sycl_buffer::value_type*>(in_buf_ptr) + offset,
-                    count * dtype.size());
-
-                /* TODO: remove explicit wait */
-                e.wait();
+                e = q->submit([&](sycl::handler& h) {
+                    h.memcpy(out_buf_ptr,
+                             static_cast<typename specific_sycl_buffer::value_type*>(in_buf_ptr) +
+                                 offset,
+                             bytes);
+                });
             }
         }
         else {
@@ -160,10 +168,11 @@ struct sycl_copier {
         }
     }
 
+    copy_direction direction;
     ccl_buffer in_buf;
     ccl_buffer out_buf;
     size_t count;
-    const ccl_datatype& dtype;
+    ccl_datatype dtype;
     sycl::queue* q;
     size_t in_buf_offset;
     sycl::event e;
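
sycl_copier no longer calls e.wait() after a USM memcpy; every path keeps the event returned by q->submit() so completion is observed later through is_completed(), polled from sycl_copy_entry::update(). The non-blocking pattern as a standalone sketch (helper names are illustrative):

#include <CL/sycl.hpp>

// enqueue the copy and hand the event back to the caller
sycl::event submit_usm_copy(sycl::queue& q, void* dst, const void* src, size_t bytes) {
    return q.submit([&](sycl::handler& h) {
        h.memcpy(dst, src, bytes);
    });
}

// poll instead of blocking, matching the entry's update() loop
bool is_done(const sycl::event& e) {
    return e.get_info<sycl::info::event::command_execution_status>() ==
           sycl::info::event_command_status::complete;
}
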
diff --git a/src/sched/entry/sycl_copy_entry.hpp b/src/sched/entry/copy/sycl_copy_entry.hpp
similarity index 82%
rename from src/sched/entry/sycl_copy_entry.hpp
rename to src/sched/entry/copy/sycl_copy_entry.hpp
index 8852e8a56..2ae7a2729 100644
--- a/src/sched/entry/sycl_copy_entry.hpp
+++ b/src/sched/entry/copy/sycl_copy_entry.hpp
@@ -17,18 +17,20 @@
 
 #ifdef CCL_ENABLE_SYCL
 
+#include "sched/entry/copy/copy_helper.hpp"
 #include "sched/entry/entry.hpp"
-#include "sched/entry/sycl_entry_helper.hpp"
 
 #include <CL/sycl.hpp>
 
-template <sycl_copy_direction direction>
 class sycl_copy_entry : public sched_entry {
 public:
-    static constexpr const char* class_name() noexcept;
+    static constexpr const char* class_name() noexcept {
+        return "SYCL_COPY";
+    }
 
     sycl_copy_entry() = delete;
     sycl_copy_entry(ccl_sched* sched,
+                    copy_direction direction,
                     ccl_buffer in_buf,
                     ccl_buffer out_buf,
                     size_t count,
@@ -36,13 +38,14 @@ class sycl_copy_entry : public sched_entry {
                     const ccl_stream* stream,
                     size_t offset = 0)
             : sched_entry(sched),
+              direction(direction),
               in_buf(in_buf),
               out_buf(out_buf),
               count(count),
               dtype(dtype),
               stream(stream),
               offset(offset),
-              copier(sycl_copier<direction>(in_buf, out_buf, count, dtype, offset)) {}
+              copier(sycl_copier(direction, in_buf, out_buf, count, dtype, offset)) {}
 
     void start() override {
         LOG_DEBUG(class_name(), ": in_buf ", in_buf, ", out_buf ", out_buf, ", count ", count);
@@ -65,7 +68,9 @@ class sycl_copy_entry : public sched_entry {
 protected:
     void dump_detail(std::stringstream& str) const override {
         ccl_logger::format(str,
-                           "  dtype ",
+                           "direction ",
+                           to_string(direction),
+                           ", dtype ",
                            ccl::global_data::get().dtypes->name(dtype),
                            ", count ",
                            count,
@@ -81,23 +86,14 @@ class sycl_copy_entry : public sched_entry {
     }
 
 private:
+    copy_direction direction;
     ccl_buffer in_buf;
     ccl_buffer out_buf;
     size_t count;
     ccl_datatype dtype;
     const ccl_stream* stream;
     size_t offset;
-    sycl_copier<direction> copier;
+    sycl_copier copier;
 };
 
-template <>
-constexpr const char* sycl_copy_entry<sycl_copy_direction::d2h>::class_name() noexcept {
-    return "SYCL_COPY_D2H";
-}
-
-template <>
-constexpr const char* sycl_copy_entry<sycl_copy_direction::h2d>::class_name() noexcept {
-    return "SYCL_COPY_H2D";
-}
-
 #endif /* CCL_ENABLE_SYCL */
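
Making the direction a constructor argument instead of a template parameter leaves a single sycl_copy_entry instantiation with one class_name(), and lets the parallelizer pick the direction at runtime, as the parallelizer.cpp hunks above already do:

// before: one instantiation per direction
//   entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>(sched, ...);
// after: direction is ordinary runtime state
entry_factory::make_entry<sycl_copy_entry>(
    sched, copy_direction::d2h, in_buf, out_buf, count, dtype, stream);
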
diff --git a/src/sched/entry/copy_entry.hpp b/src/sched/entry/copy_entry.hpp
deleted file mode 100644
index 68508e08d..000000000
--- a/src/sched/entry/copy_entry.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include "sched/entry/entry.hpp"
-
-class copy_entry : public sched_entry,
-                   public postponed_fields<copy_entry,
-                                           ccl_sched_entry_field_in_buf,
-                                           ccl_sched_entry_field_cnt,
-                                           ccl_sched_entry_field_dtype> {
-public:
-    static constexpr const char* class_name() noexcept {
-        return "COPY";
-    }
-
-    copy_entry() = delete;
-    copy_entry(ccl_sched* sched,
-               const ccl_buffer in_buf,
-               ccl_buffer out_buf,
-               size_t cnt,
-               const ccl_datatype& dtype)
-            : sched_entry(sched),
-              in_buf(in_buf),
-              out_buf(out_buf),
-              cnt(cnt),
-              dtype(dtype) {}
-
-    void start() override {
-        update_fields();
-
-        size_t bytes = cnt * dtype.size();
-        auto comp_status = ccl_comp_copy(in_buf.get_ptr(bytes), out_buf.get_ptr(bytes), cnt, dtype);
-        CCL_ASSERT(comp_status == ccl::status::success, "bad status ", comp_status);
-        status = ccl_sched_entry_status_complete;
-    }
-
-    const char* name() const override {
-        return class_name();
-    }
-
-    ccl_buffer& get_field_ref(field_id_t<ccl_sched_entry_field_in_buf> id) {
-        return in_buf;
-    }
-
-    size_t& get_field_ref(field_id_t<ccl_sched_entry_field_cnt> id) {
-        return cnt;
-    }
-
-    ccl_datatype& get_field_ref(field_id_t<ccl_sched_entry_field_dtype> id) {
-        return dtype;
-    }
-
-protected:
-    void dump_detail(std::stringstream& str) const override {
-        ccl_logger::format(str,
-                           "dt ",
-                           ccl::global_data::get().dtypes->name(dtype),
-                           ", cnt ",
-                           cnt,
-                           ", in_buf ",
-                           in_buf,
-                           ", out_buf ",
-                           out_buf,
-                           "\n");
-    }
-
-private:
-    ccl_buffer in_buf;
-    ccl_buffer out_buf;
-    size_t cnt;
-    ccl_datatype dtype;
-};
diff --git a/src/sched/entry/deps_entry.hpp b/src/sched/entry/deps_entry.hpp
new file mode 100644
index 000000000..81464cdac
--- /dev/null
+++ b/src/sched/entry/deps_entry.hpp
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "sched/entry/entry.hpp"
+
+class deps_entry : public sched_entry {
+public:
+    static constexpr const char* class_name() noexcept {
+        return "DEPS";
+    }
+
+    deps_entry() = delete;
+    deps_entry(ccl_sched* sched) : sched_entry(sched) {}
+
+    void start() override {
+        std::vector<ccl::event>& deps = sched->get_deps();
+        for (size_t idx = 0; idx < deps.size(); idx++) {
+#ifdef CCL_ENABLE_SYCL
+            /* TODO: detect pure sycl::event and ccl::event for device op */
+            deps[idx].get_native().wait();
+#else /* CCL_ENABLE_SYCL */
+            deps[idx].wait();
+#endif /* CCL_ENABLE_SYCL */
+        }
+        status = ccl_sched_entry_status_complete;
+    }
+
+    const char* name() const override {
+        return class_name();
+    }
+
+protected:
+    void dump_detail(std::stringstream& str) const override {
+        ccl_logger::format(str, "deps.size ", sched->get_deps().size(), "\n");
+    }
+};
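
deps_entry is what process_deps() prepends to the first partial schedule: it blocks until every event the caller passed as a dependency has completed. A hedged caller-side sketch, assuming the operation overloads that accept a deps vector (buffer and communicator setup omitted):

#include <vector>
#include "oneapi/ccl.hpp"

// the event from the first allreduce gates the second one through deps_entry
void chained_allreduce(const float* send1, float* recv1,
                       const float* send2, float* recv2,
                       size_t count, const ccl::communicator& comm, const ccl::stream& stream) {
    auto attr = ccl::create_operation_attr<ccl::allreduce_attr>();
    std::vector<ccl::event> deps;
    deps.push_back(ccl::allreduce(send1, recv1, count, ccl::reduction::sum, comm, stream));
    ccl::allreduce(send2, recv2, count, ccl::reduction::sum, comm, stream, attr, deps).wait();
}
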
diff --git a/src/sched/entry/entry.cpp b/src/sched/entry/entry.cpp
index 112a6f4f2..320838dc4 100644
--- a/src/sched/entry/entry.cpp
+++ b/src/sched/entry/entry.cpp
@@ -14,16 +14,36 @@
  limitations under the License.
 */
 #include "sched/entry/entry.hpp"
+#include "sched/sched.hpp"
 #include "common/log/log.hpp"
 
 void sched_entry::do_progress() {
+    if (is_completed())
+        return;
+
+    // TODO: remove this temporary workaround:
+    // L0 entries don't need take_credit/return_credit,
+    // so flow control is skipped for them
+    bool is_l0_entry = false;
+    const char* name_entry = this->name();
+
+    // guard against an empty or single-character entry name
+    if (strlen(name_entry) >= 2)
+        is_l0_entry = name_entry[0] == 'L' && name_entry[1] == '0';
+
     if (status < ccl_sched_entry_status_started) {
         CCL_ASSERT(
             status == ccl_sched_entry_status_not_started || status == ccl_sched_entry_status_again,
             "bad status ",
             status);
-        start();
-        CCL_ASSERT(status >= ccl_sched_entry_status_again, "bad status ", status);
+
+        if (is_l0_entry || sched->flow_control.take_credit()) {
+            start();
+            CCL_ASSERT(status >= ccl_sched_entry_status_again, "bad status ", status);
+        }
+        else {
+            status = ccl_sched_entry_status_again;
+        }
     }
     else if (status == ccl_sched_entry_status_started) {
         LOG_TRACE("update entry ", name());
@@ -31,9 +51,15 @@ void sched_entry::do_progress() {
         CCL_ASSERT(status >= ccl_sched_entry_status_started, "bad status ", status);
     }
 
+    if (status == ccl_sched_entry_status_complete && !is_l0_entry) {
+        sched->flow_control.return_credit();
+    }
+
     if (status == ccl_sched_entry_status_complete && exec_mode == ccl_sched_entry_exec_once) {
         status = ccl_sched_entry_status_complete_once;
     }
+
+    // TODO: what if status is ccl_sched_entry_status_failed or ccl_sched_entry_status_invalid?
 }
 
 bool sched_entry::is_completed() {
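
do_progress() now gates entry start on sched->flow_control.take_credit() and returns the credit once the entry completes, bounding how many entries a schedule can have in flight. The flow_control member itself is outside this hunk; a hypothetical minimal counterpart:

// hypothetical sketch of the assumed credit scheme: a fixed budget,
// taken before start() and returned on completion
class flow_control_sketch {
public:
    explicit flow_control_sketch(size_t max_credits) : credits(max_credits) {}

    bool take_credit() {
        if (credits == 0)
            return false;
        --credits;
        return true;
    }

    void return_credit() {
        ++credits;
    }

private:
    size_t credits;
};
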
diff --git a/src/sched/entry/entry.hpp b/src/sched/entry/entry.hpp
index 24fb2ec38..f816bd59a 100644
--- a/src/sched/entry/entry.hpp
+++ b/src/sched/entry/entry.hpp
@@ -20,6 +20,7 @@
 #include "common/utils/utils.hpp"
 #include "sched/entry/postponed_fields.hpp"
 #include "internal_types.hpp"
+
 #include <chrono>
 #include <memory>
 
diff --git a/src/sched/entry/factory/entry_factory.hpp b/src/sched/entry/factory/entry_factory.hpp
index 2d32e3b0c..83a3ab33e 100644
--- a/src/sched/entry/factory/entry_factory.hpp
+++ b/src/sched/entry/factory/entry_factory.hpp
@@ -15,24 +15,6 @@
 */
 #pragma once
 
-#include "sched/entry/factory/entry_factory.h"
-
-#include "sched/entry/send_entry.hpp"
-#include "sched/entry/recv_entry.hpp"
-#include "sched/entry/write_entry.hpp"
-#include "sched/entry/reduce_local_entry.hpp"
-#include "sched/entry/recv_reduce_entry.hpp"
-#include "sched/entry/copy_entry.hpp"
-#include "sched/entry/sync_entry.hpp"
-#include "sched/entry/prologue_entry.hpp"
-#include "sched/entry/epilogue_entry.hpp"
-#include "sched/entry/sparse_allreduce_completion_entry.hpp"
-#include "sched/entry/wait_value_entry.hpp"
-#include "sched/entry/function_entry.hpp"
-#include "sched/entry/probe_entry.hpp"
-#include "sched/entry/register_entry.hpp"
-#include "sched/entry/deregister_entry.hpp"
-#include "sched/entry/subsched_entry.hpp"
 #include "sched/entry/coll/coll_entry.hpp"
 #include "sched/entry/coll/direct/allgatherv_entry.hpp"
 #include "sched/entry/coll/direct/allreduce_entry.hpp"
@@ -43,9 +25,28 @@
 #include "sched/entry/coll/direct/reduce_entry.hpp"
 #include "sched/entry/coll/direct/reduce_scatter_entry.hpp"
 
+#include "sched/entry/factory/entry_factory.h"
+
+#include "sched/entry/copy/copy_entry.hpp"
 #ifdef CCL_ENABLE_SYCL
-#include "sched/entry/sycl_copy_entry.hpp"
+#include "sched/entry/copy/sycl_copy_entry.hpp"
 #endif /* CCL_ENABLE_SYCL */
+#include "sched/entry/deps_entry.hpp"
+#include "sched/entry/deregister_entry.hpp"
+#include "sched/entry/epilogue_entry.hpp"
+#include "sched/entry/function_entry.hpp"
+#include "sched/entry/probe_entry.hpp"
+#include "sched/entry/prologue_entry.hpp"
+#include "sched/entry/recv_entry.hpp"
+#include "sched/entry/recv_reduce_entry.hpp"
+#include "sched/entry/reduce_local_entry.hpp"
+#include "sched/entry/register_entry.hpp"
+#include "sched/entry/send_entry.hpp"
+#include "sched/entry/sparse_allreduce_completion_entry.hpp"
+#include "sched/entry/subsched_entry.hpp"
+#include "sched/entry/sync_entry.hpp"
+#include "sched/entry/wait_value_entry.hpp"
+#include "sched/entry/write_entry.hpp"
 
 #include "sched/sched.hpp"
 
diff --git a/src/sched/entry/l0/l0_allgatherv_typed_entry.hpp b/src/sched/entry/l0/l0_allgatherv_typed_entry.hpp
index be11e5cd1..805464937 100644
--- a/src/sched/entry/l0/l0_allgatherv_typed_entry.hpp
+++ b/src/sched/entry/l0/l0_allgatherv_typed_entry.hpp
@@ -22,9 +22,8 @@
 //TODO L0 Workaround
 
 namespace native {
-template <class kernel_params, class gpu_comm_impl, ccl::group_split_type topology>
-class l0_allgatherv_typed_entry : public base_gpu_entry<kernel_params,
-                                                        gpu_comm_impl,
+template <class gpu_comm_impl, ccl::group_split_type topology>
+class l0_allgatherv_typed_entry : public base_gpu_entry<gpu_comm_impl,
                                                         topology,
                                                         ccl::device_topology_type::ring,
                                                         ccl_coll_allgatherv> {
@@ -32,8 +31,7 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<kernel_params,
     friend class ccl_gpu_comm;
     friend class ccl_virtual_gpu_comm;
 
-    using base = base_gpu_entry<kernel_params,
-                                gpu_comm_impl,
+    using base = base_gpu_entry<gpu_comm_impl,
                                 topology,
                                 ccl::device_topology_type::ring,
                                 ccl_coll_allgatherv>;
@@ -45,17 +43,17 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<kernel_params,
     using base::kernel_router;
     using base::get_ctx;
     using base::get_local_kernel;
-    using kernel_main_typed = ring_allgatherv_kernel<kernel_params>;
-    using kernel_ipc_typed = ring_allgatherv_ipc<kernel_params>;
+    using kernel_main_typed = ring::allgatherv::main_kernel;
+    using processing_type = void;
 
     using income_data_flag_gpu_type =
-        typename std::remove_pointer<typename kernel_main_typed::income_data_flag_arg_type>::type;
+        typename std::remove_pointer<typename ring::allgatherv::income_data_flag_arg_type>::type;
     using ready_to_recv_flag_gpu_type =
-        typename std::remove_pointer<typename kernel_main_typed::ready_to_recv_flag_arg_type>::type;
+        typename std::remove_pointer<typename ring::allgatherv::ready_to_recv_flag_arg_type>::type;
     using recv_counts_typed_entry_type = typename std::remove_pointer<
-        typename kernel_main_typed::recv_elem_counts_buf_arg_type>::type;
+        typename ring::allgatherv::recv_elem_counts_buf_arg_type>::type;
     using recv_offsets_typed_entry_type = typename std::remove_pointer<
-        typename kernel_main_typed::recv_elem_offsets_buf_arg_type>::type;
+        typename ring::allgatherv::recv_elem_offsets_buf_arg_type>::type;
 
     static constexpr const char* class_name() noexcept {
         return "L0_ALLGATHERV_TYPED";
@@ -75,28 +73,24 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<kernel_params,
         size_t send_count,
         ccl_buffer recv_buf,
         const size_t* recv_counts,
+        const coll_param_gpu& params,
         std::shared_ptr<ccl_stream> device_stream = std::shared_ptr<ccl_stream>())
-            : base(sched,
-                   comm,
-                   in_ctx,
-                   send_buf,
-                   ccl::native_type_info<typename kernel_params::native_type>::dtype,
-                   device_stream),
+            : base(sched, comm, in_ctx, send_buf, params, device_stream),
               // left_wrote_to_me_flag
               income_data_flag(this->template alloc_memory_wrap(
-                  typename kernel_main_typed::income_data_flag_arg{},
+                  typename ring::allgatherv::income_data_flag_arg{},
                   parent_communicator,
                   1,
                   get_ctx())),
               // ready_to_recv_flag_arg
               ready_to_recv_flag(this->template alloc_memory_wrap(
-                  typename kernel_main_typed::ready_to_recv_flag_arg{},
+                  typename ring::allgatherv::ready_to_recv_flag_arg{},
                   parent_communicator,
                   1,
                   get_ctx())),
               recv_counts_buf(parent_communicator->get_device()
                                   .template alloc_memory<recv_counts_typed_entry_type>(
-                                      send_count,
+                                      comm_addr.size,
                                       sizeof(recv_counts_typed_entry_type),
                                       get_ctx())),
 
@@ -126,8 +120,8 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<kernel_params,
 
         int next_rank = (comm_addr.rank + 1) % comm_addr.size;
         kernel_router = base::template create_kernel_router_for_rank<
-            l0_allgatherv_typed_entry<kernel_params, gpu_comm_impl, topology>>(
-            *this, next_rank, available_devices);
+            l0_allgatherv_typed_entry<gpu_comm_impl, topology>>(
+            *this, next_rank, available_devices, base::get_params());
 
         ENTRY_LOG_DEBUG("Init phase of current entry for ext_rank:", next_rank);
 
@@ -148,15 +142,15 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<kernel_params,
 
         auto& main_entry_function = get_local_kernel();
 
-        auto recv_buf_ptr =
-            reinterpret_cast<typename kernel_params::native_type*>(recv_buf_entry.get_ptr());
+        auto recv_buf_ptr = reinterpret_cast<processing_type*>(recv_buf_entry.get_ptr());
+
         //create implementation specified primitives
         main_entry_function
-            .template set_args<typename kernel_main_typed::income_data_flag_arg,
-                               typename kernel_main_typed::ready_to_recv_flag_arg,
-                               typename kernel_main_typed::recv_buf_arg,
-                               typename kernel_main_typed::recv_elem_counts_buf_arg,
-                               typename kernel_main_typed::recv_elem_offsets_buf_arg,
+            .template set_args<typename ring::allgatherv::income_data_flag_arg,
+                               typename ring::allgatherv::ready_to_recv_flag_arg,
+                               typename ring::allgatherv::recv_buf_arg<processing_type>,
+                               typename ring::allgatherv::recv_elem_counts_buf_arg,
+                               typename ring::allgatherv::recv_elem_offsets_buf_arg,
                                typename kernel_main_typed::common_entry_buf_size_arg>(
                 income_data_flag.get(),
                 ready_to_recv_flag.get(),
@@ -181,9 +175,11 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<kernel_params,
     std::vector<ccl_device::device_ipc_memory_handle> get_ipc_data() override {
         ccl_device& owned_device = parent_communicator->get_device();
 
-        //TODO
+        auto recv_buf_ptr = reinterpret_cast<processing_type*>(recv_buf_entry.get_ptr());
+
         std::vector<ccl_device::device_ipc_memory_handle> ret;
         ret.reserve(3);
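+        // the IPC exchange now covers the receive buffer in addition to the
+        // two synchronization flags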
+        ret.push_back(owned_device.create_ipc_memory_handle(recv_buf_ptr, get_ctx()));
         ret.push_back(owned_device.create_ipc_memory_handle(income_data_flag.get(), get_ctx()));
         ret.push_back(owned_device.create_ipc_memory_handle(ready_to_recv_flag.get(), get_ctx()));
         return ret;
@@ -204,110 +200,64 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<kernel_params,
     std::shared_ptr<ccl_context> ctx;
 
 public:
-    bool execute(kernel_main_typed& main_entry_function, kernel_main_typed& right_kernel) {
-        //Check argument binding in kernels for next rank
+    template <class left_kernel_t, class right_kernel_t>
+    bool execute(left_kernel_t& left_kernel, right_kernel_t& right_kernel) {
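+        // templated on both kernel types so the in-process and IPC variants of
+        // the right kernel share a single argument-binding path (this replaces
+        // the two separate execute() overloads)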
         bool is_right_kernel_ready =
-            right_kernel.template test_args<typename kernel_main_typed::recv_buf_arg,
-                                            typename kernel_main_typed::income_data_flag_arg,
-                                            typename kernel_main_typed::ready_to_recv_flag_arg>();
-        if (is_right_kernel_ready) {
-            //TODO do not get arguments sequencially - use array version instead
-            typename kernel_main_typed::recv_buf_arg::return_t right_output_buf =
-                right_kernel.template get_arg<typename kernel_main_typed::recv_buf_arg>();
-            typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg =
-                right_kernel.template get_arg<typename kernel_main_typed::income_data_flag_arg>();
-            typename kernel_main_typed::ready_to_recv_flag_arg::return_t
-                right_ready_to_recv_flag_arg =
-                    right_kernel
-                        .template get_arg<typename kernel_main_typed::ready_to_recv_flag_arg>();
-
-            ENTRY_LOG_DEBUG("Bind final arguments for kernel: ", kernel_main_typed::name());
-            ENTRY_LOG_TRACE("Args: \n{ ",
-                            right_output_buf.first,
-                            ", ",
-                            right_output_buf.second,
-                            "}\n",
-                            "{ ",
-                            right_income_data_flag_arg.first,
-                            ", ",
-                            right_income_data_flag_arg.second,
-                            "}\n",
-                            "{ ",
-                            right_ready_to_recv_flag_arg.first,
-                            ", ",
-                            right_ready_to_recv_flag_arg.second,
-                            "}\n");
-
-            //TODO register argument for current device kernel: use array-version
-            main_entry_function
-                .template set_args<typename kernel_main_typed::right_output_buf_arg,
-                                   typename kernel_main_typed::right_income_data_flag_arg,
-                                   typename kernel_main_typed::right_ready_to_recv_flag_arg>(
-                    right_output_buf.second,
-                    right_income_data_flag_arg.second,
-                    right_ready_to_recv_flag_arg.second);
+            right_kernel
+                .template test_args<typename ring::allgatherv::recv_buf_arg<processing_type>,
+                                    typename ring::allgatherv::income_data_flag_arg,
+                                    typename ring::allgatherv::ready_to_recv_flag_arg>();
+
+        // Once we're sure that the parameters are ready, read them from the right kernel.
+        // Note: we not only read the parameters but also reset their 'ready' flag
+        // (since we're using a destructive-copying policy), meaning that they must be
+        // stored in order to be read again.
+        // This protects against the case of multiple kernel launches
+        // (i.e. the collective is run multiple times), where we might otherwise read
+        // stale values left over from the previous run.
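+        //
+        // Illustrative sketch of the destructive copy (hypothetical arg type,
+        // not part of this entry):
+        //   auto v = right_kernel.template get_arg<some_flag_arg>(); // consumes 'ready'
+        //   // a second get_arg<some_flag_arg>() here would report 'not ready'
+        //   // until the producer stores the argument again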
 
-            ENTRY_LOG_DEBUG("Final Function: ", main_entry_function.to_string());
-        }
-        return is_right_kernel_ready;
-    }
-
-    bool execute(kernel_main_typed& main_entry_function, kernel_ipc_typed& right_kernel) {
-        //Check argument binding in kernels for next rank
-        bool is_right_kernel_ready =
-            right_kernel.template test_args< //typename kernel_ipc_typed::right_output_buf_arg,
-                typename kernel_ipc_typed::income_data_flag_arg,
-                typename kernel_ipc_typed::ready_to_recv_flag_arg>();
         if (is_right_kernel_ready) {
-            //TODO do not get arguments sequencially - use array version instead
-            typename kernel_main_typed::right_output_buf_arg::return_t right_output_buf_arg =
-                right_kernel.template get_arg<typename kernel_ipc_typed::right_output_buf_arg>();
-            typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg =
-                right_kernel.template get_arg<typename kernel_ipc_typed::income_data_flag_arg>();
-            typename kernel_main_typed::ready_to_recv_flag_arg::return_t
-                right_ready_to_recv_flag_arg =
-                    right_kernel
-                        .template get_arg<typename kernel_ipc_typed::ready_to_recv_flag_arg>();
-
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ", bind elapsed arguments for kernel: ",
-                      kernel_main_typed::name());
-            LOG_TRACE("Args: \n{ ",
-                      right_output_buf_arg.first,
-                      ", ",
-                      right_output_buf_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_income_data_flag_arg.first,
-                      ", ",
-                      right_income_data_flag_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_ready_to_recv_flag_arg.first,
-                      ", ",
-                      right_ready_to_recv_flag_arg.second,
-                      "}\n");
-
-            //TODO register argument for current device kernel: user array version
-            main_entry_function
-                .template set_args< //typename kernel_main_typed::right_output_buf_arg,
-                    typename kernel_main_typed::right_income_data_flag_arg,
-                    typename kernel_main_typed::right_ready_to_recv_flag_arg>(
-                    //right_output_buf_arg.second,
+            auto right_recv_buf_arg =
+                right_kernel
+                    .template get_arg<typename ring::allgatherv::recv_buf_arg<processing_type>>();
+            auto right_income_data_flag_arg =
+                right_kernel.template get_arg<typename ring::allgatherv::income_data_flag_arg>();
+            auto right_ready_to_recv_flag_arg =
+                right_kernel.template get_arg<typename ring::allgatherv::ready_to_recv_flag_arg>();
+
+            // ENTRY_LOG_DEBUG("Bind right arguments from ",
+            //                 right_kernel_t::name(),
+            //                 " kernel",
+            //                 " to ",
+            //                 left_kernel_t::name(),
+            //                 " kernel. "
+            //                 "Right arguments:\n{ ",
+            //                 right_recv_buf_arg.first,
+            //                 ", ",
+            //                 right_recv_buf_arg.second,
+            //                 "}\n",
+            //                 "{ ",
+            //                 right_income_data_flag_arg.first,
+            //                 ", ",
+            //                 right_income_data_flag_arg.second,
+            //                 "}\n",
+            //                 "{ ",
+            //                 right_ready_to_recv_flag_arg.first,
+            //                 ", ",
+            //                 right_ready_to_recv_flag_arg.second,
+            //                 "}\n");
+
+            left_kernel
+                .template set_args<typename ring::allgatherv::right_output_buf_arg<processing_type>,
+                                   typename ring::allgatherv::right_income_data_flag_arg,
+                                   typename ring::allgatherv::right_ready_to_recv_flag_arg>(
+                    right_recv_buf_arg.second,
                     right_income_data_flag_arg.second,
                     right_ready_to_recv_flag_arg.second);
-            LOG_TRACE("Set right_output_buf_arg",
-                      "Set right_income_data_flag_arg",
-                      "Set right_ready_to_recv_flag_arg");
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ". Function: ",
-                      main_entry_function.to_string());
+
+            ENTRY_LOG_DEBUG("Binding arguments between kernels is complete. ",
+                            "Arguments of the left kernel after binding:\n",
+                            left_kernel.to_string());
         }
         return is_right_kernel_ready;
     }
diff --git a/src/sched/entry/l0/l0_allreduce_typed_entry.hpp b/src/sched/entry/l0/l0_allreduce_typed_entry.hpp
index e03184354..f0a1395d2 100644
--- a/src/sched/entry/l0/l0_allreduce_typed_entry.hpp
+++ b/src/sched/entry/l0/l0_allreduce_typed_entry.hpp
@@ -19,12 +19,12 @@
 #include <atomic>
 
 #include "sched/entry/l0/l0_entry.hpp"
-#include "common/comm/l0/context/scaling_ctx/ipc_ctx_impl.hpp"
+#include "common/comm/l0/context/scale/ipc/ipc_ctx_impl.hpp"
+#include "kernels/shared.h"
 
 namespace native {
-template <class kernel_params, class gpu_comm_impl, ccl::group_split_type topology>
-class l0_allreduce_typed_entry : public base_gpu_entry<kernel_params,
-                                                       gpu_comm_impl,
+template <class gpu_comm_impl, ccl::group_split_type topology>
+class l0_allreduce_typed_entry : public base_gpu_entry<gpu_comm_impl,
                                                        topology,
                                                        ccl::device_topology_type::ring,
                                                        ccl_coll_allreduce> {
@@ -32,8 +32,7 @@ class l0_allreduce_typed_entry : public base_gpu_entry<kernel_params,
     friend class ccl_gpu_comm;
     friend class ccl_virtual_gpu_comm;
 
-    using base = base_gpu_entry<kernel_params,
-                                gpu_comm_impl,
+    using base = base_gpu_entry<gpu_comm_impl,
                                 topology,
                                 ccl::device_topology_type::ring,
                                 ccl_coll_allreduce>;
@@ -46,16 +45,14 @@ class l0_allreduce_typed_entry : public base_gpu_entry<kernel_params,
     using base::get_ctx;
     using base::alloc_memory_wrap;
     using base::get_local_kernel;
-    using kernel_main_typed = ring_allreduce_kernel<kernel_params>;
-    using kernel_ipc_typed = ring_allreduce_ipc<kernel_params>;
-    using kernel_main_numa_typed = ring_allreduce_numa_kernel<kernel_params>;
+    using kernel_main_typed = ring::allreduce::main_kernel;
 
     using income_data_flag_gpu_type =
-        typename std::remove_pointer<typename kernel_main_typed::income_data_flag_arg_type>::type;
+        typename std::remove_pointer<typename ring::allreduce::income_data_flag_arg_type>::type;
     using ready_to_recv_flag_gpu_type =
-        typename std::remove_pointer<typename kernel_main_typed::ready_to_recv_flag_arg_type>::type;
+        typename std::remove_pointer<typename ring::allreduce::ready_to_recv_flag_arg_type>::type;
     using local_barrier_flag_gpu_type =
-        typename std::remove_pointer<typename kernel_main_typed::local_barrier_flag_arg_type>::type;
+        typename std::remove_pointer<typename ring::allreduce::local_barrier_flag_arg_type>::type;
 
     static constexpr const char* class_name() noexcept {
         return "L0_ALLREDUCE_TYPED";
@@ -74,27 +71,24 @@ class l0_allreduce_typed_entry : public base_gpu_entry<kernel_params,
         const ccl_buffer send_buf,
         ccl_buffer recv_buf,
         size_t cnt,
-        ccl::reduction op,
+        const coll_param_gpu& params,
         std::shared_ptr<ccl_stream> device_stream = std::shared_ptr<ccl_stream>())
-            : base(sched,
-                   comm,
-                   in_ctx,
-                   send_buf,
-                   ccl::native_type_info<typename kernel_params::native_type>::dtype,
-                   device_stream),
-
-              temp_buffer(
-                  this->template alloc_memory_wrap(typename kernel_main_typed::tmp_recv_buf_arg{},
-                                                   parent_communicator,
-                                                   cnt,
-                                                   get_ctx())),
-              income_data_flag(this->template alloc_memory_wrap(
-                  typename kernel_main_typed::income_data_flag_arg{},
+            : base(sched, comm, in_ctx, send_buf, params, device_stream),
+
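+              // the tmp buffer is raw bytes (int8_t) because the datatype is now a
+              // runtime parameter; its size is derived from the datatype size, the
+              // element count and the communicator size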
+              temp_buffer(this->template alloc_memory_wrap(
+                  typename ring::allreduce::tmp_recv_buf_arg<int8_t>{},
                   parent_communicator,
-                  1,
+                  ring_allreduce_get_tmp_buffer_size(
+                      ccl::get_datatype_size(params.get_datatype()) * cnt,
+                      base::comm_addr.size),
                   get_ctx())),
+              income_data_flag(
+                  this->template alloc_memory_wrap(typename ring::allreduce::income_data_flag_arg{},
+                                                   parent_communicator,
+                                                   1,
+                                                   get_ctx())),
               ready_to_recv_flag(this->template alloc_memory_wrap(
-                  typename kernel_main_typed::ready_to_recv_flag_arg{},
+                  typename ring::allreduce::ready_to_recv_flag_arg{},
                   parent_communicator,
                   1,
                   get_ctx())),
@@ -104,13 +98,12 @@ class l0_allreduce_typed_entry : public base_gpu_entry<kernel_params,
                                          sizeof(local_barrier_flag_gpu_type),
                                          get_ctx())) {
         recv_buf_typed_entry = recv_buf;
-        op_typed_entry = op;
         cnt_entry = cnt;
 
         int next_rank = (comm_addr.rank + 1) % comm_addr.size;
         kernel_router = base::template create_kernel_router_for_rank<
-            l0_allreduce_typed_entry<kernel_params, gpu_comm_impl, topology>>(
-            *this, next_rank, available_devices);
+            l0_allreduce_typed_entry<gpu_comm_impl, topology>>(
+            *this, next_rank, available_devices, base::get_params());
 
         ENTRY_LOG_DEBUG("Init phase of current entry for ext_rank:", next_rank);
 
@@ -131,16 +124,15 @@ class l0_allreduce_typed_entry : public base_gpu_entry<kernel_params,
         auto& main_entry_function = get_local_kernel();
 
         // TODO: try to remove indirect buffer
-        auto recv_buf_ptr =
-            reinterpret_cast<typename kernel_params::native_type*>(recv_buf_typed_entry.get_ptr());
+        void* recv_buf_ptr = recv_buf_typed_entry.get_ptr();
 
         //create implementation specified primitives
         main_entry_function
-            .template set_args<typename kernel_main_typed::tmp_recv_buf_arg,
-                               typename kernel_main_typed::income_data_flag_arg,
-                               typename kernel_main_typed::ready_to_recv_flag_arg,
-                               typename kernel_main_typed::local_barrier_flag_arg,
-                               typename kernel_main_typed::recv_buf_arg,
+            .template set_args<typename ring::allreduce::tmp_recv_buf_arg<void>,
+                               typename ring::allreduce::income_data_flag_arg,
+                               typename ring::allreduce::ready_to_recv_flag_arg,
+                               typename ring::allreduce::local_barrier_flag_arg,
+                               typename ring::allreduce::recv_buf_arg<void>,
                                typename kernel_main_typed::common_entry_buf_size_arg>(
                 temp_buffer.get(),
                 income_data_flag.get(),
@@ -165,7 +157,6 @@ class l0_allreduce_typed_entry : public base_gpu_entry<kernel_params,
     std::vector<ccl_device::device_ipc_memory_handle> get_ipc_data() override {
         ccl_device& owned_device = parent_communicator->get_device();
 
-        //TODO
         std::vector<ccl_device::device_ipc_memory_handle> ret;
         ret.reserve(3);
         ret.push_back(owned_device.create_ipc_memory_handle(temp_buffer.get(), get_ctx()));
@@ -174,16 +165,17 @@ class l0_allreduce_typed_entry : public base_gpu_entry<kernel_params,
         return ret;
     }
 
-    observer::invoke_params<type(), kernel_params> get_numa_data() override {
+    observer::invoke_params<type()> get_numa_data() override {
         observer::producer_description in_params{
-            .world_rank = comm_addr.rank, //TODO unused
-            .world_size = comm_addr.size, //TODO unused
+            .rank = comm_addr.rank, //TODO unused
+            .comm_size = comm_addr.size, //TODO unused
             .staged_buffer_elem_count = cnt_entry,
             .context = get_ctx(),
             .device = parent_communicator->get_device(),
             .immediate_list = parent_communicator->get_device().create_immediate_cmd_list(get_ctx())
         };
-        return observer::invoke_params<type(), kernel_params>{ std::move(in_params) };
+        // TODO: Should get_params() be a part of in_params?
+        return observer::invoke_params<type()>(std::move(in_params), base::get_params());
     }
 
 protected:
@@ -192,97 +184,44 @@ class l0_allreduce_typed_entry : public base_gpu_entry<kernel_params,
     }
 
 private:
-    ccl_device::device_memory<typename kernel_params::native_type> temp_buffer;
+    ccl_device::device_memory<int8_t> temp_buffer;
     ccl_device::device_memory<income_data_flag_gpu_type> income_data_flag;
     ccl_device::device_memory<ready_to_recv_flag_gpu_type> ready_to_recv_flag;
     ccl_device::device_memory<local_barrier_flag_gpu_type> local_barrier_flag;
-    ccl::reduction op_typed_entry;
     ccl_buffer recv_buf_typed_entry;
     size_t cnt_entry;
 
 public:
-    template <class left_kernel_main_typed, class right_kernel_main_typed>
-    bool execute(left_kernel_main_typed& main_entry_function,
-                 right_kernel_main_typed& right_kernel) {
-        //Check argument binding in kernels for next rank
+    template <class left_kernel_t, class right_kernel_t>
+    bool execute(left_kernel_t& left_kernel, right_kernel_t& right_kernel) {
         bool is_right_kernel_ready =
-            right_kernel
-                .template test_args<typename right_kernel_main_typed::tmp_recv_buf_arg,
-                                    typename right_kernel_main_typed::income_data_flag_arg,
-                                    typename right_kernel_main_typed::ready_to_recv_flag_arg>();
-        if (is_right_kernel_ready) {
-            // Once we're sure that the parameters ready read them from the right kernel
-            // Note: we not only read the parameters but also reset their 'ready' flag
-            // (since we're using a dedicated policy)  meaning that they must be stored in order
-            // to be read again.
-            // This is a protection to a case of multiple kernel launches(i.e. the collective is ran multiple times)
-            // where we might read not up-to-date values from the previous run.
-            //TODO do not get arguments sequencially - use array version instead
-            typename right_kernel_main_typed::tmp_recv_buf_arg::return_t right_tmp_recv_buf_arg =
-                right_kernel.template get_arg<typename right_kernel_main_typed::tmp_recv_buf_arg>();
-            typename right_kernel_main_typed::income_data_flag_arg::return_t
-                right_income_data_flag_arg =
-                    right_kernel
-                        .template get_arg<typename right_kernel_main_typed::income_data_flag_arg>();
-            typename right_kernel_main_typed::ready_to_recv_flag_arg::return_t
-                right_ready_to_recv_flag_arg = right_kernel.template get_arg<
-                    typename right_kernel_main_typed::ready_to_recv_flag_arg>();
-
-            ENTRY_LOG_DEBUG("Bind final arguments for kernel: ", left_kernel_main_typed::name());
-            ENTRY_LOG_DEBUG("Args: \n{ ",
-                            right_tmp_recv_buf_arg.first,
-                            ", ",
-                            right_tmp_recv_buf_arg.second,
-                            "}\n",
-                            "{ ",
-                            right_income_data_flag_arg.first,
-                            ", ",
-                            right_income_data_flag_arg.second,
-                            "}\n",
-                            "{ ",
-                            right_ready_to_recv_flag_arg.first,
-                            ", ",
-                            right_ready_to_recv_flag_arg.second,
-                            "}\n");
-
-            //TODO register argument for current device kernel: use array-version
-            main_entry_function
-                .template set_args<typename left_kernel_main_typed::right_tmp_recv_buf_arg,
-                                   typename left_kernel_main_typed::right_income_data_flag_arg,
-                                   typename left_kernel_main_typed::right_ready_to_recv_flag_arg>(
-                    right_tmp_recv_buf_arg.second,
-                    right_income_data_flag_arg.second,
-                    right_ready_to_recv_flag_arg.second);
-            ENTRY_LOG_DEBUG("Final Function: ", main_entry_function.to_string());
-        }
-        return is_right_kernel_ready;
-    }
+            right_kernel.template test_args<typename ring::allreduce::tmp_recv_buf_arg<void>,
+                                            typename ring::allreduce::income_data_flag_arg,
+                                            typename ring::allreduce::ready_to_recv_flag_arg>();
+
+        // Once we're sure that the parameters are ready, read them from the right kernel.
+        // Note: we not only read the parameters but also reset their 'ready' flag
+        // (since we're using a destructive-copying policy), meaning that they must be
+        // stored in order to be read again.
+        // This protects against the case of multiple kernel launches
+        // (i.e. the collective is run multiple times), where we might otherwise read
+        // stale values left over from the previous run.
 
-    bool execute(kernel_main_typed& main_entry_function, kernel_main_typed& right_kernel) {
-        //Check argument binding in kernels for next rank
-        bool is_right_kernel_ready =
-            right_kernel.template test_args<typename kernel_main_typed::tmp_recv_buf_arg,
-                                            typename kernel_main_typed::income_data_flag_arg,
-                                            typename kernel_main_typed::ready_to_recv_flag_arg>();
         if (is_right_kernel_ready) {
-            // Once we're sure that the parameters ready read them from the right kernel
-            // Note: we not only read the parameters but also reset their 'ready' flag
-            // (since we're using a dedicated policy)  meaning that they must be stored in order
-            // to be read again.
-            // This is a protection to a case of multiple kernel launches(i.e. the collective is ran multiple times)
-            // where we might read not up-to-date values from the previous run.
-            //TODO do not get arguments sequencially - use array version instead
-            typename kernel_main_typed::tmp_recv_buf_arg::return_t right_tmp_recv_buf_arg =
-                right_kernel.template get_arg<typename kernel_main_typed::tmp_recv_buf_arg>();
-            typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg =
-                right_kernel.template get_arg<typename kernel_main_typed::income_data_flag_arg>();
-            typename kernel_main_typed::ready_to_recv_flag_arg::return_t
-                right_ready_to_recv_flag_arg =
-                    right_kernel
-                        .template get_arg<typename kernel_main_typed::ready_to_recv_flag_arg>();
-
-            ENTRY_LOG_DEBUG("Bind final arguments for kernel: ", kernel_main_typed::name());
-            ENTRY_LOG_DEBUG("Args: \n{ ",
+            auto right_tmp_recv_buf_arg =
+                right_kernel.template get_arg<typename ring::allreduce::tmp_recv_buf_arg<void>>();
+            auto right_income_data_flag_arg =
+                right_kernel.template get_arg<typename ring::allreduce::income_data_flag_arg>();
+            auto right_ready_to_recv_flag_arg =
+                right_kernel.template get_arg<typename ring::allreduce::ready_to_recv_flag_arg>();
+
+            /*ENTRY_LOG_DEBUG("Bind right arguments from ",
+                            right_kernel_t::name(),
+                            " kernel",
+                            " to ",
+                            left_kernel_t::name(),
+                            " kernel. "
+                            "Right arguments:\n{ ",
                             right_tmp_recv_buf_arg.first,
                             ", ",
                             right_tmp_recv_buf_arg.second,
@@ -296,80 +235,18 @@ class l0_allreduce_typed_entry : public base_gpu_entry<kernel_params,
                             right_ready_to_recv_flag_arg.first,
                             ", ",
                             right_ready_to_recv_flag_arg.second,
-                            "}\n");
-
-            //TODO register argument for current device kernel: use array-version
-            main_entry_function
-                .template set_args<typename kernel_main_typed::right_tmp_recv_buf_arg,
-                                   typename kernel_main_typed::right_income_data_flag_arg,
-                                   typename kernel_main_typed::right_ready_to_recv_flag_arg>(
-                    right_tmp_recv_buf_arg.second,
-                    right_income_data_flag_arg.second,
-                    right_ready_to_recv_flag_arg.second);
-            ENTRY_LOG_DEBUG("Final Function: ", main_entry_function.to_string());
-        }
-        return is_right_kernel_ready;
-    }
-
-    bool execute(kernel_main_typed& main_entry_function, kernel_ipc_typed& right_kernel) {
-        //Check argument binding in kernels for next rank
-        bool is_right_kernel_ready =
-            right_kernel.template test_args<typename kernel_ipc_typed::tmp_recv_buf_arg,
-                                            typename kernel_ipc_typed::income_data_flag_arg,
-                                            typename kernel_ipc_typed::ready_to_recv_flag_arg>();
-        if (is_right_kernel_ready) {
-            //TODO do not get arguments sequencially - use array version instead
-            typename kernel_main_typed::tmp_recv_buf_arg::return_t right_tmp_recv_buf_arg =
-                right_kernel.template get_arg<typename kernel_ipc_typed::tmp_recv_buf_arg>();
-            typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg =
-                right_kernel.template get_arg<typename kernel_ipc_typed::income_data_flag_arg>();
-            typename kernel_main_typed::ready_to_recv_flag_arg::return_t
-                right_ready_to_recv_flag_arg =
-                    right_kernel
-                        .template get_arg<typename kernel_ipc_typed::ready_to_recv_flag_arg>();
-
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ", bind elapsed arguments for kernel: ",
-                      kernel_main_typed::name());
-            LOG_TRACE("Args: \n{ ",
-                      right_tmp_recv_buf_arg.first,
-                      ", ",
-                      right_tmp_recv_buf_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_income_data_flag_arg.first,
-                      ", ",
-                      right_income_data_flag_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_ready_to_recv_flag_arg.first,
-                      ", ",
-                      right_ready_to_recv_flag_arg.second,
-                      "}\n");
-
-            //TODO register argument for current device kernel: user array version
-
-            main_entry_function
-                .template set_args<typename kernel_main_typed::right_tmp_recv_buf_arg,
-                                   typename kernel_main_typed::right_income_data_flag_arg,
-                                   typename kernel_main_typed::right_ready_to_recv_flag_arg>(
-                    right_tmp_recv_buf_arg.second,
-                    right_income_data_flag_arg.second,
-                    right_ready_to_recv_flag_arg.second);
-
-            LOG_TRACE("Set right_tmp_recv_buf_arg",
-                      "Set right_income_data_flag_arg",
-                      "Set right_ready_to_recv_flag_arg");
-
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ". Function: ",
-                      main_entry_function.to_string());
+                            "}\n");*/
+
+            left_kernel.template set_args<typename ring::allreduce::right_tmp_recv_buf_arg<void>,
+                                          typename ring::allreduce::right_income_data_flag_arg,
+                                          typename ring::allreduce::right_ready_to_recv_flag_arg>(
+                right_tmp_recv_buf_arg.second,
+                right_income_data_flag_arg.second,
+                right_ready_to_recv_flag_arg.second);
+
+            ENTRY_LOG_DEBUG("Binding arguments between kernels is complete. ",
+                            "Arguments of the left kernel after binding:\n",
+                            left_kernel.to_string());
         }
         return is_right_kernel_ready;
     }
diff --git a/src/sched/entry/l0/l0_alltoallv_typed_entry.hpp b/src/sched/entry/l0/l0_alltoallv_typed_entry.hpp
index 0b940cbf5..8970e06a8 100644
--- a/src/sched/entry/l0/l0_alltoallv_typed_entry.hpp
+++ b/src/sched/entry/l0/l0_alltoallv_typed_entry.hpp
@@ -22,9 +22,8 @@
 //TODO L0 Workaround
 
 namespace native {
-template <class kernel_params, class gpu_comm_impl, ccl::group_split_type topology>
-class l0_alltoallv_typed_entry : public base_gpu_entry<kernel_params,
-                                                       gpu_comm_impl,
+template <class gpu_comm_impl, ccl::group_split_type topology>
+class l0_alltoallv_typed_entry : public base_gpu_entry<gpu_comm_impl,
                                                        topology,
                                                        ccl::device_topology_type::ring,
                                                        ccl_coll_alltoallv> {
@@ -32,8 +31,7 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<kernel_params,
     friend class ccl_gpu_comm;
     friend class ccl_virtual_gpu_comm;
 
-    using base = base_gpu_entry<kernel_params,
-                                gpu_comm_impl,
+    using base = base_gpu_entry<gpu_comm_impl,
                                 topology,
                                 ccl::device_topology_type::ring,
                                 ccl_coll_alltoallv>;
@@ -45,26 +43,25 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<kernel_params,
     using base::kernel_router;
     using base::get_ctx;
     using base::get_local_kernel;
-    using kernel_main_typed = ring_alltoallv_kernel<kernel_params>;
-    using kernel_ipc_typed = ring_alltoallv_ipc<kernel_params>;
+    using kernel_main_typed = ring::alltoallv::main_kernel;
 
     using income_data_flag_gpu_type =
-        typename std::remove_pointer<typename kernel_main_typed::income_data_flag_arg_type>::type;
+        typename std::remove_pointer<typename ring::alltoallv::income_data_flag_arg_type>::type;
     using ready_to_recv_flag_gpu_type =
-        typename std::remove_pointer<typename kernel_main_typed::ready_to_recv_flag_arg_type>::type;
+        typename std::remove_pointer<typename ring::alltoallv::ready_to_recv_flag_arg_type>::type;
 
-    using recv_counts_typed_entry_type = typename std::remove_pointer<
-        typename kernel_main_typed::recv_elem_counts_buf_arg_type>::type;
+    using recv_counts_typed_entry_type =
+        typename std::remove_pointer<typename ring::alltoallv::recv_elem_counts_buf_arg_type>::type;
     using recv_offsets_typed_entry_type = typename std::remove_pointer<
-        typename kernel_main_typed::recv_elem_offsets_buf_arg_type>::type;
+        typename ring::alltoallv::recv_elem_offsets_buf_arg_type>::type;
 
     using proxy_size_flag_gpu_type =
-        typename std::remove_pointer<typename kernel_main_typed::proxy_size_flag_arg_type>::type;
+        typename std::remove_pointer<typename ring::alltoallv::proxy_size_flag_arg_type>::type;
 
     using send_counts_typed_entry_type =
-        typename std::remove_pointer<typename kernel_main_typed::send_buf_size_arg_type>::type;
+        typename std::remove_pointer<typename ring::alltoallv::send_buf_size_arg_type>::type;
     using send_offsets_typed_entry_type = typename std::remove_pointer<
-        typename kernel_main_typed::send_elem_offsets_buf_arg_type>::type;
+        typename ring::alltoallv::send_elem_offsets_buf_arg_type>::type;
 
     static constexpr const char* class_name() noexcept {
         return "L0_ALLTOALLV_TYPED";
@@ -82,40 +79,38 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<kernel_params,
         ccl_driver_context_ptr in_ctx,
         const ccl_buffer send_buf,
         const size_t* send_counts,
+        size_t total_send_counts,
         ccl_buffer recv_buf,
         const size_t* recv_counts,
+        size_t total_recv_counts,
+        const coll_param_gpu& params,
         std::shared_ptr<ccl_stream> device_stream = std::shared_ptr<ccl_stream>())
-            : base(sched,
-                   comm,
-                   in_ctx,
-                   send_buf,
-                   ccl::native_type_info<typename kernel_params::native_type>::dtype,
-                   device_stream),
-              temp_buffer(
-                  this->template alloc_memory_wrap(typename kernel_main_typed::tmp_recv_buf_arg{},
-                                                   parent_communicator,
-                                                   512,
-                                                   get_ctx())),
-              // left_wrote_to_me_flag
-              income_data_flag(this->template alloc_memory_wrap(
-                  typename kernel_main_typed::income_data_flag_arg{},
+            : base(sched, comm, in_ctx, send_buf, params, device_stream),
+              temp_buffer(this->template alloc_memory_wrap(
+                  typename ring::alltoallv::tmp_recv_buf_arg<uint8_t>{},
                   parent_communicator,
-                  1,
+                  total_recv_counts,
                   get_ctx())),
+              // left_wrote_to_me_flag
+              income_data_flag(
+                  this->template alloc_memory_wrap(typename ring::alltoallv::income_data_flag_arg{},
+                                                   parent_communicator,
+                                                   1,
+                                                   get_ctx())),
               // ready_to_recv_flag_arg
               ready_to_recv_flag(this->template alloc_memory_wrap(
-                  typename kernel_main_typed::ready_to_recv_flag_arg{},
-                  parent_communicator,
-                  1,
-                  get_ctx())),
-              proxy_size_flag_entry(this->template alloc_memory_wrap(
-                  typename kernel_main_typed::proxy_size_flag_arg{},
+                  typename ring::alltoallv::ready_to_recv_flag_arg{},
                   parent_communicator,
                   1,
                   get_ctx())),
+              proxy_size_flag_entry(
+                  this->template alloc_memory_wrap(typename ring::alltoallv::proxy_size_flag_arg{},
+                                                   parent_communicator,
+                                                   1,
+                                                   get_ctx())),
               recv_counts_buf(parent_communicator->get_device()
                                   .template alloc_memory<recv_counts_typed_entry_type>(
-                                      512,
+                                      total_recv_counts,
                                       sizeof(recv_counts_typed_entry_type),
                                       get_ctx())),
               recv_offsets_buf(parent_communicator->get_device()
@@ -125,16 +120,14 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<kernel_params,
                                        get_ctx())),
               send_counts_buf(parent_communicator->get_device()
                                   .template alloc_memory<recv_counts_typed_entry_type>(
-                                      512,
+                                      total_send_counts,
                                       sizeof(recv_counts_typed_entry_type),
                                       get_ctx())),
               send_offsets_buf(parent_communicator->get_device()
                                    .template alloc_memory<send_offsets_typed_entry_type>(
                                        comm_addr.size,
                                        sizeof(send_offsets_typed_entry_type),
-                                       get_ctx()))
-
-    {
+                                       get_ctx())) {
         // copy recv_buf into recv_buf_entry
         recv_buf_entry = recv_buf;
 
@@ -165,8 +158,8 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<kernel_params,
 
         int next_rank = (comm_addr.rank + 1) % comm_addr.size;
         kernel_router = base::template create_kernel_router_for_rank<
-            l0_alltoallv_typed_entry<kernel_params, gpu_comm_impl, topology>>(
-            *this, next_rank, available_devices);
+            l0_alltoallv_typed_entry<gpu_comm_impl, topology>>(
+            *this, next_rank, available_devices, base::get_params());
 
         ENTRY_LOG_DEBUG("Init phase of current entry for ext_rank:", next_rank);
 
@@ -188,18 +181,17 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<kernel_params,
 
         auto& main_entry_function = get_local_kernel();
 
-        auto recv_buf_ptr =
-            reinterpret_cast<typename kernel_params::native_type*>(recv_buf_entry.get_ptr());
-        // auto send_counts_ptr = reinterpret_cast<size_t*>(send_counts_entry.get_ptr());
+        void* recv_buf_ptr = recv_buf_entry.get_ptr();
+
         //create implementation specified primitives
-        main_entry_function.template set_args<typename kernel_main_typed::tmp_recv_buf_arg,
-                                              typename kernel_main_typed::income_data_flag_arg,
-                                              typename kernel_main_typed::ready_to_recv_flag_arg,
-                                              typename kernel_main_typed::recv_buf_arg,
-                                              typename kernel_main_typed::recv_elem_counts_buf_arg,
-                                              typename kernel_main_typed::recv_elem_offsets_buf_arg,
-                                              typename kernel_main_typed::proxy_size_flag_arg,
-                                              typename kernel_main_typed::send_buf_size_arg>(
+        main_entry_function.template set_args<typename ring::alltoallv::tmp_recv_buf_arg<void>,
+                                              typename ring::alltoallv::income_data_flag_arg,
+                                              typename ring::alltoallv::ready_to_recv_flag_arg,
+                                              typename ring::alltoallv::recv_buf_arg<void>,
+                                              typename ring::alltoallv::recv_elem_counts_buf_arg,
+                                              typename ring::alltoallv::recv_elem_offsets_buf_arg,
+                                              typename ring::alltoallv::proxy_size_flag_arg,
+                                              typename ring::alltoallv::send_buf_size_arg>(
             temp_buffer.get(),
             income_data_flag.get(),
             ready_to_recv_flag.get(),
@@ -225,11 +217,14 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<kernel_params,
     std::vector<ccl_device::device_ipc_memory_handle> get_ipc_data() override {
         ccl_device& owned_device = parent_communicator->get_device();
 
-        //TODO
         std::vector<ccl_device::device_ipc_memory_handle> ret;
-        ret.reserve(3);
+        ret.reserve(4);
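+        // four handles are exchanged: the temp (staging) buffer, both
+        // synchronization flags and the proxy size flag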
+        ret.push_back(owned_device.create_ipc_memory_handle(temp_buffer.get(), get_ctx()));
         ret.push_back(owned_device.create_ipc_memory_handle(income_data_flag.get(), get_ctx()));
         ret.push_back(owned_device.create_ipc_memory_handle(ready_to_recv_flag.get(), get_ctx()));
+        ret.push_back(
+            owned_device.create_ipc_memory_handle(proxy_size_flag_entry.get(), get_ctx()));
+
         return ret;
     }
 
@@ -239,7 +234,7 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<kernel_params,
     }
 
 private:
-    ccl_device::device_memory<typename kernel_params::native_type> temp_buffer;
+    ccl_device::device_memory<uint8_t> temp_buffer;
     ccl_device::device_memory<income_data_flag_gpu_type> income_data_flag;
     ccl_device::device_memory<ready_to_recv_flag_gpu_type> ready_to_recv_flag;
     ccl_device::device_memory<proxy_size_flag_gpu_type> proxy_size_flag_entry;
@@ -251,119 +246,72 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<kernel_params,
     std::shared_ptr<ccl_context> ctx;
 
 public:
-    bool execute(kernel_main_typed& main_entry_function, kernel_main_typed& right_kernel) {
-        //Check argument binding in kernels for next rank
+    template <class left_kernel_t, class right_kernel_t>
+    bool execute(left_kernel_t& left_kernel, right_kernel_t& right_kernel) {
         bool is_right_kernel_ready =
-            right_kernel.template test_args<typename kernel_main_typed::tmp_recv_buf_arg,
-                                            typename kernel_main_typed::income_data_flag_arg,
-                                            typename kernel_main_typed::ready_to_recv_flag_arg,
-                                            typename kernel_main_typed::proxy_size_flag_arg>();
-        if (is_right_kernel_ready) {
-            //TODO do not get arguments sequencially - use array version instead
-            typename kernel_main_typed::tmp_recv_buf_arg::return_t right_tmp_recv_buf_arg =
-                right_kernel.template get_arg<typename kernel_main_typed::tmp_recv_buf_arg>();
-            typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg =
-                right_kernel.template get_arg<typename kernel_main_typed::income_data_flag_arg>();
-            typename kernel_main_typed::ready_to_recv_flag_arg::return_t
-                right_ready_to_recv_flag_arg =
-                    right_kernel
-                        .template get_arg<typename kernel_main_typed::ready_to_recv_flag_arg>();
-
-            typename kernel_main_typed::proxy_size_flag_arg::return_t right_proxy_size_flag_arg =
-                right_kernel.template get_arg<typename kernel_main_typed::proxy_size_flag_arg>();
-
-            ENTRY_LOG_DEBUG("Bind final arguments for kernel: ", kernel_main_typed::name());
-            ENTRY_LOG_TRACE("Args: \n{ ",
-                            right_tmp_recv_buf_arg.first,
-                            ", ",
-                            right_tmp_recv_buf_arg.second,
-                            "}\n",
-                            "{ ",
-                            right_income_data_flag_arg.first,
-                            ", ",
-                            right_income_data_flag_arg.second,
-                            "}\n",
-                            "{ ",
-                            right_ready_to_recv_flag_arg.first,
-                            ", ",
-                            right_ready_to_recv_flag_arg.second,
-                            "}\n");
-
-            //TODO register argument for current device kernel: use array-version
-            main_entry_function
-                .template set_args<typename kernel_main_typed::right_tmp_recv_buf_arg,
-                                   typename kernel_main_typed::right_income_data_flag_arg,
-                                   typename kernel_main_typed::right_ready_to_recv_flag_arg,
-                                   typename kernel_main_typed::right_proxy_size_flag_arg>(
-                    right_tmp_recv_buf_arg.second,
-                    right_income_data_flag_arg.second,
-                    right_ready_to_recv_flag_arg.second,
-                    right_proxy_size_flag_arg.second);
-            ENTRY_LOG_DEBUG("Final Function: ", main_entry_function.to_string());
-        }
-        return is_right_kernel_ready;
-    }
-
-    bool execute(kernel_main_typed& main_entry_function, kernel_ipc_typed& right_kernel) {
-        bool is_right_kernel_ready = false;
-        /* TODO UNSUPPORTED
+            right_kernel.template test_args<typename ring::alltoallv::tmp_recv_buf_arg<void>,
+                                            typename ring::alltoallv::income_data_flag_arg,
+                                            typename ring::alltoallv::ready_to_recv_flag_arg,
+                                            typename ring::alltoallv::proxy_size_flag_arg>();
+
+        // Once we're sure that the parameters are ready, read them from the right kernel.
+        // Note: we not only read the parameters but also reset their 'ready' flag
+        // (since we're using a destructive-copying policy), meaning that they must be
+        // stored in order to be read again.
+        // This protects against the case of multiple kernel launches
+        // (i.e. the collective is run multiple times), where we might otherwise read
+        // stale values left over from the previous run.
 
-        //Check argument binding in kernels for next rank
-        bool is_right_kernel_ready =
-            right_kernel.template test_args< //typename kernel_ipc_typed::right_output_buf_arg,
-                typename kernel_ipc_typed::income_data_flag_arg,
-                typename kernel_ipc_typed::ready_to_recv_flag_arg>();
         if (is_right_kernel_ready) {
-            //TODO do not get arguments sequencially - use array version instead
-            typename kernel_main_typed::right_output_buf_arg::return_t right_output_buf_arg =
-                right_kernel.template get_arg<typename kernel_ipc_typed::right_output_buf_arg>();
-            typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg =
-                right_kernel.template get_arg<typename kernel_ipc_typed::income_data_flag_arg>();
-            typename kernel_main_typed::ready_to_recv_flag_arg::return_t
-                right_ready_to_recv_flag_arg =
-                    right_kernel
-                        .template get_arg<typename kernel_ipc_typed::ready_to_recv_flag_arg>();
-
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ", bind elapsed arguments for kernel: ",
-                      kernel_main_typed::name());
-            LOG_TRACE("Args: \n{ ",
-                      right_output_buf_arg.first,
-                      ", ",
-                      right_output_buf_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_income_data_flag_arg.first,
-                      ", ",
-                      right_income_data_flag_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_ready_to_recv_flag_arg.first,
-                      ", ",
-                      right_ready_to_recv_flag_arg.second,
-                      "}\n");
-
-            //TODO register argument for current device kernel: user array version
-            main_entry_function
-                .template set_args< //typename kernel_main_typed::right_output_buf_arg,
-                    typename kernel_main_typed::right_income_data_flag_arg,
-                    typename kernel_main_typed::right_ready_to_recv_flag_arg>(
-                    //right_output_buf_arg.second,
-                    right_income_data_flag_arg.second,
-                    right_ready_to_recv_flag_arg.second);
-            LOG_TRACE("Set right_output_buf_arg",
-                      "Set right_income_data_flag_arg",
-                      "Set right_ready_to_recv_flag_arg");
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ". Function: ",
-                      main_entry_function.to_string());
-        }*/
+            auto right_tmp_recv_buf_arg =
+                right_kernel.template get_arg<typename ring::alltoallv::tmp_recv_buf_arg<void>>();
+            auto right_income_data_flag_arg =
+                right_kernel.template get_arg<typename ring::alltoallv::income_data_flag_arg>();
+            auto right_ready_to_recv_flag_arg =
+                right_kernel.template get_arg<typename ring::alltoallv::ready_to_recv_flag_arg>();
+            auto right_proxy_size_flag_arg =
+                right_kernel.template get_arg<typename ring::alltoallv::proxy_size_flag_arg>();
+
+            // ENTRY_LOG_DEBUG("Bind right arguments from ",
+            //                 right_kernel_t::name(),
+            //                 " kernel",
+            //                 " to ",
+            //                 left_kernel_t::name(),
+            //                 " kernel. "
+            //                 "Right arguments:\n{ ",
+            //                 right_tmp_recv_buf_arg.first,
+            //                 ", ",
+            //                 right_tmp_recv_buf_arg.second,
+            //                 "}\n",
+            //                 "{ ",
+            //                 right_income_data_flag_arg.first,
+            //                 ", ",
+            //                 right_income_data_flag_arg.second,
+            //                 "}\n",
+            //                 "{ ",
+            //                 right_ready_to_recv_flag_arg.first,
+            //                 ", ",
+            //                 right_ready_to_recv_flag_arg.second,
+            //                 "}\n",
+            //                 "{ ",
+            //                 right_proxy_size_flag_arg.first,
+            //                 ", ",
+            //                 right_proxy_size_flag_arg.second,
+            //                 "}\n");
+
+            left_kernel.template set_args<typename ring::alltoallv::right_tmp_recv_buf_arg<void>,
+                                          typename ring::alltoallv::right_income_data_flag_arg,
+                                          typename ring::alltoallv::right_ready_to_recv_flag_arg,
+                                          typename ring::alltoallv::right_proxy_size_flag_arg>(
+                right_tmp_recv_buf_arg.second,
+                right_income_data_flag_arg.second,
+                right_ready_to_recv_flag_arg.second,
+                right_proxy_size_flag_arg.second);
+
+            ENTRY_LOG_DEBUG("Binding arguments between kernels is complete. ",
+                            "Arguments of the left kernel after binding:\n",
+                            left_kernel.to_string());
+        }
         return is_right_kernel_ready;
     }
 };
diff --git a/src/sched/entry/l0/l0_bcast_typed_entry.hpp b/src/sched/entry/l0/l0_bcast_typed_entry.hpp
index b40141fe1..c40b7ddd3 100644
--- a/src/sched/entry/l0/l0_bcast_typed_entry.hpp
+++ b/src/sched/entry/l0/l0_bcast_typed_entry.hpp
@@ -21,9 +21,8 @@
 //TODO L0 Workaround
 
 namespace native {
-template <class kernel_params, class gpu_comm_impl, ccl::group_split_type topology>
-class l0_bcast_typed_entry : public base_gpu_entry<kernel_params,
-                                                   gpu_comm_impl,
+template <class gpu_comm_impl, ccl::group_split_type topology>
+class l0_bcast_typed_entry : public base_gpu_entry<gpu_comm_impl,
                                                    topology,
                                                    ccl::device_topology_type::ring,
                                                    ccl_coll_bcast> {
@@ -31,11 +30,8 @@ class l0_bcast_typed_entry : public base_gpu_entry<kernel_params,
     friend class ccl_gpu_comm;
     friend class ccl_virtual_gpu_comm;
 
-    using base = base_gpu_entry<kernel_params,
-                                gpu_comm_impl,
-                                topology,
-                                ccl::device_topology_type::ring,
-                                ccl_coll_bcast>;
+    using base =
+        base_gpu_entry<gpu_comm_impl, topology, ccl::device_topology_type::ring, ccl_coll_bcast>;
     using base::parent_communicator;
     using base::comm_addr;
     using base::req;
@@ -44,15 +40,15 @@ class l0_bcast_typed_entry : public base_gpu_entry<kernel_params,
     using base::kernel_router;
     using base::get_ctx;
     using base::get_local_kernel;
-    using kernel_main_typed = ring_bcast_kernel<kernel_params>;
-    using kernel_ipc_typed = ring_bcast_ipc<kernel_params>;
+    using kernel_main_typed = ring::bcast::main_kernel;
+    using processing_type = void;
 
     using income_data_flag_gpu_type =
-        typename std::remove_pointer<typename kernel_main_typed::income_data_flag_arg_type>::type;
+        typename std::remove_pointer<typename ring::bcast::income_data_flag_arg_type>::type;
     using ready_to_recv_flag_gpu_type =
-        typename std::remove_pointer<typename kernel_main_typed::ready_to_recv_flag_arg_type>::type;
+        typename std::remove_pointer<typename ring::bcast::ready_to_recv_flag_arg_type>::type;
     using local_barrier_flag_gpu_type =
-        typename std::remove_pointer<typename kernel_main_typed::local_barrier_flag_arg_type>::type;
+        typename std::remove_pointer<typename ring::bcast::local_barrier_flag_arg_type>::type;
 
     static constexpr const char* class_name() noexcept {
         return "L0_BCAST_TYPED";
@@ -70,24 +66,20 @@ class l0_bcast_typed_entry : public base_gpu_entry<kernel_params,
                          ccl_buffer buf,
                          size_t cnt,
                          int root,
+                         const coll_param_gpu& params,
                          std::shared_ptr<ccl_stream> device_stream = std::shared_ptr<ccl_stream>())
-            : base(sched,
-                   comm,
-                   in_ctx,
-                   buf,
-                   ccl::native_type_info<typename kernel_params::native_type>::dtype,
-                   device_stream),
-
-              income_data_flag(this->template alloc_memory_wrap(
-                  typename kernel_main_typed::income_data_flag_arg{},
-                  parent_communicator,
-                  1,
-                  get_ctx())),
-              ready_to_recv_flag(this->template alloc_memory_wrap(
-                  typename kernel_main_typed::ready_to_recv_flag_arg{},
-                  parent_communicator,
-                  1,
-                  get_ctx())),
+            : base(sched, comm, in_ctx, buf, params, device_stream),
+
+              income_data_flag(
+                  this->template alloc_memory_wrap(typename ring::bcast::income_data_flag_arg{},
+                                                   parent_communicator,
+                                                   1,
+                                                   get_ctx())),
+              ready_to_recv_flag(
+                  this->template alloc_memory_wrap(typename ring::bcast::ready_to_recv_flag_arg{},
+                                                   parent_communicator,
+                                                   1,
+                                                   get_ctx())),
               local_barrier_flag(parent_communicator->get_device()
                                      .template alloc_memory<local_barrier_flag_gpu_type>(
                                          1,
@@ -98,8 +90,8 @@ class l0_bcast_typed_entry : public base_gpu_entry<kernel_params,
 
         int next_rank = (comm_addr.rank + 1) % comm_addr.size;
         kernel_router = base::template create_kernel_router_for_rank<
-            l0_bcast_typed_entry<kernel_params, gpu_comm_impl, topology>>(
-            *this, next_rank, available_devices);
+            l0_bcast_typed_entry<gpu_comm_impl, topology>>(
+            *this, next_rank, available_devices, base::get_params());
 
         ENTRY_LOG_DEBUG("Init phase of current entry for ext_rank:", next_rank);
 
@@ -123,10 +115,10 @@ class l0_bcast_typed_entry : public base_gpu_entry<kernel_params,
 
         //create implementation specified primitives
         main_entry_function
-            .template set_args<typename kernel_main_typed::income_data_flag_arg,
-                               typename kernel_main_typed::ready_to_recv_flag_arg,
-                               typename kernel_main_typed::local_barrier_flag_arg,
-                               typename kernel_main_typed::root_arg,
+            .template set_args<typename ring::bcast::income_data_flag_arg,
+                               typename ring::bcast::ready_to_recv_flag_arg,
+                               typename ring::bcast::local_barrier_flag_arg,
+                               typename ring::bcast::root_arg,
                                typename kernel_main_typed::common_entry_buf_size_arg>(
                 income_data_flag.get(),
                 ready_to_recv_flag.get(),
@@ -150,9 +142,11 @@ class l0_bcast_typed_entry : public base_gpu_entry<kernel_params,
     std::vector<ccl_device::device_ipc_memory_handle> get_ipc_data() override {
         ccl_device& owned_device = parent_communicator->get_device();
 
-        //TODO
+        auto recv_buf_ptr = reinterpret_cast<processing_type*>(base::send_buf.get_ptr());
+
         std::vector<ccl_device::device_ipc_memory_handle> ret;
-        ret.reserve(2);
+        ret.reserve(3);
+        ret.push_back(owned_device.create_ipc_memory_handle(recv_buf_ptr, get_ctx()));
         ret.push_back(owned_device.create_ipc_memory_handle(income_data_flag.get(), get_ctx()));
         ret.push_back(owned_device.create_ipc_memory_handle(ready_to_recv_flag.get(), get_ctx()));
         return ret;
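
The handles returned here are what allow a neighboring process to map this rank's device buffers. A hedged Level Zero sketch of producing one such handle; zeMemGetIpcHandle/zeMemOpenIpcHandle are the real driver APIs, while export_ipc_handle and the surrounding setup are hypothetical:

```cpp
#include <level_zero/ze_api.h>

// Export a device allocation so another process can map it.
ze_ipc_mem_handle_t export_ipc_handle(ze_context_handle_t ctx, void* device_ptr) {
    ze_ipc_mem_handle_t handle{};
    // On success the opaque handle can be shipped to a peer process, which
    // maps it into its own address space with zeMemOpenIpcHandle().
    ze_result_t res = zeMemGetIpcHandle(ctx, device_ptr, &handle);
    (void)res; // a real caller would check res == ZE_RESULT_SUCCESS
    return handle;
}
```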
@@ -172,109 +166,61 @@ class l0_bcast_typed_entry : public base_gpu_entry<kernel_params,
     std::shared_ptr<ccl_context> ctx;
 
 public:
-    bool execute(kernel_main_typed& main_entry_function, kernel_main_typed& right_kernel) {
-        //Check argument binding in kernels for next rank
+    template <class left_kernel_t, class right_kernel_t>
+    bool execute(left_kernel_t& left_kernel, right_kernel_t& right_kernel) {
         bool is_right_kernel_ready =
-            right_kernel.template test_args<typename kernel_main_typed::common_entry_buf_arg,
-                                            typename kernel_main_typed::income_data_flag_arg,
-                                            typename kernel_main_typed::ready_to_recv_flag_arg>();
-        if (is_right_kernel_ready) {
-            //TODO do not get arguments sequencially - use array version instead
-            typename kernel_main_typed::common_entry_buf_arg::return_t right_buf_arg =
-                right_kernel.template get_arg<typename kernel_main_typed::common_entry_buf_arg>();
-            typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg =
-                right_kernel.template get_arg<typename kernel_main_typed::income_data_flag_arg>();
-            typename kernel_main_typed::ready_to_recv_flag_arg::return_t
-                right_ready_to_recv_flag_arg =
-                    right_kernel
-                        .template get_arg<typename kernel_main_typed::ready_to_recv_flag_arg>();
+            right_kernel.template test_args<typename ring::bcast::buf_arg<processing_type>,
+                                            typename ring::bcast::income_data_flag_arg,
+                                            typename ring::bcast::ready_to_recv_flag_arg>();
+
+        // Once we're sure the parameters are ready, read them from the right kernel.
+        // Note: we don't just read the parameters, we also reset their 'ready' flag
+        // (since we use a destructive-copying policy), meaning they must be stored
+        // in order to be read again.
+        // This protects against the case of multiple kernel launches
+        // (i.e. the collective is run multiple times), where we might otherwise read
+        // stale values from the previous run.
 
-            ENTRY_LOG_DEBUG("Bind final arguments for kernel: ", kernel_main_typed::name());
-            ENTRY_LOG_TRACE("Args: \n{ ",
-                            right_buf_arg.first,
-                            ", ",
-                            right_buf_arg.second,
-                            "}\n",
-                            "{ ",
-                            right_income_data_flag_arg.first,
-                            ", ",
-                            right_income_data_flag_arg.second,
-                            "}\n",
-                            "{ ",
-                            right_ready_to_recv_flag_arg.first,
-                            ", ",
-                            right_ready_to_recv_flag_arg.second,
-                            "}\n");
-
-            //TODO register argument for current device kernel: use array-version
-            main_entry_function
-                .template set_args<typename kernel_main_typed::right_buf_arg,
-                                   typename kernel_main_typed::right_income_data_flag_arg,
-                                   typename kernel_main_typed::right_ready_to_recv_flag_arg>(
-                    right_buf_arg.second,
-                    right_income_data_flag_arg.second,
-                    right_ready_to_recv_flag_arg.second);
-            ENTRY_LOG_DEBUG("Final Function: ", main_entry_function.to_string());
-        }
-        return is_right_kernel_ready;
-    }
-
-    bool execute(kernel_main_typed& main_entry_function, kernel_ipc_typed& right_kernel) {
-        //Check argument binding in kernels for next rank
-        bool is_right_kernel_ready =
-            right_kernel.template test_args<typename kernel_ipc_typed::recv_buf_arg,
-                                            typename kernel_ipc_typed::income_data_flag_arg,
-                                            typename kernel_ipc_typed::ready_to_recv_flag_arg>();
         if (is_right_kernel_ready) {
-            //TODO do not get arguments sequencially - use array version instead
-            typename kernel_main_typed::common_entry_buf_arg::return_t right_buf_arg =
-                right_kernel.template get_arg<typename kernel_ipc_typed::recv_buf_arg>();
-            typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg =
-                right_kernel.template get_arg<typename kernel_ipc_typed::income_data_flag_arg>();
-            typename kernel_main_typed::ready_to_recv_flag_arg::return_t
-                right_ready_to_recv_flag_arg =
-                    right_kernel
-                        .template get_arg<typename kernel_ipc_typed::ready_to_recv_flag_arg>();
-
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ", bind elapsed arguments for kernel: ",
-                      kernel_main_typed::name());
-            LOG_TRACE("Args: \n{ ",
-                      right_buf_arg.first,
-                      ", ",
-                      right_buf_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_income_data_flag_arg.first,
-                      ", ",
-                      right_income_data_flag_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_ready_to_recv_flag_arg.first,
-                      ", ",
-                      right_ready_to_recv_flag_arg.second,
-                      "}\n");
-
-            //TODO register argument for current device kernel: user array version
-            main_entry_function
-                .template set_args<typename kernel_main_typed::right_buf_arg,
-                                   typename kernel_main_typed::right_income_data_flag_arg,
-                                   typename kernel_main_typed::right_ready_to_recv_flag_arg>(
-                    right_buf_arg.second,
-                    right_income_data_flag_arg.second,
-                    right_ready_to_recv_flag_arg.second);
-            LOG_TRACE("Set right_tmp_recv_buf_arg",
-                      "Set right_income_data_flag_arg",
-                      "Set right_ready_to_recv_flag_arg");
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ". Function: ",
-                      main_entry_function.to_string());
+            auto right_buf_arg =
+                right_kernel.template get_arg<typename ring::bcast::buf_arg<processing_type>>();
+            auto right_income_data_flag_arg =
+                right_kernel.template get_arg<typename ring::bcast::income_data_flag_arg>();
+            auto right_ready_to_recv_flag_arg =
+                right_kernel.template get_arg<typename ring::bcast::ready_to_recv_flag_arg>();
+
+            // ENTRY_LOG_DEBUG("Bind right arguments from ",
+            //                 right_kernel_t::name(),
+            //                 " kernel",
+            //                 " to ",
+            //                 left_kernel_t::name(),
+            //                 " kernel. "
+            //                 "Right arguments:\n{ ",
+            //                 right_buf_arg.first,
+            //                 ", ",
+            //                 right_buf_arg.second,
+            //                 "}\n",
+            //                 "{ ",
+            //                 right_income_data_flag_arg.first,
+            //                 ", ",
+            //                 right_income_data_flag_arg.second,
+            //                 "}\n",
+            //                 "{ ",
+            //                 right_ready_to_recv_flag_arg.first,
+            //                 ", ",
+            //                 right_ready_to_recv_flag_arg.second,
+            //                 "}\n");
+
+            left_kernel.template set_args<typename ring::bcast::right_buf_arg<processing_type>,
+                                          typename ring::bcast::right_income_data_flag_arg,
+                                          typename ring::bcast::right_ready_to_recv_flag_arg>(
+                right_buf_arg.second,
+                right_income_data_flag_arg.second,
+                right_ready_to_recv_flag_arg.second);
+
+            ENTRY_LOG_DEBUG("Binding arguments between kernels is complete. ",
+                            "Arguments of the left kernel after binding:\n",
+                            left_kernel.to_string());
         }
         return is_right_kernel_ready;
     }
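
The destructive-copy handshake documented in execute() above is the core of how neighboring ranks wire their kernels together. A minimal standalone sketch of the idea, assuming toy types; arg_slot, kernel_stub, and bind_right_args are hypothetical stand-ins, not the CCL kernel-argument classes:

```cpp
#include <atomic>
#include <cstdio>
#include <utility>

template <class T>
struct arg_slot {
    std::atomic<bool> ready{ false };
    T value{};

    void store(const T& v) { // producer side: publish a value
        value = v;
        ready.store(true, std::memory_order_release);
    }
    bool test() const { // non-destructive readiness check (cf. test_args)
        return ready.load(std::memory_order_acquire);
    }
    std::pair<bool, T> get() { // destructive copy (cf. get_arg): read AND
        T v = value;           // clear the flag, so a later launch cannot
        ready.store(false, std::memory_order_relaxed); // observe stale data
        return { true, v };
    }
};

struct kernel_stub {
    arg_slot<void*> buf;       // what the right kernel publishes
    void* right_buf = nullptr; // what the left kernel consumes (cf. set_args)
};

// Mirrors execute(): poll the right kernel; once its args are ready,
// destructively copy them into the left kernel's right_* slots.
bool bind_right_args(kernel_stub& left, kernel_stub& right) {
    if (!right.buf.test())
        return false; // right rank not ready yet; the entry is retried later
    left.right_buf = right.buf.get().second;
    return true;
}

int main() {
    kernel_stub left, right;
    int payload = 42;
    right.buf.store(&payload);
    while (!bind_right_args(left, right)) { /* would be re-scheduled */
    }
    std::printf("bound right buffer: %p\n", left.right_buf);
}
```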
diff --git a/src/sched/entry/l0/l0_entry.hpp b/src/sched/entry/l0/l0_entry.hpp
index ed2598f11..7bcb1add3 100644
--- a/src/sched/entry/l0/l0_entry.hpp
+++ b/src/sched/entry/l0/l0_entry.hpp
@@ -30,8 +30,8 @@
 #include "common/global/global.hpp"
 #include "common/stream/stream.hpp"
 
-#include "common/comm/l0/context/scaling_ctx/ipc_session_key.hpp"
-#include "common/comm/l0/context/scaling_ctx/observer_session_key.hpp"
+#include "common/comm/l0/context/scale/ipc/ipc_session_key.hpp"
+#include "common/comm/l0/context/scale/base/base_session.hpp"
 
 //TODO L0 Workaround
 #include <unistd.h>
@@ -149,19 +149,16 @@ inline std::string to_string(gpu_entry_state state) {
 }
 
 namespace native {
-template <class kernel_params,
-          class gpu_comm_impl,
+template <class gpu_comm_impl,
           ccl::group_split_type group_id,
           ccl::device_topology_type class_id,
           ccl_coll_type type_op>
 class base_gpu_entry : public sched_entry {
 public:
     using gpu_comm = gpu_comm_impl;
-    using processing_type = typename kernel_params::native_type;
-    using kernel_main_typed =
-        typename gpu_comm::template gpu_kernel_t<type_op, group_id, class_id, kernel_params>;
-    using kernel_ipc_typed = typename ccl_ipc_gpu_comm::
-        template gpu_kernel_t<type_op, group_id, class_id, kernel_params>;
+    using kernel_main_typed = typename gpu_comm::template gpu_kernel_t<type_op, group_id, class_id>;
+    using kernel_ipc_typed =
+        typename ccl_ipc_gpu_comm::template gpu_kernel_t<type_op, group_id, class_id>;
 
     template <class elem_t>
     using device_memory = memory<elem_t, ccl_device, ccl_context>;
@@ -188,14 +185,14 @@ class base_gpu_entry : public sched_entry {
                    std::shared_ptr<gpu_comm> comm,
                    ccl_driver_context_ptr in_ctx,
                    const ccl_buffer send_buf,
-                   ccl::datatype dtype_in,
+                   const coll_param_gpu &params,
                    std::shared_ptr<ccl_stream> &stream)
             : sched_entry(sched),
               parent_communicator(comm),
               comm_addr(parent_communicator
                             ->template get_comm_data<get_topology(), get_topology_class()>()),
               send_buf(send_buf),
-              dtype(dtype_in),
+              params(params),
               device_stream(stream),
               ctx(in_ctx),
               entry_state(gpu_entry_state::initial),
@@ -209,10 +206,8 @@ class base_gpu_entry : public sched_entry {
     }
 
     kernel_main_typed &get_local_kernel() noexcept {
-        return parent_communicator->template get_gpu_kernel<type(),
-                                                            get_topology(),
-                                                            get_topology_class(),
-                                                            kernel_params>();
+        return parent_communicator
+            ->template get_gpu_kernel<type(), get_topology(), get_topology_class()>(params);
     }
 
     virtual ~base_gpu_entry() {}
@@ -234,11 +229,11 @@ class base_gpu_entry : public sched_entry {
 
         //set kernel args for main kernel on current device
         kernel_main_typed &main_entry_function =
-            parent_communicator->template register_entry<kernel_params, group_id, class_id>(*this);
+            parent_communicator->template register_entry<group_id, class_id>(*this);
 
-        auto send_buf_ptr =
-            reinterpret_cast<typename kernel_params::native_type *>(send_buf.get_ptr());
+        auto send_buf_ptr = send_buf.get_ptr();
 
+        //bind data
         main_entry_function.template set_args<typename kernel_main_typed::common_entry_buf_arg>(
             send_buf_ptr);
 
@@ -254,6 +249,7 @@ class base_gpu_entry : public sched_entry {
 
     virtual void update() override {
         if (!ready_to_exec) {
+            // TODO: what if submit_for_execution() returns false?
             submit_for_execution();
         }
         else {
@@ -262,7 +258,17 @@ class base_gpu_entry : public sched_entry {
 
             ENTRY_LOG_TRACE(" waiting for finished execution, queue: ", cmd_queue.get());
 
-            ze_result_t ret = get_fence_impl().query_status();
+            ze_result_t ret;
+
+            // Querying the fence doesn't sync kernel output with the host, so when
+            // we need that we use the zeCommandQueueSynchronize API instead.
+            if (ccl::global_data::env().comm_kernels_debug == 0) {
+                ret = get_fence_impl().query_status();
+            }
+            else {
+                ret = zeCommandQueueSynchronize(cmd_queue.get(), 0);
+            }
+
             ENTRY_LOG_TRACE(
                 "Fence query status: ", native::to_string(ret), ", queue: ", cmd_queue.get());
             if (ret == ZE_RESULT_SUCCESS) {
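
Both completion checks used in this hunk are plain Level Zero calls. A hedged sketch of the choice; zeFenceQueryStatus and zeCommandQueueSynchronize are the real APIs, the handles are assumed to be created elsewhere, and the claim that queue synchronization makes kernel output host-visible follows the comment above:

```cpp
#include <level_zero/ze_api.h>

// Returns true once the submitted kernel has finished.
bool entry_finished(ze_fence_handle_t fence,
                    ze_command_queue_handle_t queue,
                    bool need_host_visible_output) {
    ze_result_t ret;
    if (!need_host_visible_output) {
        // Cheap poll: reports whether the fence was signaled, but does not
        // guarantee the kernel's debug output is visible on the host.
        ret = zeFenceQueryStatus(fence);
    }
    else {
        // Synchronizing the queue with a zero timeout also just polls, but on
        // success it additionally syncs kernel output with the host.
        ret = zeCommandQueueSynchronize(queue, 0);
    }
    return ret == ZE_RESULT_SUCCESS; // ZE_RESULT_NOT_READY means "poll again"
}
```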
@@ -300,7 +306,13 @@ class base_gpu_entry : public sched_entry {
 
     //USE GPU cache binding
     virtual std::vector<ccl_device::device_ipc_memory_handle> get_ipc_data() = 0;
-    virtual observer::invoke_params<type(), kernel_params> get_numa_data() {
+    virtual observer::invoke_params<type()> get_numa_data() {
+        //TODO make pure-virtual
+        ENTRY_LOG_ERROR("NOT implemented for that collective type");
+        abort();
+    }
+
+    virtual observer::invoke_params<type()> get_scaleout_data() {
         //TODO make pure-virtual
         ENTRY_LOG_ERROR("NOT implemented for that collective type");
         abort();
@@ -314,11 +326,19 @@ class base_gpu_entry : public sched_entry {
         return native::observer::session_key{ this };
     }
 
+    virtual native::observer::session_key get_scaleout_session_key() const {
+        return native::observer::session_key{ this };
+    }
+
+    const coll_param_gpu &get_params() const {
+        return params;
+    }
+
 protected:
     size_t get_work_group_size(size_t buffer_size, ccl_device &device) {
         size_t group_size;
         size_t val_vector_size;
-        auto dtype = ccl::native_type_info<typename kernel_params::native_type>::dtype;
+        auto dtype = params.get_datatype();
 
         if (ccl::global_data::env().gpu_thread_count != CCL_ENV_SIZET_NOT_SPECIFIED) {
             group_size = ccl::global_data::env().gpu_thread_count;
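
The same move from compile-time kernel_params to runtime parameters shows up here: the element size is now derived from the datatype carried in coll_param_gpu. A rough model of that lookup, where datatype, datatype_size, and the sizing heuristic are assumptions rather than the CCL implementation:

```cpp
#include <cstddef>
#include <map>

enum class datatype { int8, float16, float32, float64 };

// Stand-in for ccl::get_datatype_size().
inline std::size_t datatype_size(datatype dt) {
    static const std::map<datatype, std::size_t> sizes{
        { datatype::int8, 1 },
        { datatype::float16, 2 },
        { datatype::float32, 4 },
        { datatype::float64, 8 },
    };
    return sizes.at(dt);
}

struct coll_param_stub {
    datatype dt;
    datatype get_datatype() const { return dt; }
};

// Pick a work-group size: an explicit override wins; otherwise derive it
// from how many elements fit into an assumed per-group byte budget.
std::size_t work_group_size(std::size_t buffer_bytes,
                            const coll_param_stub& params,
                            std::size_t env_override /* 0 = not set */) {
    if (env_override != 0)
        return env_override;
    const std::size_t elem = datatype_size(params.get_datatype());
    const std::size_t budget = 256 * elem; // hypothetical budget per group
    return buffer_bytes < budget ? buffer_bytes / elem : 256;
}
```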
@@ -431,9 +451,12 @@ class base_gpu_entry : public sched_entry {
         assert(this->get_state() != gpu_entry_state::wait_for_completion);
 
         if (get_topology() == ccl::group_split_type::cluster) {
-            // TODO: implement process communicator case
-            throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                                 "TODO: implement process communicator case");
+            // TODO: in the case of (virtual device + IPC) we can get a data race here.
+            // How can we detect such a case?
+            // When we use one GPU queue per process, everything should be ok.
+            // throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
+            //                      "TODO: implement process communicator case");
+            cmd_list.close_and_execute(get_ctx(), this->get_fence());
         }
         else {
             // TODO: how to ensure that fence update is thread safe?
@@ -502,7 +525,10 @@ class base_gpu_entry : public sched_entry {
     std::shared_ptr<gpu_comm> parent_communicator;
     topology_addr<group_id, class_id> comm_addr;
     ccl_buffer send_buf;
-    ccl::datatype dtype;
+    coll_param_gpu params;
+
+    // TODO: do we still need dtype?
+    // ccl::datatype dtype;
     atl_req_t req{};
     std::shared_ptr<ccl_stream> device_stream;
     // GPU
@@ -535,7 +561,8 @@ class base_gpu_entry : public sched_entry {
     static std::unique_ptr<base_connector_interface<kernel_main_typed>>
     create_kernel_router_for_rank(executor &exec,
                                   int next_rank,
-                                  specific_indexed_device_storage &group_devices) {
+                                  specific_indexed_device_storage &group_devices,
+                                  const coll_param_gpu &params) {
         std::unique_ptr<base_connector_interface<kernel_main_typed>> kernel_router;
         while (!kernel_router) {
             //Gather data from in-process GPU
@@ -548,10 +575,10 @@ class base_gpu_entry : public sched_entry {
 
             std::shared_ptr<right_gpu_type> gpu = it->second;
             using right_kernel_main_type = typename right_gpu_type::
-                template gpu_kernel_t<type(), get_topology(), get_topology_class(), kernel_params>;
+                template gpu_kernel_t<type(), get_topology(), get_topology_class()>;
 
             right_kernel_main_type &right_main_func =
-                gpu->get_gpu_kernel<type(), get_topology(), get_topology_class(), kernel_params>();
+                gpu->get_gpu_kernel<type(), get_topology(), get_topology_class()>(params);
 
             //communicate with real device
             kernel_router.reset(
@@ -570,10 +597,10 @@ class base_gpu_entry : public sched_entry {
 
             std::shared_ptr<right_gpu_type> gpu = it->second;
             using right_kernel_main_type = typename right_gpu_type::
-                template gpu_kernel_t<type(), get_topology(), get_topology_class(), kernel_params>;
+                template gpu_kernel_t<type(), get_topology(), get_topology_class()>;
 
             right_kernel_main_type &right_main_func =
-                gpu->get_gpu_kernel<type(), get_topology(), get_topology_class(), kernel_params>();
+                gpu->get_gpu_kernel<type(), get_topology(), get_topology_class()>(params);
             kernel_router.reset(
                 new kernel_connector<kernel_main_typed, executor, right_kernel_main_type>(
                     exec, right_main_func));
@@ -590,14 +617,14 @@ class base_gpu_entry : public sched_entry {
             }
             std::shared_ptr<right_gpu_type> gpu = it->second;
             using right_kernel_main_type = typename right_gpu_type::
-                template gpu_kernel_t<type(), get_topology(), get_topology_class(), kernel_params>;
+                template gpu_kernel_t<type(), get_topology(), get_topology_class()>;
             /*std::shared_ptr<thread_group_comm_device> gpu = map_devices.find(next_rank);
             if(gpu == nullptr)
             {
                 break; // not ready yet!
             }*/
             right_kernel_main_type &right_main_func =
-                gpu->get_gpu_kernel<type(), get_topology(), get_topology_class(), kernel_params>();
+                gpu->get_gpu_kernel<type(), get_topology(), get_topology_class()>(params);
 
             //communicate with real device from another thread
             kernel_router.reset(
@@ -616,7 +643,7 @@ class base_gpu_entry : public sched_entry {
             }
             std::shared_ptr<right_gpu_type> gpu = it->second;
             using right_kernel_main_type = typename right_gpu_type::
-                template gpu_kernel_t<type(), get_topology(), get_topology_class(), kernel_params>;
+                template gpu_kernel_t<type(), get_topology(), get_topology_class()>;
             /*
             std::shared_ptr<thread_group_comm_device> gpu = map_devices.find(next_rank);
             if(gpu == nullptr)
@@ -624,7 +651,7 @@ class base_gpu_entry : public sched_entry {
                 break; // not ready yet!
             }*/
             right_kernel_main_type &right_main_func =
-                gpu->get_gpu_kernel<type(), get_topology(), get_topology_class(), kernel_params>();
+                gpu->get_gpu_kernel<type(), get_topology(), get_topology_class()>(params);
 
             //communicate with virtual device from another thread
             kernel_router.reset(
@@ -643,9 +670,9 @@ class base_gpu_entry : public sched_entry {
             }
             std::shared_ptr<right_gpu_type> gpu = it->second;
             using right_kernel_main_type = typename right_gpu_type::
-                template gpu_kernel_t<type(), get_topology(), get_topology_class(), kernel_params>;
+                template gpu_kernel_t<type(), get_topology(), get_topology_class()>;
             right_kernel_main_type &right_main_func =
-                gpu->get_gpu_kernel<type(), get_topology(), get_topology_class(), kernel_params>();
+                gpu->get_gpu_kernel<type(), get_topology(), get_topology_class()>(params);
 
             //communicate with real device from another thread
             kernel_router.reset(
@@ -665,9 +692,9 @@ class base_gpu_entry : public sched_entry {
 
             std::shared_ptr<right_gpu_type> gpu = it->second;
             using right_kernel_main_type = typename right_gpu_type::
-                template gpu_kernel_t<type(), get_topology(), get_topology_class(), kernel_params>;
+                template gpu_kernel_t<type(), get_topology(), get_topology_class()>;
             right_kernel_main_type &right_main_func =
-                gpu->get_gpu_kernel<type(), get_topology(), get_topology_class(), kernel_params>();
+                gpu->get_gpu_kernel<type(), get_topology(), get_topology_class()>(params);
 
             //communicate with virtual device from another thread
             kernel_router.reset(
@@ -686,9 +713,9 @@ class base_gpu_entry : public sched_entry {
             }
             std::shared_ptr<right_gpu_type> gpu = it->second;
             using right_kernel_main_type = typename right_gpu_type::
-                template gpu_kernel_t<type(), get_topology(), get_topology_class(), kernel_params>;
+                template gpu_kernel_t<type(), get_topology(), get_topology_class()>;
             right_kernel_main_type &right_main_func =
-                gpu->get_gpu_kernel<type(), get_topology(), get_topology_class(), kernel_params>();
+                gpu->get_gpu_kernel<type(), get_topology(), get_topology_class()>(params);
 
             //communicate with virtual device from another thread
             kernel_router.reset(
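
Every branch of create_kernel_router_for_rank() now follows the same shape: look up the next rank's device, fetch its kernel for the runtime params (instead of instantiating a kernel_params template), and wrap it in a connector. A compressed, illustrative version, with the device map, gpu_stub, and connector as simplified stand-ins for the CCL types:

```cpp
#include <map>
#include <memory>

struct coll_params { int datatype_id; };
struct kernel { /* bound to one device and one set of params */ };

struct gpu_stub {
    kernel& get_kernel(const coll_params&) { return k; } // cf. get_gpu_kernel(params)
    kernel k;
};

struct connector {
    explicit connector(kernel& right) : right_kernel(right) {}
    kernel& right_kernel; // the neighbor kernel this rank binds its args to
};

std::unique_ptr<connector> make_router(std::map<int, gpu_stub>& devices,
                                       int next_rank,
                                       const coll_params& params) {
    auto it = devices.find(next_rank);
    if (it == devices.end())
        return nullptr; // neighbor not registered yet; the caller retries
    return std::make_unique<connector>(it->second.get_kernel(params));
}
```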
diff --git a/src/sched/entry/l0/l0_reduce_scatter_typed_entry.hpp b/src/sched/entry/l0/l0_reduce_scatter_typed_entry.hpp
index 85afe5dde..2e5f1b616 100644
--- a/src/sched/entry/l0/l0_reduce_scatter_typed_entry.hpp
+++ b/src/sched/entry/l0/l0_reduce_scatter_typed_entry.hpp
@@ -19,12 +19,13 @@
 
 #include "sched/entry/l0/l0_entry.hpp"
 
+#include "kernels/shared.h"
+
 //TODO L0 Workaround
 
 namespace native {
-template <class kernel_params, class gpu_comm_impl, ccl::group_split_type topology>
-class l0_reduce_scatter_typed_entry : public base_gpu_entry<kernel_params,
-                                                            gpu_comm_impl,
+template <class gpu_comm_impl, ccl::group_split_type topology>
+class l0_reduce_scatter_typed_entry : public base_gpu_entry<gpu_comm_impl,
                                                             topology,
                                                             ccl::device_topology_type::ring,
                                                             ccl_coll_reduce_scatter> {
@@ -32,8 +33,7 @@ class l0_reduce_scatter_typed_entry : public base_gpu_entry<kernel_params,
     friend class ccl_gpu_comm;
     friend class ccl_virtual_gpu_comm;
 
-    using base = base_gpu_entry<kernel_params,
-                                gpu_comm_impl,
+    using base = base_gpu_entry<gpu_comm_impl,
                                 topology,
                                 ccl::device_topology_type::ring,
                                 ccl_coll_reduce_scatter>;
@@ -45,16 +45,14 @@ class l0_reduce_scatter_typed_entry : public base_gpu_entry<kernel_params,
     using base::kernel_router;
     using base::get_ctx;
     using base::get_local_kernel;
+    using kernel_main_typed = ring::reduce_scatter::main_kernel;
 
-    using kernel_main_typed = ring_reduce_scatter_kernel<kernel_params>;
-    using kernel_ipc_typed = ring_reduce_scatter_ipc<kernel_params>;
-
-    using income_data_flag_gpu_type =
-        typename std::remove_pointer<typename kernel_main_typed::income_data_flag_arg_type>::type;
-    using ready_to_recv_flag_gpu_type =
-        typename std::remove_pointer<typename kernel_main_typed::ready_to_recv_flag_arg_type>::type;
-    using local_barrier_flag_gpu_type =
-        typename std::remove_pointer<typename kernel_main_typed::local_barrier_flag_arg_type>::type;
+    using income_data_flag_gpu_type = typename std::remove_pointer<
+        typename ring::reduce_scatter::income_data_flag_arg_type>::type;
+    using ready_to_recv_flag_gpu_type = typename std::remove_pointer<
+        typename ring::reduce_scatter::ready_to_recv_flag_arg_type>::type;
+    using local_barrier_flag_gpu_type = typename std::remove_pointer<
+        typename ring::reduce_scatter::local_barrier_flag_arg_type>::type;
 
     static constexpr const char* class_name() noexcept {
         return "L0_REDUCE_SCATTER_TYPED";
@@ -73,27 +71,23 @@ class l0_reduce_scatter_typed_entry : public base_gpu_entry<kernel_params,
         const ccl_buffer send_buf,
         ccl_buffer recv_buf,
         size_t cnt,
-        ccl::reduction op,
+        const coll_param_gpu& params,
         std::shared_ptr<ccl_stream> device_stream = std::shared_ptr<ccl_stream>())
-            : base(sched,
-                   comm,
-                   in_ctx,
-                   send_buf,
-                   ccl::native_type_info<typename kernel_params::native_type>::dtype,
-                   device_stream),
-
-              temp_buffer(
-                  this->template alloc_memory_wrap(typename kernel_main_typed::tmp_recv_buf_arg{},
-                                                   parent_communicator,
-                                                   cnt,
-                                                   get_ctx())),
+            : base(sched, comm, in_ctx, send_buf, params, device_stream),
+
+              temp_buffer(this->template alloc_memory_wrap(
+                  typename ring::reduce_scatter::tmp_recv_buf_arg<uint8_t>{},
+                  parent_communicator,
+                  ring_reduce_scatter_tmp_buffer_size(cnt, base::comm_addr.size) *
+                      ccl::get_datatype_size(params.get_datatype()),
+                  get_ctx())),
               income_data_flag(this->template alloc_memory_wrap(
-                  typename kernel_main_typed::income_data_flag_arg{},
+                  typename ring::reduce_scatter::income_data_flag_arg{},
                   parent_communicator,
                   1,
                   get_ctx())),
               ready_to_recv_flag(this->template alloc_memory_wrap(
-                  typename kernel_main_typed::ready_to_recv_flag_arg{},
+                  typename ring::reduce_scatter::ready_to_recv_flag_arg{},
                   parent_communicator,
                   1,
                   get_ctx())),
@@ -103,13 +97,12 @@ class l0_reduce_scatter_typed_entry : public base_gpu_entry<kernel_params,
                                          sizeof(local_barrier_flag_gpu_type),
                                          get_ctx())) {
         recv_buf_typed_entry = recv_buf;
-        op_typed_entry = op;
         cnt_entry = cnt;
 
         int next_rank = (comm_addr.rank + 1) % comm_addr.size;
         kernel_router = base::template create_kernel_router_for_rank<
-            l0_reduce_scatter_typed_entry<kernel_params, gpu_comm_impl, topology>>(
-            *this, next_rank, available_devices);
+            l0_reduce_scatter_typed_entry<gpu_comm_impl, topology>>(
+            *this, next_rank, available_devices, base::get_params());
 
         ENTRY_LOG_DEBUG("Init phase of current entry for ext_rank:", next_rank);
 
@@ -131,15 +124,15 @@ class l0_reduce_scatter_typed_entry : public base_gpu_entry<kernel_params,
 
         auto& main_entry_function = get_local_kernel();
 
-        auto recv_buf_ptr =
-            reinterpret_cast<typename kernel_params::native_type*>(recv_buf_typed_entry.get_ptr());
+        auto recv_buf_ptr = reinterpret_cast<void*>(recv_buf_typed_entry.get_ptr());
+
         //create implementation specified primitives
         main_entry_function
-            .template set_args<typename kernel_main_typed::tmp_recv_buf_arg,
-                               typename kernel_main_typed::income_data_flag_arg,
-                               typename kernel_main_typed::ready_to_recv_flag_arg,
-                               typename kernel_main_typed::local_barrier_flag_arg,
-                               typename kernel_main_typed::recv_buf_arg,
+            .template set_args<typename ring::reduce_scatter::tmp_recv_buf_arg<void>,
+                               typename ring::reduce_scatter::income_data_flag_arg,
+                               typename ring::reduce_scatter::ready_to_recv_flag_arg,
+                               typename ring::reduce_scatter::local_barrier_flag_arg,
+                               typename ring::reduce_scatter::recv_buf_arg<void>,
                                typename kernel_main_typed::common_entry_buf_size_arg>(
                 temp_buffer.get(),
                 income_data_flag.get(),
@@ -164,9 +157,11 @@ class l0_reduce_scatter_typed_entry : public base_gpu_entry<kernel_params,
     std::vector<ccl_device::device_ipc_memory_handle> get_ipc_data() override {
         ccl_device& owned_device = parent_communicator->get_device();
 
-        //TODO
+        auto recv_buf_ptr = reinterpret_cast<void*>(recv_buf_typed_entry.get_ptr());
+
         std::vector<ccl_device::device_ipc_memory_handle> ret;
-        ret.reserve(3);
+        ret.reserve(4);
+        ret.push_back(owned_device.create_ipc_memory_handle(recv_buf_ptr, get_ctx()));
         ret.push_back(owned_device.create_ipc_memory_handle(temp_buffer.get(), get_ctx()));
         ret.push_back(owned_device.create_ipc_memory_handle(income_data_flag.get(), get_ctx()));
         ret.push_back(owned_device.create_ipc_memory_handle(ready_to_recv_flag.get(), get_ctx()));
@@ -179,137 +174,85 @@ class l0_reduce_scatter_typed_entry : public base_gpu_entry<kernel_params,
     }
 
 private:
-    ccl_device::device_memory<typename kernel_params::native_type> temp_buffer;
+    ccl_device::device_memory<> temp_buffer;
     ccl_device::device_memory<income_data_flag_gpu_type> income_data_flag;
     ccl_device::device_memory<ready_to_recv_flag_gpu_type> ready_to_recv_flag;
     ccl_device::device_memory<local_barrier_flag_gpu_type> local_barrier_flag;
-    ccl::reduction op_typed_entry;
     ccl_buffer recv_buf_typed_entry;
     size_t cnt_entry;
     std::shared_ptr<ccl_context> ctx;
 
 public:
-    bool execute(kernel_main_typed& main_entry_function, kernel_main_typed& right_kernel) {
-        //Check argument binding in kernels for next rank
+    template <class left_kernel_t, class right_kernel_t>
+    bool execute(left_kernel_t& left_kernel, right_kernel_t& right_kernel) {
         bool is_right_kernel_ready =
-            right_kernel.template test_args<typename kernel_main_typed::recv_buf_arg,
-                                            typename kernel_main_typed::tmp_recv_buf_arg,
-                                            typename kernel_main_typed::income_data_flag_arg,
-                                            typename kernel_main_typed::ready_to_recv_flag_arg>();
+            right_kernel
+                .template test_args<typename ring::reduce_scatter::recv_buf_arg<void>,
+                                    typename ring::reduce_scatter::tmp_recv_buf_arg<void>,
+                                    typename ring::reduce_scatter::income_data_flag_arg,
+                                    typename ring::reduce_scatter::ready_to_recv_flag_arg>();
+
+        // Once we're sure the parameters are ready, read them from the right kernel.
+        // Note: we don't just read the parameters, we also reset their 'ready' flag
+        // (since we use a destructive-copying policy), meaning they must be stored
+        // in order to be read again.
+        // This protects against the case of multiple kernel launches
+        // (i.e. the collective is run multiple times), where we might otherwise read
+        // stale values from the previous run.
+
         if (is_right_kernel_ready) {
-            //TODO do not get arguments sequencially - use array version instead
-            typename kernel_main_typed::recv_buf_arg::return_t right_output_buf_arg =
-                right_kernel.template get_arg<typename kernel_main_typed::recv_buf_arg>();
-            typename kernel_main_typed::tmp_recv_buf_arg::return_t right_tmp_recv_buf_arg =
-                right_kernel.template get_arg<typename kernel_main_typed::tmp_recv_buf_arg>();
-            typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg =
-                right_kernel.template get_arg<typename kernel_main_typed::income_data_flag_arg>();
-            typename kernel_main_typed::ready_to_recv_flag_arg::return_t
-                right_ready_to_recv_flag_arg =
-                    right_kernel
-                        .template get_arg<typename kernel_main_typed::ready_to_recv_flag_arg>();
-
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ", bind elapsed arguments for kernel: ",
-                      kernel_main_typed::name());
-            LOG_TRACE("Args: \n{ ",
-                      right_tmp_recv_buf_arg.first,
-                      ", ",
-                      right_tmp_recv_buf_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_income_data_flag_arg.first,
-                      ", ",
-                      right_income_data_flag_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_ready_to_recv_flag_arg.first,
-                      ", ",
-                      right_ready_to_recv_flag_arg.second,
-                      "}\n");
-
-            //TODO register argument for current device kernel: use array-version
-            main_entry_function
-                .template set_args<typename kernel_main_typed::right_output_buf_arg,
-                                   typename kernel_main_typed::right_tmp_recv_buf_arg,
-                                   typename kernel_main_typed::right_income_data_flag_arg,
-                                   typename kernel_main_typed::right_ready_to_recv_flag_arg>(
+            auto right_output_buf_arg =
+                right_kernel.template get_arg<typename ring::reduce_scatter::recv_buf_arg<void>>();
+            auto right_tmp_recv_buf_arg =
+                right_kernel
+                    .template get_arg<typename ring::reduce_scatter::tmp_recv_buf_arg<void>>();
+            auto right_income_data_flag_arg =
+                right_kernel
+                    .template get_arg<typename ring::reduce_scatter::income_data_flag_arg>();
+            auto right_ready_to_recv_flag_arg =
+                right_kernel
+                    .template get_arg<typename ring::reduce_scatter::ready_to_recv_flag_arg>();
+
+            // ENTRY_LOG_DEBUG("Bind right arguments from ",
+            //                 right_kernel_t::name(),
+            //                 " kernel",
+            //                 " to ",
+            //                 left_kernel_t::name(),
+            //                 " kernel. "
+            //                 "Right arguments:\n{ ",
+            //                 right_output_buf_arg.first,
+            //                 ", ",
+            //                 right_output_buf_arg.second,
+            //                 "}\n",
+            //                 "{ ",
+            //                 right_tmp_recv_buf_arg.first,
+            //                 ", ",
+            //                 right_tmp_recv_buf_arg.second,
+            //                 "}\n",
+            //                 "{ ",
+            //                 right_income_data_flag_arg.first,
+            //                 ", ",
+            //                 right_income_data_flag_arg.second,
+            //                 "}\n",
+            //                 "{ ",
+            //                 right_ready_to_recv_flag_arg.first,
+            //                 ", ",
+            //                 right_ready_to_recv_flag_arg.second,
+            //                 "}\n");
+
+            left_kernel
+                .template set_args<typename ring::reduce_scatter::right_output_buf_arg<void>,
+                                   typename ring::reduce_scatter::right_tmp_recv_buf_arg<void>,
+                                   typename ring::reduce_scatter::right_income_data_flag_arg,
+                                   typename ring::reduce_scatter::right_ready_to_recv_flag_arg>(
                     right_output_buf_arg.second,
                     right_tmp_recv_buf_arg.second,
                     right_income_data_flag_arg.second,
                     right_ready_to_recv_flag_arg.second);
-            LOG_TRACE("Set right_tmp_recv_buf_arg",
-                      "Set right_income_data_flag_arg",
-                      "Set right_ready_to_recv_flag_arg");
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ". Function: ",
-                      main_entry_function.to_string());
-        }
-        return is_right_kernel_ready;
-    }
 
-    bool execute(kernel_main_typed& main_entry_function, kernel_ipc_typed& right_kernel) {
-        //Check argument binding in kernels for next rank
-        bool is_right_kernel_ready =
-            right_kernel.template test_args<typename kernel_ipc_typed::tmp_recv_buf_arg,
-                                            typename kernel_ipc_typed::income_data_flag_arg,
-                                            typename kernel_ipc_typed::ready_to_recv_flag_arg>();
-        if (is_right_kernel_ready) {
-            //TODO do not get arguments sequencially - use array version instead
-            typename kernel_main_typed::tmp_recv_buf_arg::return_t right_tmp_recv_buf_arg =
-                right_kernel.template get_arg<typename kernel_ipc_typed::tmp_recv_buf_arg>();
-            typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg =
-                right_kernel.template get_arg<typename kernel_ipc_typed::income_data_flag_arg>();
-            typename kernel_main_typed::ready_to_recv_flag_arg::return_t
-                right_ready_to_recv_flag_arg =
-                    right_kernel
-                        .template get_arg<typename kernel_ipc_typed::ready_to_recv_flag_arg>();
-
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ", bind elapsed arguments for kernel: ",
-                      kernel_main_typed::name());
-            LOG_TRACE("Args: \n{ ",
-                      right_tmp_recv_buf_arg.first,
-                      ", ",
-                      right_tmp_recv_buf_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_income_data_flag_arg.first,
-                      ", ",
-                      right_income_data_flag_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_ready_to_recv_flag_arg.first,
-                      ", ",
-                      right_ready_to_recv_flag_arg.second,
-                      "}\n");
-
-            //TODO register argument for current device kernel: user array version
-            main_entry_function
-                .template set_args<typename kernel_main_typed::right_tmp_recv_buf_arg,
-                                   typename kernel_main_typed::right_income_data_flag_arg,
-                                   typename kernel_main_typed::right_ready_to_recv_flag_arg>(
-                    right_tmp_recv_buf_arg.second,
-                    right_income_data_flag_arg.second,
-                    right_ready_to_recv_flag_arg.second);
-            LOG_TRACE("Set right_tmp_recv_buf_arg",
-                      "Set right_income_data_flag_arg",
-                      "Set right_ready_to_recv_flag_arg");
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ". Function: ",
-                      main_entry_function.to_string());
+            ENTRY_LOG_DEBUG("Binding arguments between kernels is complete. ",
+                            "Arguments of the left kernel after binding:\n",
+                            left_kernel.to_string());
         }
         return is_right_kernel_ready;
     }
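
A further consequence of dropping kernel_params: the reduce_scatter temp buffer in the constructor above is allocated as raw bytes, element count times runtime datatype size. A sketch of that arithmetic; the body of the size helper is a guess (one per-rank chunk, rounded up), only the "elements * datatype size" pattern is taken from the diff:

```cpp
#include <cstddef>

// Hypothetical stand-in for ring_reduce_scatter_tmp_buffer_size()
// from kernels/shared.h.
std::size_t tmp_buffer_elems(std::size_t count, std::size_t comm_size) {
    return (count + comm_size - 1) / comm_size; // assumed per-rank chunk
}

// The entry is no longer templated on the element type, so the temp buffer
// is sized in bytes via the runtime datatype.
std::size_t tmp_buffer_bytes(std::size_t count,
                             std::size_t comm_size,
                             std::size_t dtype_size) {
    return tmp_buffer_elems(count, comm_size) * dtype_size;
}
```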
diff --git a/src/sched/entry/l0/l0_reduce_typed_entry.hpp b/src/sched/entry/l0/l0_reduce_typed_entry.hpp
index a2d96df42..72ea07031 100644
--- a/src/sched/entry/l0/l0_reduce_typed_entry.hpp
+++ b/src/sched/entry/l0/l0_reduce_typed_entry.hpp
@@ -22,9 +22,8 @@
 //TODO L0 Workaround
 
 namespace native {
-template <class kernel_params, class gpu_comm_impl, ccl::group_split_type topology>
-class l0_reduce_typed_entry : public base_gpu_entry<kernel_params,
-                                                    gpu_comm_impl,
+template <class gpu_comm_impl, ccl::group_split_type topology>
+class l0_reduce_typed_entry : public base_gpu_entry<gpu_comm_impl,
                                                     topology,
                                                     ccl::device_topology_type::ring,
                                                     ccl_coll_reduce> {
@@ -32,11 +31,8 @@ class l0_reduce_typed_entry : public base_gpu_entry<kernel_params,
     friend class ccl_gpu_comm;
     friend class ccl_virtual_gpu_comm;
 
-    using base = base_gpu_entry<kernel_params,
-                                gpu_comm_impl,
-                                topology,
-                                ccl::device_topology_type::ring,
-                                ccl_coll_reduce>;
+    using base =
+        base_gpu_entry<gpu_comm_impl, topology, ccl::device_topology_type::ring, ccl_coll_reduce>;
     using base::parent_communicator;
     using base::comm_addr;
     using base::req;
@@ -45,15 +41,16 @@ class l0_reduce_typed_entry : public base_gpu_entry<kernel_params,
     using base::kernel_router;
     using base::get_ctx;
     using base::get_local_kernel;
-    using kernel_main_typed = ring_reduce_kernel<kernel_params>;
-    using kernel_ipc_typed = ring_reduce_ipc<kernel_params>;
+    using kernel_main_typed = ring::reduce::main_kernel;
+    // TODO: fix type
+    using processing_type = uint8_t;
 
     using income_data_flag_gpu_type =
-        typename std::remove_pointer<typename kernel_main_typed::income_data_flag_arg_type>::type;
+        typename std::remove_pointer<typename ring::reduce::income_data_flag_arg_type>::type;
     using ready_to_recv_flag_gpu_type =
-        typename std::remove_pointer<typename kernel_main_typed::ready_to_recv_flag_arg_type>::type;
+        typename std::remove_pointer<typename ring::reduce::ready_to_recv_flag_arg_type>::type;
     using local_barrier_flag_gpu_type =
-        typename std::remove_pointer<typename kernel_main_typed::local_barrier_flag_arg_type>::type;
+        typename std::remove_pointer<typename ring::reduce::local_barrier_flag_arg_type>::type;
 
     static constexpr const char* class_name() noexcept {
         return "L0_REDUCE_TYPED";
@@ -73,43 +70,39 @@ class l0_reduce_typed_entry : public base_gpu_entry<kernel_params,
                           size_t cnt,
                           ccl::reduction op,
                           int root,
+                          const coll_param_gpu& params,
                           std::shared_ptr<ccl_stream> device_stream = std::shared_ptr<ccl_stream>())
-            : base(sched,
-                   comm,
-                   in_ctx,
-                   send_buf,
-                   ccl::native_type_info<typename kernel_params::native_type>::dtype,
-                   device_stream),
+            : base(sched, comm, in_ctx, send_buf, params, device_stream),
 
-              temp_buffer(
-                  this->template alloc_memory_wrap(typename kernel_main_typed::tmp_recv_buf_arg{},
-                                                   parent_communicator,
-                                                   cnt,
-                                                   get_ctx())),
-              income_data_flag(this->template alloc_memory_wrap(
-                  typename kernel_main_typed::income_data_flag_arg{},
+              temp_buffer(this->template alloc_memory_wrap(
+                  typename ring::reduce::tmp_recv_buf_arg<uint8_t>{},
                   parent_communicator,
-                  1,
-                  get_ctx())),
-              ready_to_recv_flag(this->template alloc_memory_wrap(
-                  typename kernel_main_typed::ready_to_recv_flag_arg{},
-                  parent_communicator,
-                  1,
+                  ring_reduce_tmp_buffer_size(cnt, comm_addr.size) *
+                      ccl::get_datatype_size(params.get_datatype()),
                   get_ctx())),
+              income_data_flag(
+                  this->template alloc_memory_wrap(typename ring::reduce::income_data_flag_arg{},
+                                                   parent_communicator,
+                                                   1,
+                                                   get_ctx())),
+              ready_to_recv_flag(
+                  this->template alloc_memory_wrap(typename ring::reduce::ready_to_recv_flag_arg{},
+                                                   parent_communicator,
+                                                   1,
+                                                   get_ctx())),
               local_barrier_flag(parent_communicator->get_device()
                                      .template alloc_memory<local_barrier_flag_gpu_type>(
                                          1,
                                          sizeof(local_barrier_flag_gpu_type),
                                          get_ctx())) {
         recv_buf_typed_entry = recv_buf;
-        op_typed_entry = op;
         root_typed_entry = root;
         cnt_entry = cnt;
 
         int next_rank = (comm_addr.rank + 1) % comm_addr.size;
         kernel_router = base::template create_kernel_router_for_rank<
-            l0_reduce_typed_entry<kernel_params, gpu_comm_impl, topology>>(
-            *this, next_rank, available_devices);
+            l0_reduce_typed_entry<gpu_comm_impl, topology>>(
+            *this, next_rank, available_devices, base::get_params());
 
         ENTRY_LOG_DEBUG("Init phase of current entry for ext_rank:", next_rank);
 
@@ -131,16 +124,15 @@ class l0_reduce_typed_entry : public base_gpu_entry<kernel_params,
 
         auto& main_entry_function = get_local_kernel();
 
-        auto recv_buf_ptr =
-            reinterpret_cast<typename kernel_params::native_type*>(recv_buf_typed_entry.get_ptr());
+        auto recv_buf_ptr = reinterpret_cast<void*>(recv_buf_typed_entry.get_ptr());
         //create implementation specified primitives
         main_entry_function
-            .template set_args<typename kernel_main_typed::tmp_recv_buf_arg,
-                               typename kernel_main_typed::income_data_flag_arg,
-                               typename kernel_main_typed::ready_to_recv_flag_arg,
-                               typename kernel_main_typed::local_barrier_flag_arg,
-                               typename kernel_main_typed::recv_buf_arg,
-                               typename kernel_main_typed::root_arg,
+            .template set_args<typename ring::reduce::tmp_recv_buf_arg<void>,
+                               typename ring::reduce::income_data_flag_arg,
+                               typename ring::reduce::ready_to_recv_flag_arg,
+                               typename ring::reduce::local_barrier_flag_arg,
+                               typename ring::reduce::recv_buf_arg<void>,
+                               typename ring::reduce::root_arg,
                                typename kernel_main_typed::common_entry_buf_size_arg>(
                 temp_buffer.get(),
                 income_data_flag.get(),
@@ -155,6 +147,7 @@ class l0_reduce_typed_entry : public base_gpu_entry<kernel_params,
         this->set_state(gpu_entry_state::wait_for_entry);
 
         //make sure, that kernel ready for launch
+        // TODO: what if submit_for_execution() returns false?
         this->submit_for_execution();
         status = ccl_sched_entry_status_started;
     }
@@ -166,7 +159,6 @@ class l0_reduce_typed_entry : public base_gpu_entry<kernel_params,
     std::vector<ccl_device::device_ipc_memory_handle> get_ipc_data() override {
         ccl_device& owned_device = parent_communicator->get_device();
 
-        //TODO
         std::vector<ccl_device::device_ipc_memory_handle> ret;
         ret.reserve(3);
         ret.push_back(owned_device.create_ipc_memory_handle(temp_buffer.get(), get_ctx()));
@@ -181,133 +173,72 @@ class l0_reduce_typed_entry : public base_gpu_entry<kernel_params,
     }
 
 private:
-    ccl_device::device_memory<typename kernel_params::native_type> temp_buffer;
+    ccl_device::device_memory<> temp_buffer;
     ccl_device::device_memory<income_data_flag_gpu_type> income_data_flag;
     ccl_device::device_memory<ready_to_recv_flag_gpu_type> ready_to_recv_flag;
     ccl_device::device_memory<local_barrier_flag_gpu_type> local_barrier_flag;
-    ccl::reduction op_typed_entry;
     ccl_buffer recv_buf_typed_entry;
     int root_typed_entry;
     size_t cnt_entry;
     std::shared_ptr<ccl_context> ctx;
 
 public:
-    bool execute(kernel_main_typed& main_entry_function, kernel_main_typed& right_kernel) {
-        //Check argument binding in kernels for next rank
+    template <class left_kernel_t, class right_kernel_t>
+    bool execute(left_kernel_t& left_kernel, right_kernel_t& right_kernel) {
         bool is_right_kernel_ready =
-            right_kernel.template test_args<typename kernel_main_typed::tmp_recv_buf_arg,
-                                            typename kernel_main_typed::income_data_flag_arg,
-                                            typename kernel_main_typed::ready_to_recv_flag_arg>();
-        if (is_right_kernel_ready) {
-            //TODO do not get arguments sequencially - use array version instead
-            typename kernel_main_typed::tmp_recv_buf_arg::return_t right_tmp_recv_buf_arg =
-                right_kernel.template get_arg<typename kernel_main_typed::tmp_recv_buf_arg>();
-            typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg =
-                right_kernel.template get_arg<typename kernel_main_typed::income_data_flag_arg>();
-            typename kernel_main_typed::ready_to_recv_flag_arg::return_t
-                right_ready_to_recv_flag_arg =
-                    right_kernel
-                        .template get_arg<typename kernel_main_typed::ready_to_recv_flag_arg>();
+            right_kernel
+                .template test_args<typename ring::reduce::tmp_recv_buf_arg<processing_type>,
+                                    typename ring::reduce::income_data_flag_arg,
+                                    typename ring::reduce::ready_to_recv_flag_arg>();
+
+        // Once we're sure the parameters are ready, read them from the right kernel.
+        // Note: we don't just read the parameters, we also reset their 'ready' flag
+        // (since we use a destructive-copying policy), meaning they must be stored
+        // in order to be read again.
+        // This protects against the case of multiple kernel launches
+        // (i.e. the collective is run multiple times), where we might otherwise read
+        // stale values from the previous run.
 
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ", bind elapsed arguments for kernel: ",
-                      kernel_main_typed::name());
-            LOG_TRACE("Args: \n{ ",
-                      right_tmp_recv_buf_arg.first,
-                      ", ",
-                      right_tmp_recv_buf_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_income_data_flag_arg.first,
-                      ", ",
-                      right_income_data_flag_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_ready_to_recv_flag_arg.first,
-                      ", ",
-                      right_ready_to_recv_flag_arg.second,
-                      "}\n");
-
-            //TODO register argument for current device kernel: use array-version
-            main_entry_function
-                .template set_args<typename kernel_main_typed::right_tmp_recv_buf_arg,
-                                   typename kernel_main_typed::right_income_data_flag_arg,
-                                   typename kernel_main_typed::right_ready_to_recv_flag_arg>(
-                    right_tmp_recv_buf_arg.second,
-                    right_income_data_flag_arg.second,
-                    right_ready_to_recv_flag_arg.second);
-            LOG_TRACE("Set right_tmp_recv_buf_arg",
-                      "Set right_income_data_flag_arg",
-                      "Set right_ready_to_recv_flag_arg");
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ". Function: ",
-                      main_entry_function.to_string());
-        }
-        return is_right_kernel_ready;
-    }
-
-    bool execute(kernel_main_typed& main_entry_function, kernel_ipc_typed& right_kernel) {
-        //Check argument binding in kernels for next rank
-        bool is_right_kernel_ready =
-            right_kernel.template test_args<typename kernel_ipc_typed::tmp_recv_buf_arg,
-                                            typename kernel_ipc_typed::income_data_flag_arg,
-                                            typename kernel_ipc_typed::ready_to_recv_flag_arg>();
         if (is_right_kernel_ready) {
-            //TODO do not get arguments sequencially - use array version instead
-            typename kernel_main_typed::tmp_recv_buf_arg::return_t right_tmp_recv_buf_arg =
-                right_kernel.template get_arg<typename kernel_ipc_typed::tmp_recv_buf_arg>();
-            typename kernel_main_typed::income_data_flag_arg::return_t right_income_data_flag_arg =
-                right_kernel.template get_arg<typename kernel_ipc_typed::income_data_flag_arg>();
-            typename kernel_main_typed::ready_to_recv_flag_arg::return_t
-                right_ready_to_recv_flag_arg =
-                    right_kernel
-                        .template get_arg<typename kernel_ipc_typed::ready_to_recv_flag_arg>();
-
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ", bind elapsed arguments for kernel: ",
-                      kernel_main_typed::name());
-            LOG_TRACE("Args: \n{ ",
-                      right_tmp_recv_buf_arg.first,
-                      ", ",
-                      right_tmp_recv_buf_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_income_data_flag_arg.first,
-                      ", ",
-                      right_income_data_flag_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_ready_to_recv_flag_arg.first,
-                      ", ",
-                      right_ready_to_recv_flag_arg.second,
-                      "}\n");
-
-            //TODO register argument for current device kernel: user array version
-            main_entry_function
-                .template set_args<typename kernel_main_typed::right_tmp_recv_buf_arg,
-                                   typename kernel_main_typed::right_income_data_flag_arg,
-                                   typename kernel_main_typed::right_ready_to_recv_flag_arg>(
-                    right_tmp_recv_buf_arg.second,
-                    right_income_data_flag_arg.second,
-                    right_ready_to_recv_flag_arg.second);
-            LOG_TRACE("Set right_tmp_recv_buf_arg",
-                      "Set right_income_data_flag_arg",
-                      "Set right_ready_to_recv_flag_arg");
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ". Function: ",
-                      main_entry_function.to_string());
+            auto right_tmp_recv_buf_arg =
+                right_kernel.template get_arg<typename ring::reduce::tmp_recv_buf_arg<void>>();
+            auto right_income_data_flag_arg =
+                right_kernel.template get_arg<typename ring::reduce::income_data_flag_arg>();
+            auto right_ready_to_recv_flag_arg =
+                right_kernel.template get_arg<typename ring::reduce::ready_to_recv_flag_arg>();
+
+            // ENTRY_LOG_DEBUG("Bind right arguments from ",
+            //                 right_kernel_t::name(),
+            //                 " kernel",
+            //                 " to ",
+            //                 left_kernel_t::name(),
+            //                 " kernel. "
+            //                 "Right arguments:\n{ ",
+            //                 right_tmp_recv_buf_arg.first,
+            //                 ", ",
+            //                 right_tmp_recv_buf_arg.second,
+            //                 "}\n",
+            //                 "{ ",
+            //                 right_income_data_flag_arg.first,
+            //                 ", ",
+            //                 right_income_data_flag_arg.second,
+            //                 "}\n",
+            //                 "{ ",
+            //                 right_ready_to_recv_flag_arg.first,
+            //                 ", ",
+            //                 right_ready_to_recv_flag_arg.second,
+            //                 "}\n");
+
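+            // bind the right-neighbor's buffer and flag pointers into the left (this rank's) kernel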
+            left_kernel.template set_args<typename ring::reduce::right_tmp_recv_buf_arg<void>,
+                                          typename ring::reduce::right_income_data_flag_arg,
+                                          typename ring::reduce::right_ready_to_recv_flag_arg>(
+                right_tmp_recv_buf_arg.second,
+                right_income_data_flag_arg.second,
+                right_ready_to_recv_flag_arg.second);
+
+            ENTRY_LOG_DEBUG("Binding arguments between kernels is complete. ",
+                            "Arguments of the left kernel after binding:\n",
+                            left_kernel.to_string());
         }
         return is_right_kernel_ready;
     }
diff --git a/src/sched/entry/probe_entry.hpp b/src/sched/entry/probe_entry.hpp
index 43a9367e4..3cbc402a1 100644
--- a/src/sched/entry/probe_entry.hpp
+++ b/src/sched/entry/probe_entry.hpp
@@ -17,6 +17,7 @@
 
 #include "sched/entry/entry.hpp"
 #include "sched/sched.hpp"
+#include "sched/queue/queue.hpp"
 
 class probe_entry : public sched_entry {
 public:
diff --git a/src/sched/entry/recv_reduce_entry.hpp b/src/sched/entry/recv_reduce_entry.hpp
index f93f9c1ab..9c36c633b 100644
--- a/src/sched/entry/recv_reduce_entry.hpp
+++ b/src/sched/entry/recv_reduce_entry.hpp
@@ -122,7 +122,8 @@ class recv_reduce_entry final : public sched_entry {
             ccl_buffer reduce_inout_buf =
                 (result_buf_type == ccl_recv_reduce_local_buf) ? inout_buf : comm_buf;
 
-            ccl::status comp_status = ccl_comp_reduce(reduce_in_buf.get_ptr(bytes),
+            ccl::status comp_status = ccl_comp_reduce(sched,
+                                                      reduce_in_buf.get_ptr(bytes),
                                                       in_cnt,
                                                       reduce_inout_buf.get_ptr(bytes),
                                                       out_cnt,
diff --git a/src/sched/entry/reduce_local_entry.hpp b/src/sched/entry/reduce_local_entry.hpp
index 0eb133e35..0a7b58a74 100644
--- a/src/sched/entry/reduce_local_entry.hpp
+++ b/src/sched/entry/reduce_local_entry.hpp
@@ -48,7 +48,8 @@ class reduce_local_entry : public sched_entry {
         size_t bytes = in_cnt * dtype.size();
         size_t offset = inout_buf.get_offset();
         const ccl::fn_context context = { sched->coll_attr.match_id.c_str(), offset };
-        ccl::status comp_status = ccl_comp_reduce(in_buf.get_ptr(bytes),
+        ccl::status comp_status = ccl_comp_reduce(sched,
+                                                  in_buf.get_ptr(bytes),
                                                   in_cnt,
                                                   inout_buf.get_ptr(bytes),
                                                   out_cnt,
diff --git a/src/sched/entry/subsched_entry.hpp b/src/sched/entry/subsched_entry.hpp
index dca48e39b..bcfee7697 100644
--- a/src/sched/entry/subsched_entry.hpp
+++ b/src/sched/entry/subsched_entry.hpp
@@ -41,6 +41,7 @@ class subsched_entry : public sched_entry {
         subsched.reset(new ccl_extra_sched(sched->coll_param, sched->sched_id));
         subsched->coll_param.ctype = ccl_coll_internal;
         subsched->set_op_id(this->op_id);
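+        // the child schedule inherits the parent's flow-credit budget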
+        subsched->flow_control.set_max_credits(sched->flow_control.get_max_credits());
 
         if (sched->coll_param.ctype == ccl_coll_allreduce ||
             sched->coll_param.ctype == ccl_coll_reduce ||
diff --git a/src/sched/master_sched.cpp b/src/sched/master_sched.cpp
index 7cc0dcefa..61afd234f 100644
--- a/src/sched/master_sched.cpp
+++ b/src/sched/master_sched.cpp
@@ -104,9 +104,43 @@ void ccl_master_sched::prepare_partial_scheds() {
 void ccl_master_sched::sync_partial_scheds() {
     CCL_THROW_IF_NOT(!partial_scheds.empty(), "no partial schedules");
 
-    auto sync_obj = std::make_shared<sync_object>(partial_scheds.size());
+    bool add_sync_entry = false;
+
+    /* ensure all partial schedules have the same add_mode */
+    ccl_sched_add_mode add_mode = partial_scheds[0]->get_add_mode();
+    for (auto& sched : partial_scheds) {
+        CCL_THROW_IF_NOT(sched->get_add_mode() == add_mode,
+                         "unexpected add_mode ",
+                         sched->get_add_mode(),
+                         ", expected ",
+                         add_mode);
+    }
+
+    /* check whether all partial schedules already have a sync_entry at the tail */
     for (auto& sched : partial_scheds) {
-        entry_factory::make_entry<sync_entry>(sched.get(), sync_obj);
+        if (sched->entries.empty()) {
+            add_sync_entry = true;
+            break;
+        }
+
+        /* TODO: add enum field into base entry to distinguish different entry types */
+        const char* tail_entry_name = (add_mode == ccl_sched_add_back)
+                                          ? sched->entries.back()->name()
+                                          : sched->entries.front()->name();
+
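+        /* strcmp() != 0 means the tail entry is not a SYNC entry */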
+        if (tail_entry_name && strcmp(tail_entry_name, "SYNC")) {
+            add_sync_entry = true;
+            break;
+        }
+    }
+
+    /* if at least one partial schedule doesn't have a sync entry
+       then sync all partial schedules */
+    if (add_sync_entry) {
+        auto sync_obj = std::make_shared<sync_object>(partial_scheds.size());
+        for (auto& sched : partial_scheds) {
+            entry_factory::make_entry<sync_entry>(sched.get(), sync_obj);
+        }
     }
 }
 
@@ -119,7 +153,7 @@ void ccl_master_sched::dump(std::ostream& out) const {
     ccl_logger::format(out,
                        ", req: ",
                        static_cast<const ccl_request*>(this),
-                       ", worker_sched count: ",
+                       ", partial_scheds size: ",
                        partial_scheds.size());
 
     for (const auto& sched : partial_scheds) {
@@ -209,7 +243,7 @@ ccl_master_sched::ccl_master_sched_ptr ccl_master_sched::create(const ccl_coll_p
 
     if (is_created) {
         sched->set_coll_attr(attr);
-        sched->alloc_buffers_for_sycl_copy();
+        sched->alloc_buffers_for_pre_post_copy();
         LOG_DEBUG("didn't find sched, create new one ",
                   sched,
                   ", type ",
diff --git a/src/sched/queue/flow_control.cpp b/src/sched/queue/flow_control.cpp
new file mode 100644
index 000000000..1c67546b6
--- /dev/null
+++ b/src/sched/queue/flow_control.cpp
@@ -0,0 +1,67 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "common/log/log.hpp"
+#include "sched/queue/flow_control.hpp"
+
+namespace ccl {
+
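+// Credits bound the number of entries in flight: take_credit() is called when an
+// entry is started and return_credit() when it completes. min_credits tracks the
+// low-water mark so the destructor can report the peak number of credits used.
+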
+flow_control::flow_control()
+        : max_credits(CCL_MAX_FLOW_CREDITS),
+          min_credits(CCL_MAX_FLOW_CREDITS),
+          credits(CCL_MAX_FLOW_CREDITS) {}
+
+flow_control::~flow_control() {
+    LOG_DEBUG("max used credits: ", (max_credits - min_credits));
+}
+
+void flow_control::set_max_credits(size_t value) {
+    max_credits = min_credits = credits = value;
+}
+
+size_t flow_control::get_max_credits() const {
+    return max_credits;
+}
+
+size_t flow_control::get_credits() const {
+    return credits;
+}
+
+bool flow_control::take_credit() {
+    if (credits) {
+        credits--;
+        CCL_THROW_IF_NOT(
+            credits < max_credits, "unexpected credits ", credits, ", max_credits ", max_credits);
+        min_credits = std::min(min_credits, credits);
+        return true;
+    }
+    else {
+        LOG_TRACE("no available credits");
+        return false;
+    }
+}
+
+void flow_control::return_credit() {
+    credits++;
+    CCL_THROW_IF_NOT((credits > 0) && (credits <= max_credits) && (credits > min_credits),
+                     "unexpected credits ",
+                     credits,
+                     ", max_credits ",
+                     max_credits,
+                     ", min_credits ",
+                     min_credits);
+}
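+
+// A minimal usage sketch (hypothetical caller, not part of this patch): a worker
+// takes a credit before progressing an entry and returns it once the entry
+// completes.
+//
+//     ccl::flow_control fc;
+//     fc.set_max_credits(64);
+//     if (fc.take_credit()) {
+//         /* ... progress the entry ... */
+//         fc.return_credit();
+//     }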
+
+} // namespace ccl
diff --git a/src/common/comm/l0/modules/kernel_params.hpp b/src/sched/queue/flow_control.hpp
similarity index 59%
rename from src/common/comm/l0/modules/kernel_params.hpp
rename to src/sched/queue/flow_control.hpp
index 43d850d84..2aced09c1 100644
--- a/src/common/comm/l0/modules/kernel_params.hpp
+++ b/src/sched/queue/flow_control.hpp
@@ -14,15 +14,26 @@
  limitations under the License.
 */
 #pragma once
-#include "coll/algorithms/algorithms_enum.hpp"
 
-template <class type>
-struct kernel_params_default {
-    using native_type = type;
-};
+namespace ccl {
+
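+/* default credit budget; a schedule can override it via set_max_credits() */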
+#define CCL_MAX_FLOW_CREDITS 1024
+
+class flow_control {
+public:
+    flow_control();
+    ~flow_control();
 
-template <class native_data_type, ccl_coll_reduction reduction>
-struct kernel_reduction_params_traits : kernel_params_default<native_data_type> {
-    using typename kernel_params_default<native_data_type>::native_type;
-    static constexpr ccl_coll_reduction red_type = reduction;
+    void set_max_credits(size_t value);
+    size_t get_max_credits() const;
+    size_t get_credits() const;
+    bool take_credit();
+    void return_credit();
+
+private:
+    size_t max_credits;
+    size_t min_credits;
+    size_t credits;
 };
+
+} // namespace ccl
diff --git a/src/sched/queue/queue.cpp b/src/sched/queue/queue.cpp
index 1a9a4839d..8654e1470 100644
--- a/src/sched/queue/queue.cpp
+++ b/src/sched/queue/queue.cpp
@@ -77,11 +77,14 @@ ccl_sched_queue::ccl_sched_queue(size_t idx, std::vector<size_t> atl_eps)
 }
 
 ccl_sched_queue::~ccl_sched_queue() {
-    CCL_ASSERT(bins.empty(), "unexpected bins size ", bins.size(), ", expected 0");
+    if (!bins.empty())
+        LOG_WARN("unexpected bins size ", bins.size(), ", expected 0");
 
-    CCL_ASSERT(max_priority == 0, "unexpected max_priority ", max_priority, ", expected 0");
+    if (max_priority != 0)
+        LOG_WARN("unexpected max_priority ", max_priority, ", expected 0");
 
-    CCL_ASSERT(!cached_max_priority_bin);
+    if (cached_max_priority_bin)
+        LOG_WARN("unexpected cached_max_priority_bin");
 }
 
 void ccl_sched_queue::add(ccl_sched* sched) {
diff --git a/src/sched/queue/queue.hpp b/src/sched/queue/queue.hpp
index 010453b48..e25ed9e71 100644
--- a/src/sched/queue/queue.hpp
+++ b/src/sched/queue/queue.hpp
@@ -53,7 +53,7 @@ class ccl_sched_list {
 
     ~ccl_sched_list() {
         if (elems.size() != 0 && !ccl::global_data::get().is_ft_enabled) {
-            LOG_ERROR("unexpected elem_count ", elems.size(), ", expected 0");
+            LOG_WARN("unexpected elem_count ", elems.size(), ", expected 0");
         }
 
         for (size_t i = 0; i < elems.size(); i++) {
diff --git a/src/sched/sched.cpp b/src/sched/sched.cpp
index 71a0880e2..2c6080867 100644
--- a/src/sched/sched.cpp
+++ b/src/sched/sched.cpp
@@ -182,6 +182,10 @@ ccl_request* ccl_sched::start_subsched(ccl_extra_sched* subsched) {
     return subsched->req;
 }
 
+std::vector<ccl::event>& ccl_sched::get_deps() const {
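+    /* deps are stored on the master schedule; the cast assumes req points to the owning ccl_master_sched */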
+    return static_cast<ccl_master_sched*>(req)->coll_param.deps;
+}
+
 void ccl_sched::dump(std::ostream& out) const {
     if (!ccl::global_data::env().sched_dump) {
         return;
@@ -195,6 +199,10 @@ void ccl_sched::dump(std::ostream& out) const {
                        entries.size(),
                        ", priority: ",
                        get_priority(),
+                       ", max_flow_credits: ",
+                       flow_control.get_max_credits(),
+                       ", flow_credits: ",
+                       flow_control.get_credits(),
                        "\n");
 
     std::stringstream msg;
diff --git a/src/sched/sched.hpp b/src/sched/sched.hpp
index d8f700cd9..f390bc6bd 100644
--- a/src/sched/sched.hpp
+++ b/src/sched/sched.hpp
@@ -14,7 +14,9 @@
  limitations under the License.
 */
 #pragma once
+
 #include "sched/sched_base.hpp"
+#include "sched/queue/flow_control.hpp"
 #include "internal_types.hpp"
 
 //todo: sequence diagram
@@ -139,6 +141,8 @@ class alignas(CACHELINE_SIZE) ccl_sched : public ccl_sched_base {
 
     ccl_request* start_subsched(ccl_extra_sched* subsched);
 
+    std::vector<ccl::event>& get_deps() const;
+
     ccl_sched_bin* bin = nullptr; /* valid only during execution */
     ccl_sched_queue* queue = nullptr; /* cached pointer to queue, valid even after execution */
     size_t start_idx = 0; /* index to start */
@@ -159,6 +163,12 @@ class alignas(CACHELINE_SIZE) ccl_sched : public ccl_sched_base {
     /* currently applicable for start phase only */
     bool strict_order;
 
+    /*
+      limits the number of active entries;
+      mostly makes sense for ATL entries
+    */
+    ccl::flow_control flow_control;
+
     void set_finalize_fn(ccl_sched_finalize_fn_t fn, void* ctx) {
         finalize_fn = fn;
         finalize_fn_ctx = ctx;
diff --git a/src/sched/sched_base.cpp b/src/sched/sched_base.cpp
index 5ca552cf8..6803ea257 100644
--- a/src/sched/sched_base.cpp
+++ b/src/sched/sched_base.cpp
@@ -13,8 +13,12 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "sched/sched_base.hpp"
+#include <numeric>
+
+#include "coll/algorithms/algorithms_enum.hpp"
+#include "coll/coll_param.hpp"
 #include "common/global/global.hpp"
+#include "sched/sched_base.hpp"
 
 std::string to_string(ccl_sched_add_mode mode) {
     switch (mode) {
@@ -32,14 +36,19 @@ void ccl_sched_base::set_coll_attr(const ccl_coll_attr& attr) {
 void ccl_sched_base::update_coll_param_and_attr(const ccl_coll_param& param,
                                                 const ccl_coll_attr& attr) {
 #ifdef CCL_ENABLE_SYCL
+    copy_deps(param.deps, coll_param.deps);
     if (param.stream && param.stream->is_sycl_device_stream()) {
-        coll_param.sycl_buf = static_cast<ccl_sycl_buffer_t*>(param.buf);
-        coll_param.sycl_send_buf = static_cast<ccl_sycl_buffer_t*>((void*)param.send_buf);
-        coll_param.sycl_recv_buf = static_cast<ccl_sycl_buffer_t*>(param.recv_buf);
+        /* update device buffers only if they are already non-null,
+           i.e. they were set on a previous call */
+        if (coll_param.device_send_buf) {
+            coll_param.device_send_buf = static_cast<ccl_sycl_buffer_t*>((void*)param.send_buf);
+        }
+        if (coll_param.device_recv_buf) {
+            coll_param.device_recv_buf = static_cast<ccl_sycl_buffer_t*>(param.recv_buf);
+        }
     }
     else {
 #endif /* CCL_ENABLE_SYCL */
-        coll_param.buf = param.buf;
         coll_param.send_buf = param.send_buf;
         coll_param.recv_buf = param.recv_buf;
 #ifdef CCL_ENABLE_SYCL
@@ -106,7 +115,7 @@ ccl_buffer ccl_sched_base::alloc_buffer(size_t bytes) {
     CCL_THROW_IF_NOT(bytes > 0, "incorrect buffer size: ", bytes);
 
     ccl_buffer buffer =
-        ccl_buffer(CCL_CALLOC(bytes, "sched_buffer"), bytes, 0, ccl_buffer_type::DIRECT);
+        ccl_buffer(CCL_MALLOC(bytes, "sched_buffer"), bytes, 0, ccl_buffer_type::DIRECT);
     memory.buf_list.emplace_back(buffer, bytes);
     CCL_THROW_IF_NOT(buffer.get_ptr(), "null ptr");
 
@@ -252,81 +261,72 @@ void ccl_sched_base::add_memory_region(atl_mr_t* mr) {
     memory.mr_list.emplace_back(mr);
 }
 
-void ccl_sched_base::alloc_buffers_for_sycl_copy() {
+void ccl_sched_base::alloc_buffers_for_pre_post_copy() {
 #ifdef CCL_ENABLE_SYCL
-
     ccl_coll_param& param = coll_param;
+    param.device_send_buf = param.device_recv_buf = nullptr;
+
     if (!param.stream || (!param.stream->is_sycl_device_stream()))
         return;
 
-    LOG_DEBUG("alloc tmp buffers for D2H and H2D copies, coll_type ",
-              ccl_coll_type_to_str(param.ctype),
-              ", dtype_size ",
-              param.dtype.size(),
-              ", comm_size ",
-              param.comm->size(),
-              ", count ",
-              param.count);
+    // check both recv and send buffers; for some algorithms (e.g. alltoallv) one of them is allowed to
+    // be invalid (i.e. have unknown pointer type) as long as the corresponding count is 0, so we won't dereference it.
+    // TODO: should we add a special handling for case when both buffers are invalid?
+    auto send_ptr_type = sycl::get_pointer_type((void*)param.send_buf,
+                                                param.stream->get_native_stream().get_context());
+    auto recv_ptr_type =
+        sycl::get_pointer_type(param.recv_buf, param.stream->get_native_stream().get_context());
+
+    // TODO: we currently don't correctly handle cases where there are two different types at the same time,
+    // e.g. device memory for the send buffer and shared memory for the recv buffer
+    bool should_alloc_buffers = true;
+    if ((send_ptr_type == sycl::usm::alloc::shared || recv_ptr_type == sycl::usm::alloc::shared) ||
+        ((send_ptr_type == sycl::usm::alloc::device || recv_ptr_type == sycl::usm::alloc::device) &&
+         atl_wrapper::attr.out.enable_device_buf)) {
+        should_alloc_buffers = false;
+    }
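+    /* shared USM is directly accessible from the host, and device USM can be
+       consumed directly when the transport reports enable_device_buf, so no
+       host staging buffers are needed in these cases */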
+
+    if (!should_alloc_buffers) {
+        return;
+    }
 
-    size_t idx, send_count = 0, recv_count = 0;
+    param.device_send_buf = static_cast<ccl_sycl_buffer_t*>((void*)param.send_buf);
+    param.device_recv_buf = static_cast<ccl_sycl_buffer_t*>(param.recv_buf);
 
+    param.send_buf = param.recv_buf = nullptr;
+
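+    /* number of elements to stage on the host for send/recv, per collective type */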
+    size_t send_alloc_count = 0, recv_alloc_count = 0;
     switch (param.ctype) {
         case ccl_coll_allgatherv:
-            param.sycl_send_buf = static_cast<ccl_sycl_buffer_t*>((void*)param.send_buf);
-            param.sycl_recv_buf = static_cast<ccl_sycl_buffer_t*>(param.recv_buf);
-            param.send_buf = alloc_staging_buffer(param.send_count * param.dtype.size()).get_ptr();
-            for (idx = 0; idx < param.comm->size(); idx++)
-                recv_count += param.recv_counts[idx];
-            param.recv_buf = alloc_staging_buffer(recv_count * param.dtype.size()).get_ptr();
+            send_alloc_count = param.send_count;
+            recv_alloc_count =
+                std::accumulate(param.recv_counts, param.recv_counts + param.comm->size(), size_t(0));
             break;
         case ccl_coll_allreduce:
-            param.sycl_send_buf = static_cast<ccl_sycl_buffer_t*>((void*)param.send_buf);
-            param.sycl_recv_buf = static_cast<ccl_sycl_buffer_t*>(param.recv_buf);
-            param.send_buf = alloc_staging_buffer(param.count * param.dtype.size()).get_ptr();
-            param.recv_buf = alloc_staging_buffer(param.count * param.dtype.size()).get_ptr();
+            /* use in-place to avoid allocating an extra staging buffer */
+            send_alloc_count = 0;
+            recv_alloc_count = param.count;
             break;
         case ccl_coll_alltoall:
-            param.sycl_send_buf = static_cast<ccl_sycl_buffer_t*>((void*)param.send_buf);
-            param.sycl_recv_buf = static_cast<ccl_sycl_buffer_t*>(param.recv_buf);
-            param.send_buf =
-                alloc_staging_buffer(param.count * param.dtype.size() * param.comm->size())
-                    .get_ptr();
-            param.recv_buf =
-                alloc_staging_buffer(param.count * param.dtype.size() * param.comm->size())
-                    .get_ptr();
+            send_alloc_count = recv_alloc_count = param.count * param.comm->size();
             break;
         case ccl_coll_alltoallv:
-            param.sycl_send_buf = static_cast<ccl_sycl_buffer_t*>((void*)param.send_buf);
-            param.sycl_recv_buf = static_cast<ccl_sycl_buffer_t*>(param.recv_buf);
-            for (idx = 0; idx < param.comm->size(); idx++) {
-                send_count += param.send_counts[idx];
-                recv_count += param.recv_counts[idx];
-            }
-            param.send_buf = alloc_staging_buffer(send_count * param.dtype.size()).get_ptr();
-            param.recv_buf = alloc_staging_buffer(recv_count * param.dtype.size()).get_ptr();
+            send_alloc_count =
+                std::accumulate(param.send_counts, param.send_counts + param.comm->size(), size_t(0));
+            recv_alloc_count =
+                std::accumulate(param.recv_counts, param.recv_counts + param.comm->size(), size_t(0));
             break;
         case ccl_coll_bcast:
-            param.sycl_buf = static_cast<ccl_sycl_buffer_t*>(param.buf);
-            param.buf = alloc_staging_buffer(param.count * param.dtype.size()).get_ptr();
+            send_alloc_count = 0;
+            recv_alloc_count = param.count;
             break;
         case ccl_coll_reduce:
-            param.sycl_send_buf = static_cast<ccl_sycl_buffer_t*>((void*)(param.send_buf));
-            param.send_buf = alloc_staging_buffer(param.count * param.dtype.size()).get_ptr();
-            if (param.comm->rank() == param.root) {
-                param.sycl_recv_buf = static_cast<ccl_sycl_buffer_t*>(param.recv_buf);
-                param.recv_buf = alloc_staging_buffer(param.count * param.dtype.size()).get_ptr();
-            }
-            else {
-                param.recv_buf = nullptr;
-            }
+            send_alloc_count = param.count;
+            recv_alloc_count = (param.comm->rank() == param.root) ? param.count : 0;
             break;
         case ccl_coll_reduce_scatter:
-            param.sycl_send_buf = static_cast<ccl_sycl_buffer_t*>((void*)param.send_buf);
-            param.sycl_recv_buf = static_cast<ccl_sycl_buffer_t*>(param.recv_buf);
-            param.send_buf =
-                alloc_staging_buffer(param.count * param.comm->size() * param.dtype.size())
-                    .get_ptr();
-            param.recv_buf = alloc_staging_buffer(param.count * param.dtype.size()).get_ptr();
+            send_alloc_count = param.count * param.comm->size();
+            recv_alloc_count = param.count;
             break;
         case ccl_coll_sparse_allreduce:
             CCL_FATAL("SYCL stream is not supported for sparse_allreduce yet");
@@ -334,6 +334,27 @@ void ccl_sched_base::alloc_buffers_for_sycl_copy() {
             break;
         default: break;
     }
+
+    LOG_DEBUG("alloc tmp buffers for D2H and H2D copies, coll_type ",
+              ccl_coll_type_to_str(param.ctype),
+              ", dtype_size ",
+              param.dtype.size(),
+              ", comm_size ",
+              param.comm->size(),
+              ", count ",
+              param.count);
+
+    if (send_alloc_count) {
+        param.send_buf = alloc_staging_buffer(send_alloc_count * param.dtype.size()).get_ptr();
+    }
+
+    if (recv_alloc_count) {
+        param.recv_buf = alloc_staging_buffer(recv_alloc_count * param.dtype.size()).get_ptr();
+
+        if (param.ctype == ccl_coll_allreduce || param.ctype == ccl_coll_bcast) {
+            param.send_buf = param.recv_buf;
+        }
+    }
 #endif /* CCL_ENABLE_SYCL */
 }
 
diff --git a/src/sched/sched_base.hpp b/src/sched/sched_base.hpp
index 43af2ec8d..80edcb737 100644
--- a/src/sched/sched_base.hpp
+++ b/src/sched/sched_base.hpp
@@ -102,12 +102,16 @@ struct ccl_sched_base {
 
     void add_memory_region(atl_mr_t* mr);
 
-    void alloc_buffers_for_sycl_copy();
+    void alloc_buffers_for_pre_post_copy();
 
     void set_entry_exec_mode(ccl_sched_entry_exec_mode mode) {
         exec_mode = mode;
     }
 
+    ccl_sched_add_mode get_add_mode() {
+        return add_mode;
+    }
+
     void set_add_mode(ccl_sched_add_mode mode) {
         add_mode = mode;
     }
@@ -129,6 +133,10 @@ struct ccl_sched_base {
 protected:
     ~ccl_sched_base() = default;
 
+    ccl_sched_base() {
+        CCL_THROW("unsupported");
+    }
+
     ccl_sched_base(const ccl_coll_param& coll_param) : coll_param(coll_param) {}
 
     void update_id();
diff --git a/src/unordered_coll/unordered_coll.cpp b/src/unordered_coll/unordered_coll.cpp
index d225f3aba..ade72119e 100644
--- a/src/unordered_coll/unordered_coll.cpp
+++ b/src/unordered_coll/unordered_coll.cpp
@@ -197,7 +197,7 @@ void ccl_unordered_coll_manager::start_coordination(const std::string& match_id)
 
     ccl_coll_entry_param match_id_size_param{};
     match_id_size_param.ctype = ccl_coll_bcast;
-    match_id_size_param.buf = ccl_buffer(&ctx->match_id_size, sizeof(size_t));
+    match_id_size_param.recv_buf = ccl_buffer(&ctx->match_id_size, sizeof(size_t));
     match_id_size_param.count = sizeof(size_t);
     match_id_size_param.dtype = ccl_datatype_int8;
     match_id_size_param.root = CCL_UNORDERED_COLL_COORDINATOR;
@@ -209,14 +209,14 @@ void ccl_unordered_coll_manager::start_coordination(const std::string& match_id)
     /* 2. broadcast match_id_value */
     ccl_coll_entry_param match_id_val_param{};
     match_id_val_param.ctype = ccl_coll_bcast;
-    match_id_val_param.buf = ccl_buffer();
+    match_id_val_param.recv_buf = ccl_buffer();
     match_id_val_param.count = 0;
     match_id_val_param.dtype = ccl_datatype_int8;
     match_id_val_param.root = CCL_UNORDERED_COLL_COORDINATOR;
     match_id_val_param.comm = coll_param.comm;
     auto entry = entry_factory::make_entry<coll_entry>(service_sched.get(), match_id_val_param);
 
-    entry->set_field_fn<ccl_sched_entry_field_buf>(
+    entry->set_field_fn<ccl_sched_entry_field_recv_buf>(
         [](const void* fn_ctx, void* field_ptr) {
             auto ctx = static_cast<ccl_unordered_coll_ctx*>(const_cast<void*>(fn_ctx));
             if (ctx->service_sched->coll_param.comm->rank() != CCL_UNORDERED_COLL_COORDINATOR) {
@@ -244,7 +244,7 @@ void ccl_unordered_coll_manager::start_coordination(const std::string& match_id)
     /* 3. broadcast reserved comm_id */
     ccl_coll_entry_param reserved_comm_id_param{};
     reserved_comm_id_param.ctype = ccl_coll_bcast;
-    reserved_comm_id_param.buf = ccl_buffer(&ctx->reserved_comm_id, sizeof(ccl_comm_id_t));
+    reserved_comm_id_param.recv_buf = ccl_buffer(&ctx->reserved_comm_id, sizeof(ccl_comm_id_t));
     reserved_comm_id_param.count = sizeof(ccl_comm_id_t);
     reserved_comm_id_param.dtype = ccl_datatype_int8;
     reserved_comm_id_param.root = CCL_UNORDERED_COLL_COORDINATOR;
diff --git a/tests/functional/CMakeLists.txt b/tests/functional/CMakeLists.txt
index e975583b8..fb6416133 100644
--- a/tests/functional/CMakeLists.txt
+++ b/tests/functional/CMakeLists.txt
@@ -14,23 +14,38 @@
 # limitations under the License.
 #
 cmake_minimum_required (VERSION 2.8)
+
+if (POLICY CMP0048)
+    cmake_policy(SET CMP0048 OLD)
+endif (POLICY CMP0048)
+
 file(GLOB sources "*_test.c" "*_test.cpp")
 
-if (NOT DEFINED LP_ENV_DEFINED)
-    include(${PROJECT_SOURCE_DIR}/../../cmake/helpers.cmake)
+set(PROJECT_NAME "oneCCL functional tests")
+project(${PROJECT_NAME})
+
+message(STATUS "FT CMAKE_PROJECT_NAME: ${CMAKE_PROJECT_NAME}")
+message(STATUS "FT PROJECT_NAME: ${PROJECT_NAME}")
+
+#set default build type
+if (NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Release")
+endif()
+
+# standalone build
+if (${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME})
+    set(COMMON_CMAKE_DIR ${PROJECT_SOURCE_DIR}/../../cmake)
+    include(${COMMON_CMAKE_DIR}/helpers.cmake)
     set_lp_env()
+    if (COMPUTE_BACKEND)
+        set_compute_backend(${COMMON_CMAKE_DIR})
+    endif()
 endif()
 
 set(SERVICE_SRC
     conf.cpp
-    lp.cpp)
-
-if (DEFINED ENV{CCL_CONFIGURATION})
-    set(CCL_CONFIGURATION "$ENV{CCL_CONFIGURATION}")
-    if(${CCL_CONFIGURATION} STREQUAL "cpu_gpu_dpcpp")
-        set(COMPUTE_BACKEND "dpcpp_level_zero")
-    endif()
-endif()
+    lp.cpp
+    transport.cpp)
 
 if (DEFINED ENV{CCL_ROOT})
     set(CCL_ROOT "$ENV{CCL_ROOT}")
@@ -38,30 +53,31 @@ endif()
 
 set(CCL_INSTALL_TESTS "$ENV{PWD}")
 enable_testing()
-ADD_SUBDIRECTORY (googletest-release-1.8.1/googletest/)
+
+set(GTEST_DIR ${PROJECT_SOURCE_DIR}/../googletest-release-1.8.1/googletest)
+add_subdirectory(${GTEST_DIR} gtest_build)
+set(EXAMPLES_DIR ${PROJECT_SOURCE_DIR}/../../examples)
 
 set(INC_DIRS
-     ${PROJECT_SOURCE_DIR}/tests/functional/googletest-release-1.8.1/googletest/include
-     ${PROJECT_SOURCE_DIR}/tests/functional/googletest-release-1.8.1/googletest/src
-     ${PROJECT_SOURCE_DIR}/include)
+    ${GTEST_DIR}/include
+    ${GTEST_DIR}/src
+    ${EXAMPLES_DIR}/include)
 
 include_directories(${INC_DIRS})
 
-message(STATUS "CCL_ROOT: ${CCL_ROOT}")
-message(STATUS "CCL_CONFIGURATION: ${CCL_CONFIGURATION}")
-message(STATUS "tests/functional LIBFABRIC_LIB_DIR: ${LIBFABRIC_LIB_DIR}")
-message(STATUS "tests/functional LIBFABRIC_INCLUDE_DIR: ${LIBFABRIC_INCLUDE_DIR}")
-message(STATUS "INC_DIRS: ${INC_DIRS}")
-
-#include_directories(${CCL_ROOT}/include/${CCL_CONFIGURATION})
-#link_directories(${CCL_ROOT}/lib/${CCL_CONFIGURATION})
+message(STATUS "FT build type: ${CMAKE_BUILD_TYPE}")
+message(STATUS "FT CCL_ROOT: ${CCL_ROOT}")
+message(STATUS "FT INC_DIRS: ${INC_DIRS}")
+message(STATUS "FT COMPUTE_BACKEND: ${COMPUTE_BACKEND}")
 
 if (${CMAKE_VERSION} VERSION_LESS 3.1)
 #cmake versions below 3.1 do not support CMAKE_C[XX]_STANDARD flags
 #set manually
+# TODO: Need to handle c++17 option for older cmake
     set(CXX_COMPILER_FLAGS "-std=gnu++11")
     set(C_COMPILER_FLAGS "-std=gnu99")
 endif()
+
 #common release/debug compilation settings
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_COMPILER_FLAGS} -Wall -Werror -D_GNU_SOURCE -fvisibility=internal")
 set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} ${C_COMPILER_FLAGS} -O0 -g -DENABLE_DEBUG")
@@ -74,11 +90,10 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_COMPILER_FLAGS} -Wall -Werror -D_G
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${CXX_COMPILER_FLAGS} -O0 -g -DENABLE_DEBUG")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${CXX_COMPILER_FLAGS} -O3")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} ${CXX_COMPILER_FLAGS} -O2 -g")
-set(CMAKE_CXX_STANDARD 11)
+# C++ standard version is set by set_compute_backend/activate_compute_backend, so no need to set it here
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
 if (COMPUTE_BACKEND)
-    activate_compute_backend("${CCL_ROOT}/lib;${PROJECT_SOURCE_DIR}/cmake" ${COMPUTE_BACKEND})
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMPUTE_BACKEND_FLAGS}")
     set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${COMPUTE_BACKEND_LIBRARIES}")
 endif()
@@ -98,7 +113,11 @@ foreach(src ${sources})
     target_link_libraries(${executable} PUBLIC rt)
     target_link_libraries(${executable} PUBLIC m)
     target_link_libraries(${executable} PUBLIC dl)
-    target_link_libraries(${executable} PRIVATE m)
+    # workaround for ATS with two MPI libraries; should be fixed
+    if (DEFINED ENV{I_MPI_ROOT})
+        set(I_MPI_ROOT "$ENV{I_MPI_ROOT}")
+    endif()
+    target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release_mt/)
     target_link_libraries(${executable} PUBLIC mpi)
     install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_TESTS} OPTIONAL)
     add_test (NAME ${executable} CONFIGURATIONS default COMMAND mpiexec.hydra -l -n 2 -ppn 1 ${CCL_INSTALL_TESTS}/${executable} --gtest_output=xml:${CCL_INSTALL_TESTS}/${executable}_default_report.junit.xml)
@@ -145,3 +164,8 @@ endforeach()
 foreach(algo direct; ring)
 add_test (NAME reduce_scatter_${algo} CONFIGURATIONS reduce_scatter_${algo} COMMAND mpiexec.hydra -l -n 2 -ppn 1 ${CCL_INSTALL_TESTS}/reduce_scatter_test --gtest_output=xml:${CCL_INSTALL_TESTS}/reduce_scatter_${algo}_report.junit.xml)
 endforeach()
+
+if (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
+    # right now all regression tests require dpcpp; this might change in the future
+    add_subdirectory(regression)
+endif()
diff --git a/tests/functional/allgatherv_test.cpp b/tests/functional/allgatherv_test.cpp
index f4fccd792..25020ef43 100644
--- a/tests/functional/allgatherv_test.cpp
+++ b/tests/functional/allgatherv_test.cpp
@@ -15,7 +15,7 @@
 */
 #define ALGO_SELECTION_ENV "CCL_ALLGATHERV"
 
-#include "base_impl.hpp"
+#include "test_impl.hpp"
 
 template <typename T>
 class allgatherv_test : public base_test<T> {
@@ -50,15 +50,12 @@ class allgatherv_test : public base_test<T> {
             offsets[rank] = recv_counts[rank - 1] + offsets[rank - 1];
         }
 
-        if (op.param.place_type == PLACE_OUT) {
-            size_t total_recv_count = std::accumulate(recv_counts.begin(), recv_counts.end(), 0);
-            for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
-                op.recv_bufs[buf_idx].resize(total_recv_count);
-                if (is_lp_datatype(op.param.datatype)) {
-                    op.recv_bufs_lp[buf_idx].resize(total_recv_count);
-                }
-            }
-        }
+        // if (op.param.place_type == PLACE_OUT) {
+        //     size_t total_recv_count = std::accumulate(recv_counts.begin(), recv_counts.end(), 0);
+        //     for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
+        //         op.recv_bufs[buf_idx].resize(total_recv_count);
+        //     }
+        // }
     }
 
     void fill_send_buffers(test_operation<T>& op) {
@@ -99,7 +96,8 @@ class allgatherv_test : public base_test<T> {
                                 recv_buf,
                                 recv_counts,
                                 op.datatype,
-                                global_data::instance().comms[0],
+                                transport_data::instance().get_comm(),
+                                transport_data::instance().get_stream(),
                                 attr));
         }
     }
diff --git a/tests/functional/allreduce_test.cpp b/tests/functional/allreduce_test.cpp
index 200568c5c..146a54458 100644
--- a/tests/functional/allreduce_test.cpp
+++ b/tests/functional/allreduce_test.cpp
@@ -15,7 +15,7 @@
 */
 #define ALGO_SELECTION_ENV "CCL_ALLREDUCE"
 
-#include "base_impl.hpp"
+#include "test_impl.hpp"
 
 template <typename T>
 class allreduce_test : public base_test<T> {
@@ -49,7 +49,8 @@ class allreduce_test : public base_test<T> {
                                                op.elem_count,
                                                op.datatype,
                                                op.reduction,
-                                               global_data::instance().comms[0],
+                                               transport_data::instance().get_comm(),
+                                               transport_data::instance().get_stream(),
                                                attr));
         }
     }
diff --git a/tests/functional/alltoall_test.cpp b/tests/functional/alltoall_test.cpp
index 5f9bd2cfe..ed7c860af 100644
--- a/tests/functional/alltoall_test.cpp
+++ b/tests/functional/alltoall_test.cpp
@@ -15,7 +15,7 @@
 */
 #define ALGO_SELECTION_ENV "CCL_ALLTOALL"
 
-#include "base_impl.hpp"
+#include "test_impl.hpp"
 
 template <typename T>
 class alltoall_test : public base_test<T> {
@@ -59,7 +59,8 @@ class alltoall_test : public base_test<T> {
                                               recv_buf,
                                               op.elem_count,
                                               op.datatype,
-                                              global_data::instance().comms[0],
+                                              transport_data::instance().get_comm(),
+                                              transport_data::instance().get_stream(),
                                               attr));
         }
     }
diff --git a/tests/functional/alltoallv_test.cpp b/tests/functional/alltoallv_test.cpp
index 4b4c2ca02..0d4565865 100644
--- a/tests/functional/alltoallv_test.cpp
+++ b/tests/functional/alltoallv_test.cpp
@@ -15,7 +15,7 @@
 */
 #define ALGO_SELECTION_ENV "CCL_ALLTOALLV"
 
-#include "base_impl.hpp"
+#include "test_impl.hpp"
 
 template <typename T>
 class alltoallv_test : public base_test<T> {
@@ -94,7 +94,8 @@ class alltoallv_test : public base_test<T> {
                                                recv_buf,
                                                recv_counts,
                                                op.datatype,
-                                               global_data::instance().comms[0],
+                                               transport_data::instance().get_comm(),
+                                               transport_data::instance().get_stream(),
                                                attr));
         }
     }
diff --git a/tests/functional/bcast_test.cpp b/tests/functional/bcast_test.cpp
index 0b7e3eb0b..8472d7f96 100644
--- a/tests/functional/bcast_test.cpp
+++ b/tests/functional/bcast_test.cpp
@@ -16,7 +16,7 @@
 #define ALGO_SELECTION_ENV "CCL_BCAST"
 #define BCAST_VALUE_COEFF  128
 
-#include "base_impl.hpp"
+#include "test_impl.hpp"
 
 template <typename T>
 class bcast_test : public base_test<T> {
@@ -52,7 +52,8 @@ class bcast_test : public base_test<T> {
                                                op.elem_count,
                                                op.datatype,
                                                ROOT_RANK,
-                                               global_data::instance().comms[0],
+                                               transport_data::instance().get_comm(),
+                                               transport_data::instance().get_stream(),
                                                attr));
         }
     }
diff --git a/tests/functional/conf.cpp b/tests/functional/conf.cpp
index 5199d4f31..cee7b2ae3 100644
--- a/tests/functional/conf.cpp
+++ b/tests/functional/conf.cpp
@@ -232,6 +232,12 @@ void init_test_dims() {
 void init_test_params() {
     init_test_dims();
 
+#ifdef CCL_ENABLE_SYCL
+    printf("FUNC_TESTS: CCL_ENABLE_SYCL ON\n");
+#endif
+    printf("FUNC_TESTS: BF16 enabled %d\n", is_bf16_enabled());
+    printf("FUNC_TESTS: FP16 enabled %d\n", is_fp16_enabled());
+
     for (auto data_type = first_data_type; data_type < last_data_type; data_type++) {
         if (should_skip_datatype(data_type))
             continue;
@@ -303,10 +309,9 @@ std::ostream& operator<<(std::ostream& stream, const test_param& param) {
 }
 
 void print_err_message(char* message, std::ostream& output) {
-    ccl::communicator& comm = global_data::instance().comms[0];
+    auto& comm = transport_data::instance().get_service_comm();
     int comm_size = comm.size();
     int comm_rank = comm.rank();
-
     size_t message_len = strlen(message);
     std::vector<size_t> message_lens(comm_size, 0);
     std::vector<size_t> recv_counts(comm_size, 1);
@@ -326,11 +331,3 @@ void print_err_message(char* message, std::ostream& output) {
         output << messages.data();
     }
 }
-
-void mpi_finalize() {
-    int is_finalized = 0;
-    MPI_Finalized(&is_finalized);
-
-    if (!is_finalized)
-        MPI_Finalize();
-}
diff --git a/tests/functional/conf.hpp b/tests/functional/conf.hpp
index 7e04a7999..1d7435fdd 100644
--- a/tests/functional/conf.hpp
+++ b/tests/functional/conf.hpp
@@ -175,4 +175,3 @@ ccl::reduction get_ccl_reduction(const test_param& param);
 void init_test_dims();
 void init_test_params();
 void print_err_message(char* err_message, std::ostream& output);
-void mpi_finalize();
diff --git a/tests/functional/lp.cpp b/tests/functional/lp.cpp
index 1d4b482b9..8e7b28e6e 100644
--- a/tests/functional/lp.cpp
+++ b/tests/functional/lp.cpp
@@ -30,7 +30,6 @@ int is_fp16_enabled() {
                              : "a"(1));
         is_fp16_enabled = (reg[2] & (1 << 29)) >> 29;
     }
-    printf("FUNC_TESTS: FP16 compiler, is_fp16_enabled %d\n", is_fp16_enabled);
     return is_fp16_enabled;
 #else
     printf("FUNC_TESTS: no FP16 compiler\n");
@@ -50,7 +49,6 @@ int is_bf16_enabled() {
         is_bf16_enabled = ((reg[1] & (1 << 16)) >> 16) & ((reg[1] & (1 << 30)) >> 30) &
                           ((reg[1] & (1 << 31)) >> 31);
     }
-    printf("FUNC_TESTS: BF16 compiler, is_bf16_enabled %d\n", is_bf16_enabled);
     return is_bf16_enabled;
 #else
     printf("FUNC_TESTS: no BF16 compiler\n");
@@ -97,7 +95,7 @@ void convert_fp16_to_fp32(const void* src, void* dst) {
 void convert_fp32_to_bf16(const void* src, void* dst) {
 #ifdef CCL_BF16_AVX512BF_COMPILER
     if (is_avx512bf_enabled())
-        _mm256_storeu_si256((__m256i*)(dst), _mm512_cvtneps_pbh(_mm512_loadu_ps(src)));
+        _mm256_storeu_si256((__m256i*)(dst), (__m256i)_mm512_cvtneps_pbh(_mm512_loadu_ps(src)));
     else
 #endif
         _mm256_storeu_si256((__m256i*)(dst),
diff --git a/tests/functional/lp.hpp b/tests/functional/lp.hpp
index a34a6d43b..9396d704e 100644
--- a/tests/functional/lp.hpp
+++ b/tests/functional/lp.hpp
@@ -20,8 +20,8 @@
 #endif
 #include <math.h>
 
-#include "base.hpp"
 #include "conf.hpp"
+#include "test.hpp"
 
 template <typename T>
 struct test_operation;
@@ -40,13 +40,13 @@ int is_bf16_enabled();
 int is_avx512bf_enabled();
 
 #ifdef CCL_FP16_TARGET_ATTRIBUTES
-void convert_fp32_to_fp16(const void* src, void* dst) __attribute__((target("f16c,avx512f")));
+void convert_fp32_to_fp16(const void* src, void* dst) __attribute__((target("f16c")));
 #else
 void convert_fp32_to_fp16(const void* src, void* dst);
 #endif
 
 #ifdef CCL_FP16_TARGET_ATTRIBUTES
-void convert_fp16_to_fp32(const void* src, void* dst) __attribute__((target("f16c,avx512f")));
+void convert_fp16_to_fp32(const void* src, void* dst) __attribute__((target("f16c")));
 #else
 void convert_fp16_to_fp32(const void* src, void* dst);
 #endif
diff --git a/tests/functional/lp_impl.hpp b/tests/functional/lp_impl.hpp
index 0d4c2dd76..d7b6f5173 100644
--- a/tests/functional/lp_impl.hpp
+++ b/tests/functional/lp_impl.hpp
@@ -18,7 +18,7 @@
 template <typename T>
 void convert_fp32_to_lp_arrays(T* buf, short* lp_buf, size_t count, ccl_data_type dtype) {
     size_t floats_in_reg = (dtype == DATATYPE_BFLOAT16) ? FLOATS_IN_M512 : FLOATS_IN_M256;
-    short tail[FLOATS_IN_M512] = { 0 };
+    short tail[floats_in_reg];
 
     for (size_t i = 0; i < count; i += floats_in_reg) {
         if (i / floats_in_reg == count / floats_in_reg) {
@@ -36,7 +36,7 @@ void convert_fp32_to_lp_arrays(T* buf, short* lp_buf, size_t count, ccl_data_typ
 template <typename T>
 void convert_lp_to_fp32_arrays(short* lp_buf, T* buf, size_t count, ccl_data_type dtype) {
     size_t floats_in_reg = (dtype == DATATYPE_BFLOAT16) ? FLOATS_IN_M512 : FLOATS_IN_M256;
-    T tail[FLOATS_IN_M512] = { 0 };
+    T tail[floats_in_reg];
 
     for (size_t i = 0; i < count; i += floats_in_reg) {
         if (i / floats_in_reg == count / floats_in_reg) {
@@ -55,16 +55,9 @@ template <typename T>
 void make_lp_prologue(test_operation<T>& op, size_t count) {
     ccl_data_type dtype = op.param.datatype;
     for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
-        if (op.param.place_type == PLACE_IN) {
-            T* recv_buf_fp32 = op.recv_bufs[buf_idx].data();
-            short* recv_bufs_lp = op.recv_bufs_lp[buf_idx].data();
-            convert_fp32_to_lp_arrays(recv_buf_fp32, recv_bufs_lp, count, dtype);
-        }
-        else {
-            T* send_buf_fp32 = op.send_bufs[buf_idx].data();
-            short* send_bufs_lp = op.send_bufs_lp[buf_idx].data();
-            convert_fp32_to_lp_arrays(send_buf_fp32, send_bufs_lp, count, dtype);
-        }
+        T* buf = (op.param.place_type == PLACE_IN) ? op.recv_bufs[buf_idx].data()
+                                                   : op.send_bufs[buf_idx].data();
+        convert_fp32_to_lp_arrays(buf, (short*)buf, count, dtype);
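+        // the conversion above narrows fp32 values to 16-bit in place, reusing the same buffer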
     }
 }
 
@@ -72,8 +65,7 @@ template <typename T>
 void make_lp_epilogue(test_operation<T>& op, size_t count) {
     ccl_data_type dtype = op.param.datatype;
     for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
-        T* recv_buf_fp32 = op.recv_bufs[buf_idx].data();
-        short* recv_bufs_lp = op.recv_bufs_lp[buf_idx].data();
-        convert_lp_to_fp32_arrays(recv_bufs_lp, recv_buf_fp32, count, dtype);
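+        // convert via a temporary copy since the 16-bit source and fp32 destination overlap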
+        std::vector<T> tmp(op.recv_bufs[buf_idx]);
+        convert_lp_to_fp32_arrays((short*)tmp.data(), op.recv_bufs[buf_idx].data(), count, dtype);
     }
 }
diff --git a/tests/functional/reduce_scatter_test.cpp b/tests/functional/reduce_scatter_test.cpp
index 4c0836060..14950c444 100644
--- a/tests/functional/reduce_scatter_test.cpp
+++ b/tests/functional/reduce_scatter_test.cpp
@@ -15,13 +15,13 @@
 */
 #define ALGO_SELECTION_ENV "CCL_REDUCE_SCATTER"
 
-#include "base_impl.hpp"
+#include "test_impl.hpp"
 
 template <typename T>
 class reduce_scatter_test : public base_test<T> {
 public:
     int check(test_operation<T>& op) {
-        size_t my_rank = global_data::instance().comms[0].rank();
+        int my_rank = transport_data::instance().get_rank();
         for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
             for (size_t elem_idx = 0; elem_idx < op.elem_count;
                  elem_idx += op.get_check_step(elem_idx)) {
@@ -52,7 +52,8 @@ class reduce_scatter_test : public base_test<T> {
                                     op.elem_count,
                                     op.datatype,
                                     op.reduction,
-                                    global_data::instance().comms[0],
+                                    transport_data::instance().get_comm(),
+                                    transport_data::instance().get_stream(),
                                     attr));
         }
     }
diff --git a/tests/functional/reduce_test.cpp b/tests/functional/reduce_test.cpp
index 91d8534d7..0fdf5ea0c 100644
--- a/tests/functional/reduce_test.cpp
+++ b/tests/functional/reduce_test.cpp
@@ -15,7 +15,7 @@
 */
 #define ALGO_SELECTION_ENV "CCL_REDUCE"
 
-#include "base_impl.hpp"
+#include "test_impl.hpp"
 
 template <typename T>
 class reduce_test : public base_test<T> {
@@ -53,7 +53,8 @@ class reduce_test : public base_test<T> {
                                             op.datatype,
                                             op.reduction,
                                             ROOT_RANK,
-                                            global_data::instance().comms[0],
+                                            transport_data::instance().get_comm(),
+                                            transport_data::instance().get_stream(),
                                             attr));
         }
     }
diff --git a/tests/functional/regression/CMakeLists.txt b/tests/functional/regression/CMakeLists.txt
new file mode 100644
index 000000000..c17eac8a3
--- /dev/null
+++ b/tests/functional/regression/CMakeLists.txt
@@ -0,0 +1,30 @@
+#
+# Copyright 2016-2020 Intel Corporation
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+set(sources alltoallv_empty_count.cpp)
+
+set(CCL_INSTALL_TESTS "${CMAKE_CURRENT_BINARY_DIR}")
+
+message(WARNING $"TEST DIR: ${CCL_INSTALL_TESTS}")
+
+foreach(src ${sources})
+    get_filename_component(executable ${src} NAME_WE)
+    add_executable(${executable} ${src})
+    target_link_libraries(${executable} PRIVATE ccl gtest_main gtest mpi)
+
+    install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_TESTS} OPTIONAL)
+    add_test (NAME ${executable} CONFIGURATIONS regression COMMAND mpiexec.hydra -l -n 3 -ppn 1 ${CCL_INSTALL_TESTS}/${executable} --gtest_output=xml:${CCL_INSTALL_TESTS}/${executable}_default_report.junit.xml)
+
+endforeach(src ${sources})
diff --git a/tests/functional/regression/alltoallv_empty_count.cpp b/tests/functional/regression/alltoallv_empty_count.cpp
new file mode 100644
index 000000000..23e8409d8
--- /dev/null
+++ b/tests/functional/regression/alltoallv_empty_count.cpp
@@ -0,0 +1,163 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include <numeric>
+#include <vector>
+#include <iostream>
+
+#include "oneapi/ccl.hpp"
+#include "gtest/gtest.h"
+#include "mpi.h"
+
+class alltoallv_test : public ::testing::Test {
+protected:
+    void SetUp() override {
+        ccl::init();
+
+        MPI_Init(NULL, NULL);
+        MPI_Comm_size(MPI_COMM_WORLD, &size);
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    }
+
+    void TearDown() override {
+        // Don't finalize if the test case has failed; doing so could
+        // lead to a deadlock due to inconsistent state.
+        if (HasFatalFailure()) {
+            return;
+        }
+
+        int is_finalized = 0;
+        MPI_Finalized(&is_finalized);
+
+        if (!is_finalized)
+            MPI_Finalize();
+    }
+
+    int size;
+    int rank;
+};
+
+// There are 3 ranks: rank 0 can both send and receive data to/from the others (its total send and
+// receive counts are > 0), rank 1 only sends data but doesn't receive it (its recv_count == 0 for all
+// ranks), and rank 2 only receives data but doesn't send it.
+// Additionally, rank 1 sets its recv_buf to an invalid pointer (it's not used anyway due to the 0 recv
+// count); the same is done on rank 2 for the send buf.
+// In the test case we simply run alltoallv with these parameters and then check that both rank 0 and
+// rank 2 received the correct data.
+// TODO: once we add more tests, move some common parts out of this test
+TEST_F(alltoallv_test, alltoallv_empty_recv_count) {
+    const size_t count = 1000;
+
+    int i = 0;
+
+    ASSERT_EQ(size, 3) << "Test expects 3 ranks";
+
+    sycl::queue q;
+    ASSERT_TRUE(q.get_device().is_gpu())
+        << "Test expects gpu device, please use SYCL_DEVICE_FILTER accordingly";
+
+    /* create communicator */
+    auto dev = ccl::create_device(q.get_device());
+    auto ctx = ccl::create_context(q.get_context());
+    auto comm = ccl::create_communicator(size, rank, dev, ctx, /*kvs*/ {});
+
+    /* create stream */
+    auto stream = ccl::create_stream(q);
+
+    // TODO: find a proper way to choose between shared and device pointers (e.g. an env variable)
+    /* create buffers */
+    auto send_buf = sycl::malloc_device<int>(count * size, q);
+    auto recv_buf = sycl::malloc_device<int>(count * size, q);
+
+    // out of the 3 ranks, rank 1 doesn't receive anything and rank 2 doesn't send anything
+    int empty_recv_rank = 1;
+    int empty_send_rank = 2;
+
+    std::vector<size_t> send_counts(size, count);
+    std::vector<size_t> recv_counts(size, count);
+
+    // update the counts so the corresponding rank doesn't receive anything and the others don't send anything to it
+    send_counts[empty_recv_rank] = 0;
+    if (rank == empty_recv_rank) {
+        std::fill(recv_counts.begin(), recv_counts.end(), 0);
+    }
+
+    recv_counts[empty_send_rank] = 0;
+    if (rank == empty_send_rank) {
+        std::fill(send_counts.begin(), send_counts.end(), 0);
+    }
+    q.memset(recv_buf, 0, count * size * sizeof(int)).wait();
+
+    std::vector<sycl::event> events;
+    size_t offset = 0;
+    for (size_t i = 0; i < send_counts.size(); ++i) {
+        auto e = q.submit([&](auto& h) {
+            h.parallel_for(send_counts[i], [=](auto id) {
+                send_buf[id + offset] = i + 1;
+            });
+        });
+        offset += send_counts[i];
+        events.push_back(e);
+    }
+
+    // do not wait for kernel completion; pass the kernel events as dependencies for the operation
+    std::vector<ccl::event> deps;
+    for (auto e : events) {
+        deps.push_back(ccl::create_event(e));
+    }
+
+    // invoke alltoallv
+    auto attr = ccl::create_operation_attr<ccl::alltoallv_attr>();
+    int* invalid_ptr = (int*)0x00ffff;
+    // pass an invalid pointer to make sure it's handled correctly and never dereferenced due to the 0 count
+    if (rank == empty_recv_rank) {
+        recv_buf = invalid_ptr;
+    }
+    else if (rank == empty_send_rank) {
+        send_buf = invalid_ptr;
+    }
+
+    ccl::alltoallv(send_buf, send_counts, recv_buf, recv_counts, comm, stream, attr, deps).wait();
+
+    // if our rank is the one that didn't receive anything, then just exit and don't do any checking
+    if (rank == empty_recv_rank)
+        return;
+
+    size_t total_recv = std::accumulate(recv_counts.begin(), recv_counts.end(), size_t(0));
+
+    sycl::buffer<int> check_buf(count * size);
+    q.submit([&](auto& h) {
+         sycl::accessor check_buf_acc(check_buf, h, sycl::write_only);
+         h.parallel_for(total_recv, [=, rnk = rank](auto id) {
+             // we expect size - 1 chunks to be filled with the senders' data;
+             // the chunk from the rank that sends nothing has zero count, so
+             // total_recv already excludes it
+             if (recv_buf[id] != rnk + 1) {
+                 check_buf_acc[id] = -1;
+             }
+             else {
+                 check_buf_acc[id] = 0;
+             }
+         });
+     }).wait_and_throw();
+
+    /* print out the result of the test on the host side */
+    {
+        sycl::host_accessor check_buf_acc(check_buf, sycl::read_only);
+        for (size_t i = 0; i < total_recv; i++) {
+            ASSERT_NE(check_buf_acc[i], -1) << "Check failed for receive buffer";
+        }
+    }
+}
diff --git a/tests/functional/base.hpp b/tests/functional/test.hpp
similarity index 78%
rename from tests/functional/base.hpp
rename to tests/functional/test.hpp
index b42f61fe1..c8c9e01c1 100644
--- a/tests/functional/base.hpp
+++ b/tests/functional/test.hpp
@@ -23,24 +23,7 @@
 
 #include "oneapi/ccl.hpp"
 #include "conf.hpp"
-
-class global_data {
-public:
-    std::vector<ccl::communicator> comms;
-    ccl::shared_ptr_class<ccl::kvs> kvs;
-
-    global_data(global_data& gd) = delete;
-    void operator=(const global_data&) = delete;
-    static global_data& instance() {
-        static global_data gd;
-        return gd;
-    }
-
-protected:
-    global_data(){};
-    ~global_data(){};
-};
-
+#include "transport.hpp"
 #include "utils.hpp"
 
 bool is_lp_datatype(ccl_data_type dtype);
@@ -62,9 +45,10 @@ struct test_operation {
     std::vector<std::vector<T>> send_bufs;
     std::vector<std::vector<T>> recv_bufs;
 
-    // buffers for 16-bits low precision datatype
-    std::vector<std::vector<short>> send_bufs_lp;
-    std::vector<std::vector<short>> recv_bufs_lp;
+#ifdef CCL_ENABLE_SYCL
+    std::vector<void*> device_send_bufs;
+    std::vector<void*> device_recv_bufs;
+#endif /* CCL_ENABLE_SYCL */
 
     std::vector<ccl::event> events;
     ccl::string_class match_id;
@@ -75,8 +59,8 @@ struct test_operation {
               buffer_count(get_buffer_count(param)),
               datatype(get_ccl_datatype(param)),
               reduction(get_ccl_reduction(param)) {
-        comm_size = global_data::instance().comms[0].size();
-        comm_rank = global_data::instance().comms[0].rank();
+        comm_size = transport_data::instance().get_comm().size();
+        comm_rank = transport_data::instance().get_comm().rank();
         buf_indexes.resize(buffer_count);
     }
 
@@ -84,9 +68,7 @@ struct test_operation {
     void prepare_attr(coll_attr_type& coll_attr, size_t idx);
 
     std::string create_match_id(size_t buf_idx);
-    void change_buffer_pointers();
     size_t generate_priority_value(size_t buf_idx);
-
     void define_start_order(std::default_random_engine& rand_engine);
 
     bool complete_events();
@@ -99,17 +81,19 @@ struct test_operation {
     void print(std::ostream& output);
 
     void* get_send_buf(size_t buf_idx) {
-        if (is_lp_datatype(param.datatype))
-            return static_cast<void*>(send_bufs_lp[buf_idx].data());
-        else
-            return static_cast<void*>(send_bufs[buf_idx].data());
+#ifdef CCL_ENABLE_SYCL
+        return device_send_bufs[buf_idx];
+#else /* CCL_ENABLE_SYCL */
+        return send_bufs[buf_idx].data();
+#endif /* CCL_ENABLE_SYCL */
     }
 
     void* get_recv_buf(size_t buf_idx) {
-        if (is_lp_datatype(param.datatype))
-            return static_cast<void*>(recv_bufs_lp[buf_idx].data());
-        else
-            return static_cast<void*>(recv_bufs[buf_idx].data());
+#ifdef CCL_ENABLE_SYCL
+        return device_recv_bufs[buf_idx];
+#else /* CCL_ENABLE_SYCL */
+        return recv_bufs[buf_idx].data();
+#endif /* CCL_ENABLE_SYCL */
     }
 
     size_t get_check_step(size_t elem_idx) {
@@ -136,10 +120,7 @@ struct test_operation {
 template <typename T>
 class base_test {
 public:
-    int global_comm_rank;
-    int global_comm_size;
     char err_message[ERR_MESSAGE_MAX_LEN]{};
-
     std::random_device rand_device;
     std::default_random_engine rand_engine;
 
@@ -151,12 +132,16 @@ class base_test {
 
     void alloc_buffers_base(test_operation<T>& op);
     virtual void alloc_buffers(test_operation<T>& op);
+    void free_buffers(test_operation<T>& op);
 
-    void fill_send_buffers_base(test_operation<T>& op);
     virtual void fill_send_buffers(test_operation<T>& op);
-
-    void fill_recv_buffers_base(test_operation<T>& op);
     virtual void fill_recv_buffers(test_operation<T>& op);
+    void change_buffers(test_operation<T>& op);
+
+#ifdef CCL_ENABLE_SYCL
+    void copy_to_device_send_buffers(test_operation<T>& op);
+    void copy_from_device_recv_buffers(test_operation<T>& op);
+#endif /* CCL_ENABLE_SYCL */
 
     virtual T calculate_reduce_value(test_operation<T>& op, size_t buf_idx, size_t elem_idx);
 
diff --git a/tests/functional/base_impl.hpp b/tests/functional/test_impl.hpp
similarity index 65%
rename from tests/functional/base_impl.hpp
rename to tests/functional/test_impl.hpp
index f8a6bb672..b37a85d0e 100644
--- a/tests/functional/base_impl.hpp
+++ b/tests/functional/test_impl.hpp
@@ -17,12 +17,33 @@
 
 #include <math.h>
 
-#include "base.hpp"
 #include "lp.hpp"
+#include "test.hpp"
+#include "transport.hpp"
 
 #define FIRST_FP_COEFF  (0.1)
 #define SECOND_FP_COEFF (0.01)
 
+#ifdef CCL_ENABLE_SYCL
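+// thin helpers over the shared transport: device USM allocation/deallocation
+// and host<->device staging through the native SYCL queue of the transport stream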
+void* alloc_buffer(size_t bytes) {
+    auto& allocator = transport_data::instance().get_allocator();
+    return allocator.allocate(bytes, sycl::usm::alloc::device);
+}
+
+void free_buffer(void* ptr) {
+    auto& allocator = transport_data::instance().get_allocator();
+    allocator.deallocate(static_cast<char*>(ptr));
+}
+
+void copy_buffer(void* dst, void* src, size_t bytes) {
+    transport_data::instance().get_stream().get_native().memcpy(dst, src, bytes).wait();
+}
+
+void fill_buffer(void* ptr, int value, size_t bytes) {
+    transport_data::instance().get_stream().get_native().memset(ptr, value, bytes).wait();
+}
+#endif /* CCL_ENABLE_SYCL */
+
 template <typename T>
 template <class coll_attr_type>
 void test_operation<T>::prepare_attr(coll_attr_type& attr, size_t idx) {
@@ -118,24 +139,6 @@ bool test_operation<T>::complete_event(ccl::event& e) {
     }
 }
 
-template <typename T>
-void test_operation<T>::change_buffer_pointers() {
-    char* dynamic_pointer_env = getenv("CCL_TEST_DYNAMIC_POINTER");
-    if (dynamic_pointer_env && atoi(dynamic_pointer_env) == 1) {
-        /*
-            create deep copy of vector with buffers and swap it with original one
-            as result buffers in updated vector will have original content
-            but in new memory locations
-        */
-        if (comm_rank % 2) {
-            std::vector<std::vector<T>>(send_bufs.begin(), send_bufs.end()).swap(send_bufs);
-        }
-        else {
-            std::vector<std::vector<T>>(recv_bufs.begin(), recv_bufs.end()).swap(recv_bufs);
-        }
-    }
-}
-
 template <typename T>
 size_t test_operation<T>::generate_priority_value(size_t buf_idx) {
     return buf_idx++;
@@ -152,8 +155,6 @@ void test_operation<T>::print(std::ostream& output) {
 
 template <typename T>
 base_test<T>::base_test() {
-    global_comm_rank = global_data::instance().comms[0].rank();
-    global_comm_size = global_data::instance().comms[0].size();
     memset(err_message, '\0', ERR_MESSAGE_MAX_LEN);
     rand_engine = std::default_random_engine{ rand_device() };
 }
@@ -204,39 +205,41 @@ int base_test<T>::check_error(test_operation<T>& op, T expected, size_t buf_idx,
     return TEST_SUCCESS;
 }
 
+template <typename T>
+void base_test<T>::free_buffers(test_operation<T>& op) {
+    op.send_bufs.clear();
+    op.recv_bufs.clear();
+
+#ifdef CCL_ENABLE_SYCL
+    for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
+        free_buffer(op.device_send_bufs[buf_idx]);
+        free_buffer(op.device_recv_bufs[buf_idx]);
+    }
+#endif /* CCL_ENABLE_SYCL */
+}
+
 template <typename T>
 void base_test<T>::alloc_buffers_base(test_operation<T>& op) {
     op.send_bufs.resize(op.buffer_count);
     op.recv_bufs.resize(op.buffer_count);
-    if (is_lp_datatype(op.param.datatype)) {
-        op.send_bufs_lp.resize(op.buffer_count);
-        op.recv_bufs_lp.resize(op.buffer_count);
-    }
-
     for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
         op.send_bufs[buf_idx].resize(op.elem_count * op.comm_size);
         op.recv_bufs[buf_idx].resize(op.elem_count * op.comm_size);
+    }
 
-        if (is_lp_datatype(op.param.datatype)) {
-            op.send_bufs_lp[buf_idx].resize(op.elem_count * op.comm_size);
-            op.recv_bufs_lp[buf_idx].resize(op.elem_count * op.comm_size);
-        }
+#ifdef CCL_ENABLE_SYCL
+    op.device_send_bufs.resize(op.buffer_count);
+    op.device_recv_bufs.resize(op.buffer_count);
+    for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
+        op.device_send_bufs[buf_idx] = alloc_buffer(op.elem_count * sizeof(T) * op.comm_size);
+        op.device_recv_bufs[buf_idx] = alloc_buffer(op.elem_count * sizeof(T) * op.comm_size);
     }
+#endif /* CCL_ENABLE_SYCL */
 }
 
 template <typename T>
 void base_test<T>::alloc_buffers(test_operation<T>& op) {}
 
-template <typename T>
-void base_test<T>::fill_send_buffers_base(test_operation<T>& op) {
-    if (!is_lp_datatype(op.param.datatype))
-        return;
-
-    for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
-        std::fill(op.send_bufs_lp[buf_idx].begin(), op.send_bufs_lp[buf_idx].end(), (T)SOME_VALUE);
-    }
-}
-
 template <typename T>
 void base_test<T>::fill_send_buffers(test_operation<T>& op) {
     for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
@@ -251,7 +254,7 @@ void base_test<T>::fill_send_buffers(test_operation<T>& op) {
 }
 
 template <typename T>
-void base_test<T>::fill_recv_buffers_base(test_operation<T>& op) {
+void base_test<T>::fill_recv_buffers(test_operation<T>& op) {
     for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
         if (op.param.place_type == PLACE_IN) {
             std::copy(op.send_bufs[buf_idx].begin(),
@@ -261,15 +264,9 @@ void base_test<T>::fill_recv_buffers_base(test_operation<T>& op) {
         else {
             std::fill(op.recv_bufs[buf_idx].begin(), op.recv_bufs[buf_idx].end(), (T)SOME_VALUE);
         }
-        if (is_lp_datatype(op.param.datatype)) {
-            std::fill(op.recv_bufs_lp[buf_idx].begin(), op.recv_bufs_lp[buf_idx].end(), SOME_VALUE);
-        }
     }
 }
 
-template <typename T>
-void base_test<T>::fill_recv_buffers(test_operation<T>& op) {}
-
 template <typename T>
 T base_test<T>::calculate_reduce_value(test_operation<T>& op, size_t buf_idx, size_t elem_idx) {
     T expected = 0;
@@ -329,50 +326,142 @@ float base_test<float>::calculate_reduce_value(test_operation<float>& op,
     return expected;
 }
 
+template <typename T>
+void base_test<T>::change_buffers(test_operation<T>& op) {
+    char* dynamic_pointer_env = getenv("CCL_TEST_DYNAMIC_POINTER");
+    if (dynamic_pointer_env && atoi(dynamic_pointer_env) == 1) {
+        void* send_buf = op.send_bufs[0].data();
+        void* recv_buf = op.recv_bufs[0].data();
+        /*
+            create a deep copy of the buffer vectors and swap it with the originals;
+            as a result, the buffers in the updated vectors keep the original content
+            but live in new memory locations
+        */
+        std::vector<std::vector<T>>(op.send_bufs.begin(), op.send_bufs.end()).swap(op.send_bufs);
+        std::vector<std::vector<T>>(op.recv_bufs.begin(), op.recv_bufs.end()).swap(op.recv_bufs);
+        void* new_send_buf = op.send_bufs[0].data();
+        void* new_recv_buf = op.recv_bufs[0].data();
+        ASSERT(send_buf != new_send_buf, "send buffers should differ");
+        ASSERT(recv_buf != new_recv_buf, "recv buffers should differ");
+
+#ifdef CCL_ENABLE_SYCL
+        /* for device buffers, do a regular reallocation instead */
+        void* device_send_buf = op.device_send_bufs[0];
+        void* device_recv_buf = op.device_recv_bufs[0];
+        std::vector<void*> new_device_send_bufs(op.buffer_count);
+        std::vector<void*> new_device_recv_bufs(op.buffer_count);
+        for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
+            new_device_send_bufs[buf_idx] = alloc_buffer(op.elem_count * sizeof(T) * op.comm_size);
+            new_device_recv_bufs[buf_idx] = alloc_buffer(op.elem_count * sizeof(T) * op.comm_size);
+        }
+        for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
+            free_buffer(op.device_send_bufs[buf_idx]);
+            free_buffer(op.device_recv_bufs[buf_idx]);
+            op.device_send_bufs[buf_idx] = new_device_send_bufs[buf_idx];
+            op.device_recv_bufs[buf_idx] = new_device_recv_bufs[buf_idx];
+        }
+        void* new_device_send_buf = op.device_send_bufs[0];
+        void* new_device_recv_buf = op.device_recv_bufs[0];
+        ASSERT(device_send_buf != new_device_send_buf, "device send buffers should differ");
+        ASSERT(device_recv_buf != new_device_recv_buf, "device recv buffers should differ");
+#endif /* CCL_ENABLE_SYCL */
+    }
+}
+
+#ifdef CCL_ENABLE_SYCL
+
+template <typename T>
+void base_test<T>::copy_to_device_send_buffers(test_operation<T>& op) {
+    for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
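+        // bcast operates on a single (recv) buffer, hence the special case below;
+        // for other collectives, in-place runs (PLACE_IN) stage the host recv buffer
+        // to the device while out-of-place runs stage the host send buffer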
+#ifdef TEST_CCL_BCAST
+        void* host_buf = op.recv_bufs[buf_idx].data();
+        void* device_buf = op.device_recv_bufs[buf_idx];
+#else /* TEST_CCL_BCAST */
+        void* host_buf = (op.param.place_type == PLACE_IN) ? op.recv_bufs[buf_idx].data()
+                                                           : op.send_bufs[buf_idx].data();
+        void* device_buf = (op.param.place_type == PLACE_IN) ? op.device_recv_bufs[buf_idx]
+                                                             : op.device_send_bufs[buf_idx];
+#endif /* TEST_CCL_BCAST */
+        size_t bytes = op.send_bufs[buf_idx].size() * sizeof(T);
+        copy_buffer(device_buf, host_buf, bytes);
+    }
+}
+
+template <typename T>
+void base_test<T>::copy_from_device_recv_buffers(test_operation<T>& op) {
+    for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
+        copy_buffer(op.recv_bufs[buf_idx].data(),
+                    op.device_recv_bufs[buf_idx],
+                    op.recv_bufs[buf_idx].size() * sizeof(T));
+    }
+}
+#endif /* CCL_ENABLE_SYCL */
+
 template <typename T>
 int base_test<T>::run(test_operation<T>& op) {
-    size_t result = 0;
+    size_t iter = 0, result = 0;
 
     char* algo = getenv(ALGO_SELECTION_ENV);
     if (algo)
         std::cout << ALGO_SELECTION_ENV << " = " << algo << "\n";
     std::cout << op.param << "\n";
 
-    for (size_t iter = 0; iter < ITER_COUNT; iter++) {
-        try {
-            alloc_buffers_base(op);
-            alloc_buffers(op);
-
-            fill_send_buffers_base(op);
-            fill_send_buffers(op);
-
-            fill_recv_buffers_base(op);
-            fill_recv_buffers(op);
-
+    /*
+        Buffer management logic for single operation
+        SYCL-specific logic is marked with (*)
+        LP-specific logic is marked with (**)
+
+        1. alloc host send and recv buffers
+        2. alloc device send and recv buffers (*)
+        3. fill host send and recv buffers
+        4. do in-place FP32->LP cast for host send buffer (**)
+        5. copy from host send buffer into device send buffer (*)
+        6. invoke comm operation on host or device (*) send and recv buffers
+        7. copy device recv buffer into host recv buffer (*)
+        8. do in-place LP->FP32 cast for host recv buffer (**)
+        9. check result correctness on host recv buffer
+        10. free host send and recv buffers
+        11. free device send and recv buffers (*)
+    */
+
+    try {
+        alloc_buffers_base(op);
+        alloc_buffers(op);
+        for (iter = 0; iter < ITER_COUNT; iter++) {
             if (iter > 0) {
-                op.change_buffer_pointers();
+                change_buffers(op);
             }
 
-            op.define_start_order(rand_engine);
+            fill_send_buffers(op);
+            fill_recv_buffers(op);
 
             if (is_lp_datatype(op.param.datatype)) {
                 make_lp_prologue(op, op.comm_size * op.elem_count);
             }
 
-            run_derived(op);
+#ifdef CCL_ENABLE_SYCL
+            copy_to_device_send_buffers(op);
+#endif /* CCL_ENABLE_SYCL */
 
+            op.define_start_order(rand_engine);
+            run_derived(op);
             op.complete_events();
 
+#ifdef CCL_ENABLE_SYCL
+            copy_from_device_recv_buffers(op);
+#endif /* CCL_ENABLE_SYCL */
+
             if (is_lp_datatype(op.param.datatype)) {
                 make_lp_epilogue(op, op.comm_size * op.elem_count);
             }
 
             result += check(op);
         }
-        catch (const std::exception& ex) {
-            result += TEST_FAILURE;
-            printf("WARNING! %s iter number: %zu", ex.what(), iter);
-        }
+        free_buffers(op);
+    }
+    catch (const std::exception& ex) {
+        result += TEST_FAILURE;
+        printf("WARNING! %s iter number: %zu", ex.what(), iter);
     }
 
     return result;
diff --git a/tests/functional/transport.cpp b/tests/functional/transport.cpp
new file mode 100644
index 000000000..b6b29a9bd
--- /dev/null
+++ b/tests/functional/transport.cpp
@@ -0,0 +1,144 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include <mpi.h>
+
+#ifdef CCL_ENABLE_SYCL
+#include <CL/sycl.hpp>
+#endif /* CCL_ENABLE_SYCL */
+
+#include "transport.hpp"
+
+transport_data::transport_data() {
+    init_by_mpi();
+
+    service_comms.push_back(ccl::create_communicator(size, rank, kvs));
+}
+
+transport_data::~transport_data() {
+    deinit_by_mpi();
+}
+
+transport_data& transport_data::instance() {
+    static transport_data inst;
+    return inst;
+}
+
+void transport_data::init_by_mpi() {
+    ccl::init();
+
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
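+    // bootstrap the oneCCL key-value store over MPI: rank 0 creates the main
+    // KVS and broadcasts its address, all other ranks attach to it by that address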
+    ccl::shared_ptr_class<ccl::kvs> kvs_candidate;
+    ccl::kvs::address_type main_addr;
+    if (rank == 0) {
+        kvs_candidate = ccl::create_main_kvs();
+        main_addr = kvs_candidate->get_address();
+        MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+    }
+    else {
+        MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+        kvs_candidate = ccl::create_kvs(main_addr);
+    }
+    kvs = kvs_candidate;
+    init_comms();
+}
+
+void transport_data::deinit_by_mpi() {
+    int is_finalized = 0;
+    MPI_Finalized(&is_finalized);
+    if (!is_finalized) {
+        MPI_Finalize();
+    }
+}
+
+void transport_data::init_comms() {
+    std::vector<int> local_ranks;
+    for (int idx = 0; idx < ranks_per_proc; idx++) {
+        local_ranks.push_back(rank * ranks_per_proc + idx);
+    }
+
+    ccl::context context = ccl::create_context();
+    std::vector<ccl::device> devices;
+    std::map<int, ccl::device> r2d_map;
+
+#ifdef CCL_ENABLE_SYCL
+    auto sycl_queues = create_sycl_queues("gpu", local_ranks);
+    ASSERT(!sycl_queues.empty(), "queues should contain at least one queue");
+    ASSERT(ranks_per_proc == (int)sycl_queues.size(), "ranks and queues sizes should match");
+
+    auto sycl_context = sycl_queues[0].get_context();
+    context = ccl::create_context(sycl_context);
+
+    for (int idx = 0; idx < ranks_per_proc; idx++) {
+        streams.push_back(ccl::create_stream(sycl_queues[idx]));
+        devices.push_back(ccl::create_device(sycl_queues[idx].get_device()));
+        allocators.push_back(buf_allocator<char>(streams[idx].get_native()));
+    }
+#else /* CCL_ENABLE_SYCL */
+    for (int idx = 0; idx < ranks_per_proc; idx++) {
+        streams.push_back(ccl::create_stream());
+        devices.push_back(ccl::create_device());
+    }
+#endif /* CCL_ENABLE_SYCL */
+
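+    // map each local rank to its device so that create_communicators can build
+    // one communicator object per rank within this process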
+    for (int idx = 0; idx < ranks_per_proc; idx++) {
+        r2d_map.emplace(local_ranks[idx], devices[idx]);
+    }
+
+    comms = ccl::create_communicators(size * ranks_per_proc, r2d_map, context, kvs);
+
+    ASSERT((int)comms.size() == ranks_per_proc,
+           "unexpected comms size %zu, expected %d",
+           comms.size(),
+           ranks_per_proc);
+}
+
+void transport_data::reset_comms() {
+    comms.clear();
+    service_comms.clear();
+}
+
+int transport_data::get_rank() const noexcept {
+    return rank;
+}
+
+int transport_data::get_size() const noexcept {
+    return size;
+}
+
+ccl::shared_ptr_class<ccl::kvs> transport_data::get_kvs() {
+    return kvs;
+}
+
+ccl::communicator& transport_data::get_comm() {
+    return comms[0];
+}
+
+ccl::communicator& transport_data::get_service_comm() {
+    return service_comms[0];
+}
+
+ccl::stream& transport_data::get_stream() {
+    return streams[0];
+}
+
+#ifdef CCL_ENABLE_SYCL
+buf_allocator<char>& transport_data::get_allocator() {
+    return allocators[0];
+}
+#endif /* CCL_ENABLE_SYCL */
diff --git a/tests/functional/transport.hpp b/tests/functional/transport.hpp
new file mode 100644
index 000000000..f7287064e
--- /dev/null
+++ b/tests/functional/transport.hpp
@@ -0,0 +1,66 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include <map>
+#include <vector>
+
+#include "base.hpp"
+#include "oneapi/ccl.hpp"
+#ifdef CCL_ENABLE_SYCL
+#include "sycl_base.hpp"
+#endif /* CCL_ENABLE_SYCL */
+
+class transport_data {
+public:
+    static transport_data& instance();
+
+    void init_comms();
+    void reset_comms();
+
+    int get_rank() const noexcept;
+    int get_size() const noexcept;
+
+    ccl::shared_ptr_class<ccl::kvs> get_kvs();
+    ccl::communicator& get_comm();
+    ccl::communicator& get_service_comm();
+    ccl::stream& get_stream();
+
+#ifdef CCL_ENABLE_SYCL
+    buf_allocator<char>& get_allocator();
+#endif /* CCL_ENABLE_SYCL */
+
+private:
+    transport_data();
+    ~transport_data();
+
+    void init_by_mpi();
+    void deinit_by_mpi();
+
+    int rank;
+    int size;
+
+    ccl::shared_ptr_class<ccl::kvs> kvs;
+    std::vector<ccl::communicator> comms;
+    std::vector<ccl::communicator> service_comms;
+    std::vector<ccl::stream> streams;
+
+#ifdef CCL_ENABLE_SYCL
+    std::vector<buf_allocator<char>> allocators;
+#endif /* CCL_ENABLE_SYCL */
+
+    const int ranks_per_proc = 1;
+};
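+
+// typical usage from a test (a sketch; only the accessors declared above are used):
+//
+//   auto& transport = transport_data::instance();
+//   auto& comm = transport.get_comm();
+//   ccl::allreduce(send_buf, recv_buf, count, ccl::reduction::sum, comm).wait();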
diff --git a/tests/functional/utils.hpp b/tests/functional/utils.hpp
index 0298123ca..494381865 100644
--- a/tests/functional/utils.hpp
+++ b/tests/functional/utils.hpp
@@ -22,7 +22,6 @@
 #include <sstream>
 #include <stdlib.h>
 #include <string>
-#include <sys/syscall.h>
 
 #include "gtest/gtest.h"
 
@@ -38,48 +37,24 @@
 #define ITER_COUNT          2
 #define ERR_MESSAGE_MAX_LEN 180
 
-#define TIMEOUT 30
-
-#define GETTID()    syscall(SYS_gettid)
+#define TIMEOUT     30
 #define UNUSED_ATTR __attribute__((unused))
 
 #define TEST_SUCCESS 0
 #define TEST_FAILURE 1
 
-#if 0
-
+#ifndef PRINT
 #define PRINT(fmt, ...) \
     do { \
         fflush(stdout); \
         printf("\n(%ld): %s: " fmt "\n", GETTID(), __FUNCTION__, ##__VA_ARGS__); \
         fflush(stdout); \
     } while (0)
-
-#define PRINT_BUFFER(buf, bufSize, prefix) \
-    do { \
-        std::string strToPrint; \
-        for (size_t idx = 0; idx < bufSize; idx++) { \
-            strToPrint += std::to_string(buf[idx]); \
-            if (idx != bufSize - 1) \
-                strToPrint += ", "; \
-        } \
-        strToPrint = std::string(prefix) + strToPrint; \
-        PRINT("%s", strToPrint.c_str()); \
-    } while (0)
-
-#else /* ENABLE_DEBUG */
-
-#define PRINT(fmt, ...) \
-    {}
-#define PRINT_BUFFER(buf, bufSize, prefix) \
-    {}
-
-#endif /* ENABLE_DEBUG */
+#endif /* PRINT */
 
 #define OUTPUT_NAME_ARG "--gtest_output="
-#define PATCH_OUTPUT_NAME_ARG(argc, argv) \
+#define PATCH_OUTPUT_NAME_ARG(argc, argv, comm) \
     do { \
-        auto& comm = gd.comms[0]; \
         if (comm.size() > 1) { \
             for (int idx = 1; idx < argc; idx++) { \
                 if (strstr(argv[idx], OUTPUT_NAME_ARG)) { \
@@ -115,8 +90,8 @@
         int result = className.run(op); \
         int result_final = 0; \
         static int glob_idx = 0; \
-        auto& comm = global_data::instance().comms[0]; \
-        ccl::allreduce(&result, &result_final, 1, ccl::reduction::sum, comm).wait(); \
+        auto& service_comm = transport_data::instance().get_service_comm(); \
+        ccl::allreduce(&result, &result_final, 1, ccl::reduction::sum, service_comm).wait(); \
         if (result_final > 0) { \
             print_err_message(className.get_err_message(), output); \
             if (op.comm_rank == 0) { \
@@ -139,50 +114,15 @@
         return TEST_SUCCESS; \
     }
 
-#define ASSERT(cond, fmt, ...) \
-    do { \
-        if (!(cond)) { \
-            fprintf(stderr, \
-                    "(%ld): %s:%s:%d: ASSERT '%s' FAILED: " fmt "\n", \
-                    GETTID(), \
-                    __FILE__, \
-                    __FUNCTION__, \
-                    __LINE__, \
-                    #cond, \
-                    ##__VA_ARGS__); \
-            fflush(stderr); \
-            exit(0); \
-        } \
-    } while (0)
-
 #define MAIN_FUNCTION() \
     int main(int argc, char** argv, char* envs[]) { \
         init_test_params(); \
-        ccl::init(); \
-        int mpi_inited = 0; \
-        MPI_Initialized(&mpi_inited); \
-        if (!mpi_inited) { \
-            MPI_Init(NULL, NULL); \
-        } \
-        atexit(mpi_finalize); \
-        int size, rank; \
-        MPI_Comm_size(MPI_COMM_WORLD, &size); \
-        MPI_Comm_rank(MPI_COMM_WORLD, &rank); \
-        ccl::kvs::address_type main_addr; \
-        auto& gd = global_data::instance(); \
-        if (rank == 0) { \
-            gd.kvs = ccl::create_main_kvs(); \
-            main_addr = gd.kvs->get_address(); \
-            MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD); \
-        } \
-        else { \
-            MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD); \
-            gd.kvs = ccl::create_kvs(main_addr); \
-        } \
-        gd.comms.emplace_back(ccl::create_communicator(size, rank, gd.kvs)); \
-        PATCH_OUTPUT_NAME_ARG(argc, argv); \
+        auto& transport = transport_data::instance(); \
+        auto& service_comm = transport.get_service_comm(); \
+        PATCH_OUTPUT_NAME_ARG(argc, argv, service_comm); \
         testing::InitGoogleTest(&argc, argv); \
         int res = RUN_ALL_TESTS(); \
+        transport.reset_comms(); \
         return res; \
     }
 
diff --git a/tests/functional/googletest-release-1.8.1/CMakeLists.txt b/tests/googletest-release-1.8.1/CMakeLists.txt
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/CMakeLists.txt
rename to tests/googletest-release-1.8.1/CMakeLists.txt
diff --git a/tests/functional/googletest-release-1.8.1/CONTRIBUTING.md b/tests/googletest-release-1.8.1/CONTRIBUTING.md
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/CONTRIBUTING.md
rename to tests/googletest-release-1.8.1/CONTRIBUTING.md
diff --git a/tests/functional/googletest-release-1.8.1/LICENSE b/tests/googletest-release-1.8.1/LICENSE
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/LICENSE
rename to tests/googletest-release-1.8.1/LICENSE
diff --git a/tests/functional/googletest-release-1.8.1/Makefile.am b/tests/googletest-release-1.8.1/Makefile.am
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/Makefile.am
rename to tests/googletest-release-1.8.1/Makefile.am
diff --git a/tests/functional/googletest-release-1.8.1/README.md b/tests/googletest-release-1.8.1/README.md
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/README.md
rename to tests/googletest-release-1.8.1/README.md
diff --git a/tests/functional/googletest-release-1.8.1/googletest/CMakeLists.txt b/tests/googletest-release-1.8.1/googletest/CMakeLists.txt
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/CMakeLists.txt
rename to tests/googletest-release-1.8.1/googletest/CMakeLists.txt
diff --git a/tests/functional/googletest-release-1.8.1/googletest/Makefile.am b/tests/googletest-release-1.8.1/googletest/Makefile.am
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/Makefile.am
rename to tests/googletest-release-1.8.1/googletest/Makefile.am
diff --git a/tests/functional/googletest-release-1.8.1/googletest/README.md b/tests/googletest-release-1.8.1/googletest/README.md
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/README.md
rename to tests/googletest-release-1.8.1/googletest/README.md
diff --git a/tests/functional/googletest-release-1.8.1/googletest/cmake/Config.cmake.in b/tests/googletest-release-1.8.1/googletest/cmake/Config.cmake.in
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/cmake/Config.cmake.in
rename to tests/googletest-release-1.8.1/googletest/cmake/Config.cmake.in
diff --git a/tests/functional/googletest-release-1.8.1/googletest/cmake/gtest.pc.in b/tests/googletest-release-1.8.1/googletest/cmake/gtest.pc.in
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/cmake/gtest.pc.in
rename to tests/googletest-release-1.8.1/googletest/cmake/gtest.pc.in
diff --git a/tests/functional/googletest-release-1.8.1/googletest/cmake/gtest_main.pc.in b/tests/googletest-release-1.8.1/googletest/cmake/gtest_main.pc.in
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/cmake/gtest_main.pc.in
rename to tests/googletest-release-1.8.1/googletest/cmake/gtest_main.pc.in
diff --git a/tests/functional/googletest-release-1.8.1/googletest/cmake/internal_utils.cmake b/tests/googletest-release-1.8.1/googletest/cmake/internal_utils.cmake
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/cmake/internal_utils.cmake
rename to tests/googletest-release-1.8.1/googletest/cmake/internal_utils.cmake
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-death-test.h b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest-death-test.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-death-test.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest-death-test.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-message.h b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest-message.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-message.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest-message.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-param-test.h b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest-param-test.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-param-test.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest-param-test.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-param-test.h.pump b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest-param-test.h.pump
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-param-test.h.pump
rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest-param-test.h.pump
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-printers.h b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest-printers.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-printers.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest-printers.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-spi.h b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest-spi.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-spi.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest-spi.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-test-part.h b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest-test-part.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-test-part.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest-test-part.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-typed-test.h b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest-typed-test.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest-typed-test.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest-typed-test.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest.h b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest_pred_impl.h b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest_pred_impl.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest_pred_impl.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest_pred_impl.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest_prod.h b/tests/googletest-release-1.8.1/googletest/include/gtest/gtest_prod.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/gtest_prod.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/gtest_prod.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/custom/README.md b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/custom/README.md
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/custom/README.md
rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/custom/README.md
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest-port.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest-port.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest-port.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest-port.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest-printers.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest-printers.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest-printers.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest-printers.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/custom/gtest.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-death-test-internal.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-death-test-internal.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-death-test-internal.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-death-test-internal.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-filepath.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-filepath.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-filepath.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-filepath.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-internal.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-internal.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-internal.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-internal.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-linked_ptr.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-linked_ptr.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-linked_ptr.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-linked_ptr.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util-generated.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util-generated.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util-generated.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util-generated.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util-generated.h.pump b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util-generated.h.pump
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util-generated.h.pump
rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util-generated.h.pump
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-param-util.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-port-arch.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-port-arch.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-port-arch.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-port-arch.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-port.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-port.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-port.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-port.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-string.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-string.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-string.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-string.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-tuple.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-tuple.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-tuple.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-tuple.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-tuple.h.pump b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-tuple.h.pump
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-tuple.h.pump
rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-tuple.h.pump
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-type-util.h b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-type-util.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-type-util.h
rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-type-util.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-type-util.h.pump b/tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-type-util.h.pump
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-type-util.h.pump
rename to tests/googletest-release-1.8.1/googletest/include/gtest/internal/gtest-type-util.h.pump
diff --git a/tests/functional/googletest-release-1.8.1/googletest/src/gtest-all.cc b/tests/googletest-release-1.8.1/googletest/src/gtest-all.cc
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/src/gtest-all.cc
rename to tests/googletest-release-1.8.1/googletest/src/gtest-all.cc
diff --git a/tests/functional/googletest-release-1.8.1/googletest/src/gtest-death-test.cc b/tests/googletest-release-1.8.1/googletest/src/gtest-death-test.cc
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/src/gtest-death-test.cc
rename to tests/googletest-release-1.8.1/googletest/src/gtest-death-test.cc
diff --git a/tests/functional/googletest-release-1.8.1/googletest/src/gtest-filepath.cc b/tests/googletest-release-1.8.1/googletest/src/gtest-filepath.cc
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/src/gtest-filepath.cc
rename to tests/googletest-release-1.8.1/googletest/src/gtest-filepath.cc
diff --git a/tests/functional/googletest-release-1.8.1/googletest/src/gtest-internal-inl.h b/tests/googletest-release-1.8.1/googletest/src/gtest-internal-inl.h
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/src/gtest-internal-inl.h
rename to tests/googletest-release-1.8.1/googletest/src/gtest-internal-inl.h
diff --git a/tests/functional/googletest-release-1.8.1/googletest/src/gtest-port.cc b/tests/googletest-release-1.8.1/googletest/src/gtest-port.cc
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/src/gtest-port.cc
rename to tests/googletest-release-1.8.1/googletest/src/gtest-port.cc
diff --git a/tests/functional/googletest-release-1.8.1/googletest/src/gtest-printers.cc b/tests/googletest-release-1.8.1/googletest/src/gtest-printers.cc
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/src/gtest-printers.cc
rename to tests/googletest-release-1.8.1/googletest/src/gtest-printers.cc
diff --git a/tests/functional/googletest-release-1.8.1/googletest/src/gtest-test-part.cc b/tests/googletest-release-1.8.1/googletest/src/gtest-test-part.cc
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/src/gtest-test-part.cc
rename to tests/googletest-release-1.8.1/googletest/src/gtest-test-part.cc
diff --git a/tests/functional/googletest-release-1.8.1/googletest/src/gtest-typed-test.cc b/tests/googletest-release-1.8.1/googletest/src/gtest-typed-test.cc
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/src/gtest-typed-test.cc
rename to tests/googletest-release-1.8.1/googletest/src/gtest-typed-test.cc
diff --git a/tests/functional/googletest-release-1.8.1/googletest/src/gtest.cc b/tests/googletest-release-1.8.1/googletest/src/gtest.cc
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/src/gtest.cc
rename to tests/googletest-release-1.8.1/googletest/src/gtest.cc
diff --git a/tests/functional/googletest-release-1.8.1/googletest/src/gtest_main.cc b/tests/googletest-release-1.8.1/googletest/src/gtest_main.cc
similarity index 100%
rename from tests/functional/googletest-release-1.8.1/googletest/src/gtest_main.cc
rename to tests/googletest-release-1.8.1/googletest/src/gtest_main.cc
diff --git a/third-party-programs.txt b/third-party-programs.txt
index 606400830..274a06f11 100644
--- a/third-party-programs.txt
+++ b/third-party-programs.txt
@@ -1,5 +1,5 @@
 Intel(R) oneAPI Collective Communications Library (oneCCL) 
-2021.2.0 Third Party Programs File
+2021.3.0 Third Party Programs File
 
 This file is the "third-party-programs.txt" file specified in the associated 
 Intel end user license agreement for the Intel software you are licensing.
@@ -209,7 +209,56 @@ Software.
 
 -------------------------------------------------------------------------------
 
-5. Googletest
+5. The Portable Hardware Locality (hwloc)
+
+ Copyright © 2004-2006 The Trustees of Indiana University and Indiana University Research and Technology Corporation.  All rights reserved.
+ Copyright © 2004-2005 The University of Tennessee and The University of Tennessee Research Foundation.  All rights reserved.
+ Copyright © 2004-2005 High Performance Computing Center Stuttgart, University of Stuttgart.  All rights reserved.
+ Copyright © 2004-2005 The Regents of the University of California. All rights reserved.
+ Copyright © 2009      CNRS
+ Copyright © 2009-2016 Inria.  All rights reserved.
+ Copyright © 2009-2015 Université Bordeaux
+ Copyright © 2009-2015 Cisco Systems, Inc.  All rights reserved.
+ Copyright © 2009-2012 Oracle and/or its affiliates.  All rights reserved.
+ Copyright © 2010      IBM
+ Copyright © 2010      Jirka Hladky
+ Copyright © 2012      Aleksej Saushev, The NetBSD Foundation
+ Copyright © 2012      Blue Brain Project, EPFL. All rights reserved.
+ Copyright © 2013-2014 University of Wisconsin-La Crosse. All rights reserved.
+ Copyright © 2015      Research Organization for Information Science and Technology (RIST). All rights reserved.
+ Copyright © 2015-2016 Intel, Inc.  All rights reserved.
+ See COPYING in top-level directory.
+
+ The 3-Clause BSD License
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+ - The name of the author may not be used to endorse or promote products
+   derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-------------------------------------------------------------------------------
+
+6. Googletest
 
   Copyright 2008, Google Inc.
   All rights reserved.