diff --git a/CMakeLists.txt b/CMakeLists.txt
index 50496efa1..c8e354f61 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -46,7 +46,7 @@ endif()
 
 option(BUILD_EXAMPLES "Build examples" TRUE)
 option(BUILD_FT "Build functional tests" TRUE)
-option(BUILD_UT "Build unit tests" FALSE)
+option(BUILD_REG_TESTS "Build regression tests" TRUE)
 option(BUILD_CONFIG "Build cmake configs" TRUE)
 option(ENABLE_MPI "Enable MPI for library" TRUE)
 option(ENABLE_MPI_TESTS "Enable MPI for tests" TRUE)
@@ -70,17 +70,16 @@ message(STATUS "C compiler : ${CMAKE_C_COMPILER}")
 message(STATUS "CXX compiler : ${CMAKE_CXX_COMPILER}")
 message(STATUS "Build examples: ${BUILD_EXAMPLES}")
 message(STATUS "Build functional tests: ${BUILD_FT}")
-message(STATUS "Build unit tests: ${BUILD_UT}")
 message(STATUS "Build cmake configs: ${BUILD_CONFIG}")
-message(STATUS "Enable MPI for library: ${ENABLE_MPI}")
-message(STATUS "Enable MPI for tests: ${ENABLE_MPI_TESTS}")
-message(STATUS "Enable support for interop event functionality: ${ENABLE_SYCL_INTEROP_EVENT}")
-message(STATUS "Enable support for OFI HMEM: ${ENABLE_OFI_HMEM}")
+message(STATUS "Enable MPI support: ${ENABLE_MPI}")
+message(STATUS "Enable MPI tests support: ${ENABLE_MPI_TESTS}")
+message(STATUS "Enable SYCL interop event support: ${ENABLE_SYCL_INTEROP_EVENT}")
+message(STATUS "Enable OFI HMEM support: ${ENABLE_OFI_HMEM}")
 
 add_definitions(-DCCL_C_COMPILER="${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
 add_definitions(-DCCL_CXX_COMPILER="${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
 
-SET(MULTI_GPU_SUPPORT OFF CACHE BOOL "Enable Multi GPU extension support")
+SET(CCL_ENABLE_ZE OFF CACHE BOOL "Enable Level Zero support")
 
 set(CCL_COMMON_INSTALL_PREFIX "intel64")
 set(CMAKE_INSTALL_LIBDIR "lib")
@@ -94,11 +93,8 @@ set(CCL_INSTALL_LICENSE "${CMAKE_INSTALL_PREFIX}/licensing")
 set(CCL_INSTALL_MODULE "${CMAKE_INSTALL_PREFIX}/modulefiles")
 set(CCL_INSTALL_EXAMPLES "${CMAKE_INSTALL_PREFIX}/examples")
 set(CCL_INSTALL_TESTS "${CMAKE_INSTALL_PREFIX}/tests")
-set(CCL_INSTALL_UNIT_TESTS "${CMAKE_INSTALL_PREFIX}/tests/unit")
 set(CCL_INSTALL_KERNELS "${CMAKE_INSTALL_PREFIX}/lib/kernels")
 
-set(CCL_UNIT_TESTS_BUILD "${CMAKE_BINARY_DIR}/tests/unit")
-
 # setup dependency directories
 set(DEPS_DIR "${PROJECT_SOURCE_DIR}/deps")
 
@@ -133,25 +129,21 @@ if (${CMAKE_VERSION} VERSION_LESS 3.1)
     set(C_COMPILER_FLAGS "-std=gnu99")
 endif()
 
-# TODO: add -Wextra to c/cxx flags
-
 # common release/debug compilation settings
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_COMPILER_FLAGS} -Wall -Werror -D_GNU_SOURCE -fvisibility=internal")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_COMPILER_FLAGS} -Wall -Wextra -Wno-unused-parameter -Wno-implicit-fallthrough -Werror -D_GNU_SOURCE -fvisibility=internal")
 set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} ${C_COMPILER_FLAGS} -O0 -g -DENABLE_DEBUG")
 set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} ${C_COMPILER_FLAGS} -O3")
 set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} ${C_COMPILER_FLAGS} -O2 -g")
 set(CMAKE_C_STANDARD 99)
 set(CMAKE_C_STANDARD_REQUIRED ON)
 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_COMPILER_FLAGS} -Wall -Werror -D_GNU_SOURCE -fvisibility=internal")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_COMPILER_FLAGS} -Wall -Wextra -Wno-unused-parameter -Wno-implicit-fallthrough -Werror -D_GNU_SOURCE -fvisibility=internal")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${CXX_COMPILER_FLAGS} -O0 -g -DENABLE_DEBUG")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${CXX_COMPILER_FLAGS} -O3")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} ${CXX_COMPILER_FLAGS} -O2 -g")
 set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
-set(TRY_ENABLE_SYCL_L0 ON)
-
 set(COMMON_CMAKE_DIR ${PROJECT_SOURCE_DIR}/cmake)
 if (COMPUTE_BACKEND)
     message(STATUS "COMPUTE_BACKEND: ${COMPUTE_BACKEND}")
@@ -192,6 +184,7 @@ if (WITH_ASAN AND ${CMAKE_BUILD_TYPE_CASE_INSENSITIVE} STREQUAL "debug")
 endif()
 
 set_lp_env()
+set_avx_env()
 
 set(CCL_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/src)
 
@@ -227,7 +220,7 @@ file(GLOB spv_kernels "${PROJECT_SOURCE_DIR}/src/kernels/kernels.spv")
 endif()
 
 set(CCL_MAJOR_VERSION     "2021")
-set(CCL_MINOR_VERSION     "4")
+set(CCL_MINOR_VERSION     "5")
 set(CCL_UPDATE_VERSION    "0")
 set(CCL_PRODUCT_STATUS    "Gold")
 string(TIMESTAMP CCL_PRODUCT_BUILD_DATE "%Y-%m-%dT %H:%M:%SZ")
@@ -257,7 +250,9 @@ if (ENABLE_MPI_TESTS)
         add_subdirectory(examples/benchmark)
         add_subdirectory(examples/common)
         add_subdirectory(examples/cpu)
-        add_subdirectory(examples/external_launcher)
+        if (BUILD_CONFIG)
+            add_subdirectory(examples/external_launcher)
+        endif()
         if (CCL_ENABLE_SYCL)
             add_subdirectory(examples/sycl)
         endif()
@@ -265,7 +260,4 @@ if (ENABLE_MPI_TESTS)
     if (BUILD_FT)
         add_subdirectory(tests/functional)
     endif()
-    if (BUILD_UT AND EXISTS "${PROJECT_SOURCE_DIR}/tests/unit")
-        add_subdirectory(tests/unit)
-    endif()
 endif()
diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake
index 245de34e1..8bba8e98e 100644
--- a/cmake/helpers.cmake
+++ b/cmake/helpers.cmake
@@ -88,6 +88,38 @@ function(set_lp_env)
 
 endfunction(set_lp_env)
 
+# Adds -DCCL_AVX_COMPILER when the C compiler is Intel, Clang >= 9.0.0 or
+# GNU >= 4.9.0, plus -DCCL_AVX_TARGET_ATTRIBUTES for Clang/GNU, and sets
+# AVX_ENV_DEFINED in the caller's scope.
+function(set_avx_env)
+    set(GCC_AVX_MIN_SUPPORTED "4.9.0")
+    set(CLANG_AVX_MIN_SUPPORTED "9.0.0")
+
+    # Expansions are quoted so an empty compiler id/version cannot break if().
+    if ("${CMAKE_C_COMPILER_ID}" STREQUAL "Intel"
+        OR ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"
+            AND NOT "${CMAKE_C_COMPILER_VERSION}" VERSION_LESS "${CLANG_AVX_MIN_SUPPORTED}")
+        OR ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU"
+            AND NOT "${CMAKE_C_COMPILER_VERSION}" VERSION_LESS "${GCC_AVX_MIN_SUPPORTED}"))
+        add_definitions(-DCCL_AVX_COMPILER)
+        set(CCL_AVX_COMPILER ON)
+    else()
+        set(CCL_AVX_COMPILER OFF)
+    endif()
+    message(STATUS "AVX compiler: ${CCL_AVX_COMPILER}")
+
+    if (CCL_AVX_COMPILER)
+        if ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU")
+            add_definitions(-DCCL_AVX_TARGET_ATTRIBUTES)
+            set(CCL_AVX_TARGET_ATTRIBUTES ON)
+        else()
+            set(CCL_AVX_TARGET_ATTRIBUTES OFF)
+        endif()
+        message(STATUS "AVX target attributes: ${CCL_AVX_TARGET_ATTRIBUTES}")
+    endif()
+    set(AVX_ENV_DEFINED 1 PARENT_SCOPE)
+endfunction(set_avx_env)
+
 function(check_compiler_version)
 
     set(GCC_MIN_SUPPORTED "4.8")
@@ -293,11 +325,11 @@ function(set_compute_backend COMMON_CMAKE_DIR)
     endif()
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMPUTE_BACKEND_FLAGS}")
     if (${COMPUTE_BACKEND_TARGET_NAME} STREQUAL "Intel::SYCL_level_zero" OR ${COMPUTE_BACKEND_TARGET_NAME} STREQUAL "ze_loader")
-        set(MULTI_GPU_SUPPORT ON PARENT_SCOPE)
-        set(MULTI_GPU_SUPPORT ON)
+        set(CCL_ENABLE_ZE ON PARENT_SCOPE)
+        set(CCL_ENABLE_ZE ON)
     endif()
-    if (MULTI_GPU_SUPPORT)
-        message(STATUS "Enable GPU support using level-zero")
+    if (CCL_ENABLE_ZE)
+        message(STATUS "Enable Level Zero support")
     endif()
 
     # need to pass these variables to overlying function
diff --git a/cmake/templates/oneCCLConfig.cmake.in b/cmake/templates/oneCCLConfig.cmake.in
index 86b7de9f8..0decd0c19 100644
--- a/cmake/templates/oneCCLConfig.cmake.in
+++ b/cmake/templates/oneCCLConfig.cmake.in
@@ -23,11 +23,12 @@ if (EXISTS "${CCL_CONFIGURATION}")
     set(_oneccl_subdir "${CCL_CONFIGURATION}")
 endif()
 
-if (_oneccl_subdir EQUAL "cpu_icc")
+# STREQUAL compares text; EQUAL is numeric and never matches a name like "cpu".
+if (_oneccl_subdir STREQUAL "cpu")
     include(CheckCXXCompilerFlag)
     check_cxx_compiler_flag("-fsycl" _fsycl_option)
     if (_fsycl_option)
-        message(STATUS "STATUS: -fsycl not supported for CCL_CONFIGURATION=cpu_icc")
+        message(STATUS "STATUS: -fsycl not supported for CCL_CONFIGURATION=cpu")
     endif()
 endif()
 
diff --git a/deps/mpi/bin/hydra_bstrap_proxy b/deps/mpi/bin/hydra_bstrap_proxy
index 6a2e27a5b..665e28ffd 100755
Binary files a/deps/mpi/bin/hydra_bstrap_proxy and b/deps/mpi/bin/hydra_bstrap_proxy differ
diff --git a/deps/mpi/bin/hydra_nameserver b/deps/mpi/bin/hydra_nameserver
index 3af2dc9bc..5d91b8cae 100755
Binary files a/deps/mpi/bin/hydra_nameserver and b/deps/mpi/bin/hydra_nameserver differ
diff --git a/deps/mpi/bin/hydra_pmi_proxy b/deps/mpi/bin/hydra_pmi_proxy
index 6e09d880f..644a5a5e7 100755
Binary files a/deps/mpi/bin/hydra_pmi_proxy and b/deps/mpi/bin/hydra_pmi_proxy differ
diff --git a/deps/mpi/bin/mpiexec b/deps/mpi/bin/mpiexec
index 61a4ff30a..2fca15c37 100755
Binary files a/deps/mpi/bin/mpiexec and b/deps/mpi/bin/mpiexec differ
diff --git a/deps/mpi/bin/mpiexec.hydra b/deps/mpi/bin/mpiexec.hydra
index 61a4ff30a..2fca15c37 100755
Binary files a/deps/mpi/bin/mpiexec.hydra and b/deps/mpi/bin/mpiexec.hydra differ
diff --git a/deps/mpi/bin/mpigcc b/deps/mpi/bin/mpigcc
index c54304f11..338750e29 100755
--- a/deps/mpi/bin/mpigcc
+++ b/deps/mpi/bin/mpigcc
@@ -99,12 +99,8 @@ fi
 # Determined by a combination of environment variables and tests within
 # configure (e.g., determining whehter -lsocket is needee)
 CC="gcc"
-MPICH_VERSION="3.3"
-CFLAGS=""
-CPPFLAGS=""
-LDFLAGS=" -Wl,-z,now -Wl,-z,relro -Wl,-z,noexecstack -Xlinker --enable-new-dtags -ldl "
-LIBS="-lm   -lpthread  -lfabric -lrt "
-MPIVERSION="2021.4"
+MPICH_VERSION="3.4a2"
+MPIVERSION="2021.5"
 MPILIBNAME="mpi"                           
 
 
@@ -594,10 +590,6 @@ fi
 final_cppflags=" "
 final_ldflags="  -Wl,-z,now -Wl,-z,relro -Wl,-z,noexecstack -Xlinker --enable-new-dtags -ldl "
 final_libs="-lpthread -lrt "
-if test "no" = "no" -o "${interlib_deps}" = "no" ; then
-    final_ldflags="${final_ldflags}  -Wl,-z,now -Wl,-z,relro -Wl,-z,noexecstack -Xlinker --enable-new-dtags -ldl -L/p/pdsd/scratch/jenkins/artefacts_impi_2019/hcoll/lib -L/p/pdsd/scratch/Uploads/IMPI/other/software/libfabric/linux/v1.9.0/lib"
-    final_libs="${final_libs}  -lm   -lpthread  -lfabric -lrt "
-fi
 
 # -----------------------------------------------------------------------
 #
@@ -622,7 +614,7 @@ if [ "$linking" = yes ] ; then
         $Show $CC ${final_cppflags} $PROFILE_INCPATHS ${final_cflags} ${final_ldflags} $allargs -I\"${includedir}\"
         rc=$?
     else
-        $Show $CC $CPPFLAGS $CFLAGS $allargs -I\"${includedir}\" -L\"${libdir}${MPILIBDIR}\" -L\"${libdir}\" $rpath_opt $mpilibs $I_MPI_OTHERLIBS $LDFLAGS
+        $Show $CC $CPPFLAGS $CFLAGS $allargs -I\"${includedir}\" -L\"${libdir}${MPILIBDIR}\" -L\"${libdir}\" $rpath_opt $mpilibs $I_MPI_OTHERLIBS ${final_ldflags}
         rc=$?
 
         if [ $rc -eq 0 -a "x$strip_debug_info" = "xyes" ] ; then
diff --git a/deps/mpi/bin/mpigxx b/deps/mpi/bin/mpigxx
index b9382fd8c..65ac0f5d3 100755
--- a/deps/mpi/bin/mpigxx
+++ b/deps/mpi/bin/mpigxx
@@ -97,11 +97,8 @@ fi
 
 # Default settings for compiler, flags, and libraries
 CXX="g++"
-MPICH_VERSION="3.3"
-CXXFLAGS=""
-LDFLAGS=" -Wl,-z,now -Wl,-z,relro -Wl,-z,noexecstack -Xlinker --enable-new-dtags -ldl "
-LIBS="-lm   -lpthread  -lfabric -lrt "
-MPIVERSION="2021.4"
+MPICH_VERSION="3.4a2"
+MPIVERSION="2021.5"
 MPILIBNAME="mpi"
 MPICXXLIBNAME="mpicxx"
 
@@ -606,10 +603,6 @@ fi
 final_cppflags=" "
 final_ldflags="  -Wl,-z,now -Wl,-z,relro -Wl,-z,noexecstack -Xlinker --enable-new-dtags -ldl "
 final_libs="-lpthread -lrt "
-if test "no" = "no" -o "${interlib_deps}" = "no" ; then
-    final_ldflags="${final_ldflags}  -Wl,-z,now -Wl,-z,relro -Wl,-z,noexecstack -Xlinker --enable-new-dtags -ldl -L/p/pdsd/scratch/jenkins/artefacts_impi_2019/hcoll/lib -L/p/pdsd/scratch/Uploads/IMPI/other/software/libfabric/linux/v1.9.0/lib"
-    final_libs="${final_libs}  -lm   -lpthread  -lfabric -lrt "
-fi
 
 # A temporary statement to invoke the compiler
 # Place the -L before any args incase there are any mpi libraries in there.
@@ -625,7 +618,7 @@ if [ "$linking" = yes ] ; then
         $Show $CXX ${final_cppflags} $PROFILE_INCPATHS ${final_cxxflags} ${final_ldflags} $allargs -I\"${includedir}\"
         rc=$?
     else
-        $Show $CXX $CXXFLAGS $allargs -I\"${includedir}\" -L\"${libdir}${MPILIBDIR}\" -L\"${libdir}\" $rpath_opt $shllibpath $cxxlibs $mpilibs $I_MPI_OTHERLIBS $LDFLAGS
+        $Show $CXX $CXXFLAGS $allargs -I\"${includedir}\" -L\"${libdir}${MPILIBDIR}\" -L\"${libdir}\" $rpath_opt $shllibpath $cxxlibs $mpilibs $I_MPI_OTHERLIBS ${final_ldflags}
         rc=$?
         if [ $rc -eq 0 -a "x$strip_debug_info" = "xyes" ] ; then
             $Show objcopy --only-keep-debug ${executable} ${executable}.dbg
diff --git a/deps/mpi/bin/mpiicc b/deps/mpi/bin/mpiicc
index 25c4dea5b..581e5c29a 100755
--- a/deps/mpi/bin/mpiicc
+++ b/deps/mpi/bin/mpiicc
@@ -106,7 +106,7 @@ LDFLAGS="-ldl"
 MPILIBNAME="mpi"
 
 # MPIVERSION is the version of the MPICH2 library that mpicc is intended for
-MPIVERSION="2021.4"
+MPIVERSION="2021.5"
 #
 # Internal variables
 # Show is set to echo to cause the compilation command to be echoed instead
diff --git a/deps/mpi/bin/mpiicpc b/deps/mpi/bin/mpiicpc
index 1e221dbbd..e2377755f 100755
--- a/deps/mpi/bin/mpiicpc
+++ b/deps/mpi/bin/mpiicpc
@@ -107,7 +107,7 @@ MPILIBNAME="mpi"
 MPICXXLIBNAME="mpicxx"
 
 # MPIVERSION is the version of the Intel(R) MPI Library that mpiicpc is intended for
-MPIVERSION="2021.4"
+MPIVERSION="2021.5"
 
 # Internal variables
 # Show is set to echo to cause the compilation command to be echoed instead
diff --git a/deps/mpi/etc/tuning_clx-ap_ofi.dat b/deps/mpi/etc/tuning_clx-ap_ofi.dat
old mode 100755
new mode 100644
diff --git a/deps/mpi/etc/tuning_clx-ap_shm-ofi.dat b/deps/mpi/etc/tuning_clx-ap_shm-ofi.dat
old mode 100755
new mode 100644
diff --git a/deps/mpi/etc/tuning_clx-ap_shm.dat b/deps/mpi/etc/tuning_clx-ap_shm.dat
old mode 100755
new mode 100644
diff --git a/deps/mpi/etc/tuning_generic_ofi.dat b/deps/mpi/etc/tuning_generic_ofi.dat
old mode 100755
new mode 100644
diff --git a/deps/mpi/etc/tuning_generic_shm-ofi.dat b/deps/mpi/etc/tuning_generic_shm-ofi.dat
old mode 100755
new mode 100644
diff --git a/deps/mpi/etc/tuning_generic_shm.dat b/deps/mpi/etc/tuning_generic_shm.dat
old mode 100755
new mode 100644
diff --git a/deps/mpi/etc/tuning_knl_ofi.dat b/deps/mpi/etc/tuning_knl_ofi.dat
old mode 100755
new mode 100644
diff --git a/deps/mpi/etc/tuning_knl_shm-ofi.dat b/deps/mpi/etc/tuning_knl_shm-ofi.dat
old mode 100755
new mode 100644
diff --git a/deps/mpi/etc/tuning_knl_shm.dat b/deps/mpi/etc/tuning_knl_shm.dat
old mode 100755
new mode 100644
diff --git a/deps/mpi/etc/tuning_skx_ofi.dat b/deps/mpi/etc/tuning_skx_ofi.dat
old mode 100755
new mode 100644
diff --git a/deps/mpi/etc/tuning_skx_shm-ofi.dat b/deps/mpi/etc/tuning_skx_shm-ofi.dat
old mode 100755
new mode 100644
diff --git a/deps/mpi/etc/tuning_skx_shm.dat b/deps/mpi/etc/tuning_skx_shm.dat
old mode 100755
new mode 100644
diff --git a/deps/mpi/include/mpi.h b/deps/mpi/include/mpi.h
old mode 100755
new mode 100644
index 3dc48685b..adc1f2297
--- a/deps/mpi/include/mpi.h
+++ b/deps/mpi/include/mpi.h
@@ -580,8 +580,8 @@ typedef int (MPI_Delete_function) ( MPI_Comm, int, void *, void * );
  * digits for REV, 1 digit for EXT and 2 digits for EXT_NUMBER. So,
  * 2019.0.0b0 will have the numeric version 20190000100.
  */
-#define I_MPI_VERSION "2021.4.0"
-#define I_MPI_NUMVERSION 20210400300
+#define I_MPI_VERSION "2021.5.0"
+#define I_MPI_NUMVERSION 20210500300
 
 /* for the datatype decoders */
 enum MPIR_Combiner_enum {
diff --git a/deps/mpi/include/mpicxx.h b/deps/mpi/include/mpicxx.h
old mode 100755
new mode 100644
diff --git a/deps/mpi/include/mpio.h b/deps/mpi/include/mpio.h
old mode 100755
new mode 100644
diff --git a/deps/mpi/lib/libmpi.so b/deps/mpi/lib/libmpi.so
index 84631e5a7..5b05a5027 100755
Binary files a/deps/mpi/lib/libmpi.so and b/deps/mpi/lib/libmpi.so differ
diff --git a/deps/mpi/lib/libmpi.so.12 b/deps/mpi/lib/libmpi.so.12
index 84631e5a7..5b05a5027 100755
Binary files a/deps/mpi/lib/libmpi.so.12 and b/deps/mpi/lib/libmpi.so.12 differ
diff --git a/deps/mpi/lib/libmpi.so.12.0 b/deps/mpi/lib/libmpi.so.12.0
index 84631e5a7..5b05a5027 100755
Binary files a/deps/mpi/lib/libmpi.so.12.0 and b/deps/mpi/lib/libmpi.so.12.0 differ
diff --git a/deps/mpi/lib/libmpi.so.12.0.0 b/deps/mpi/lib/libmpi.so.12.0.0
index 84631e5a7..5b05a5027 100755
Binary files a/deps/mpi/lib/libmpi.so.12.0.0 and b/deps/mpi/lib/libmpi.so.12.0.0 differ
diff --git a/deps/mpi/lib/libmpifort.so b/deps/mpi/lib/libmpifort.so
index 00d80af4b..399678958 100755
Binary files a/deps/mpi/lib/libmpifort.so and b/deps/mpi/lib/libmpifort.so differ
diff --git a/deps/mpi/lib/libmpifort.so.12 b/deps/mpi/lib/libmpifort.so.12
index 00d80af4b..399678958 100755
Binary files a/deps/mpi/lib/libmpifort.so.12 and b/deps/mpi/lib/libmpifort.so.12 differ
diff --git a/deps/mpi/lib/libmpifort.so.12.0 b/deps/mpi/lib/libmpifort.so.12.0
index 00d80af4b..399678958 100755
Binary files a/deps/mpi/lib/libmpifort.so.12.0 and b/deps/mpi/lib/libmpifort.so.12.0 differ
diff --git a/deps/mpi/lib/libmpifort.so.12.0.0 b/deps/mpi/lib/libmpifort.so.12.0.0
index 00d80af4b..399678958 100755
Binary files a/deps/mpi/lib/libmpifort.so.12.0.0 and b/deps/mpi/lib/libmpifort.so.12.0.0 differ
diff --git a/deps/mpi/licensing/license.txt b/deps/mpi/licensing/license.txt
index ffffdc860..f987e502b 100644
--- a/deps/mpi/licensing/license.txt
+++ b/deps/mpi/licensing/license.txt
@@ -1,77 +1,73 @@
-Intel Simplified Software License (Version February 2020)
+Intel Simplified Software License (Version August 2021)
 
-Use and Redistribution. You may use and redistribute the software (the 
+Use and Redistribution. You may use and redistribute the software (the
 "Software"), without modification, provided the following conditions are met:
 
-* Redistributions must reproduce the above copyright notice and the following 
-  terms of use in the Software and in the documentation and/or other materials 
+* Redistributions must reproduce the above copyright notice and the following
+  terms of use in the Software and in the documentation and/or other materials
   provided with the distribution.
 * Neither the name of Intel nor the names of its suppliers may be used to 
-  endorse or promote products derived from this Software without specific prior 
-  written permission.
-* No reverse engineering, decompilation, or disassembly of this Software is 
+  endorse or promote products derived from this Software without specific  
+  prior written permission.
+* No reverse engineering, decompilation, or disassembly of this Software is
   permitted.
 
-Limited patent license. Intel grants you a world-wide, royalty-free, 
-non-exclusive license under patents it now or hereafter owns or controls to 
-make, have made, use, import, offer to sell and sell ("Utilize") this Software, 
-but solely to the extent that any such patent is necessary to Utilize the 
-Software alone. The patent license shall not apply to any combinations which 
-include this software. No hardware per se is licensed hereunder.
+No other licenses. Except as provided in the preceding section, Intel grants no
+licenses or other rights by implication, estoppel or otherwise to, patent,
+copyright, trademark, trade name, service mark or other intellectual property
+licenses or rights of Intel.
 
-Third party programs. The Software may contain Third Party Programs. "Third 
-Party Programs" are third party software, open source software or other Intel 
-software listed in the "third-party-programs.txt"  or other similarly named text 
-file that is included with the Software. Third Party Programs, even if included 
-with the distribution of the Software, may be governed by separate license 
-terms, including without limitation, third party license terms, open source 
-software notices and terms, and/or other Intel software license terms. These 
-separate license terms may govern your use of the Third Party Programs.  
+Third party software. The Software may contain Third Party Software. "Third
+Party Software" is open source software, third party software, or other Intel
+software that may be identified in the Software itself or in the files (if any)
+listed in the "third-party-software.txt" or similarly named text file included
+with the Software. Third Party Software, even if included with the distribution
+of the Software, may be governed by separate license terms, including without
+limitation, open source software license terms, third party software license
+terms, and other Intel software license terms. Those separate license terms
+solely govern your use of the Third Party Software, and nothing in this license
+limits any rights under, or grants rights that supersede, the terms of the
+applicable license terms.
 
-DISCLAIMER. THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED 
-WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT ARE 
-DISCLAIMED. THIS SOFTWARE IS NOT INTENDED FOR USE IN SYSTEMS OR APPLICATIONS 
-WHERE FAILURE OF THE SOFTWARE MAY CAUSE PERSONAL INJURY OR DEATH AND YOU AGREE 
-THAT YOU ARE FULLY RESPONSIBLE FOR ANY CLAIMS, COSTS, DAMAGES, EXPENSES, AND 
-ATTORNEYS' FEES ARISING OUT OF ANY SUCH USE, EVEN IF ANY CLAIM ALLEGES THAT 
-INTEL WAS NEGLIGENT REGARDING THE DESIGN OR MANUFACTURE OF THE MATERIALS.
+DISCLAIMER. THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT ARE
+DISCLAIMED. THIS SOFTWARE IS NOT INTENDED FOR USE IN SYSTEMS OR APPLICATIONS
+WHERE FAILURE OF THE SOFTWARE MAY CAUSE PERSONAL INJURY OR DEATH AND YOU AGREE
+THAT YOU ARE FULLY RESPONSIBLE FOR ANY CLAIMS, COSTS, DAMAGES, EXPENSES, AND
+ATTORNEYS' FEES ARISING OUT OF ANY SUCH USE, EVEN IF ANY CLAIM ALLEGES THAT
+INTEL WAS NEGLIGENT REGARDING THE DESIGN OR MANUFACTURE OF THE SOFTWARE.
 
-LIMITATION OF LIABILITY. IN NO EVENT WILL INTEL BE LIABLE FOR ANY DIRECT, 
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 
-OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
-ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. YOU AGREE TO INDEMNIFY AND HOLD 
-INTEL HARMLESS AGAINST ANY CLAIMS AND EXPENSES RESULTING FROM YOUR USE OR 
-UNAUTHORIZED USE OF THE SOFTWARE.
+LIMITATION OF LIABILITY. IN NO EVENT WILL INTEL BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-No support. Intel may make changes to the Software, at any time without notice, 
-and is not obligated to support, update or provide training for the Software. 
+No support. Intel may make changes to the Software, at any time without notice,
+and is not obligated to support, update or provide training for the Software.
 
-Termination. Intel may terminate your right to use the Software in the event of 
-your breach of this Agreement and you fail to cure the breach within a 
-reasonable period of time.
+Termination. Your right to use the Software is terminated in the event of your
+breach of this license.
 
-Feedback. Should you provide Intel with comments, modifications, corrections, 
-enhancements or other input ("Feedback") related to the Software Intel will be 
-free to use, disclose, reproduce, license or otherwise distribute or exploit the 
-Feedback in its sole discretion without any obligations or restrictions of any 
-kind, including without limitation, intellectual property rights or licensing 
+Feedback. Should you provide Intel with comments, modifications, corrections,
+enhancements or other input ("Feedback") related to the Software, Intel will be
+free to use, disclose, reproduce, license or otherwise distribute or exploit the
+Feedback in its sole discretion without any obligations or restrictions of any
+kind, including without limitation, intellectual property rights or licensing
 obligations.
 
-Compliance with laws. You agree to comply with all relevant laws and regulations 
-governing your use, transfer, import or export (or prohibition thereof) of the 
+Compliance with laws. You agree to comply with all relevant laws and regulations
+governing your use, transfer, import or export (or prohibition thereof) of the
 Software.
 
-Governing law. All disputes will be governed by the laws of the United States of 
-America and the State of Delaware without reference to conflict of law 
-principles and subject to the exclusive jurisdiction of the state or federal 
-courts sitting in the State of Delaware, and each party agrees that it submits 
-to the personal jurisdiction and venue of those courts and waives any 
-objections. The United Nations Convention on Contracts for the International 
-Sale of Goods (1980) is specifically excluded and will not apply to the 
+Governing law. All disputes will be governed by the laws of the United States of
+America and the State of Delaware without reference to conflict of law
+principles and subject to the exclusive jurisdiction of the state or federal
+courts sitting in the State of Delaware, and each party agrees that it submits
+to the personal jurisdiction and venue of those courts and waives any
+objections. The United Nations Convention on Contracts for the International
+Sale of Goods (1980) is specifically excluded and will not apply to the
 Software.
-
-*Other names and brands may be claimed as the property of others.
diff --git a/deps/mpi/licensing/third-party-programs.txt b/deps/mpi/licensing/third-party-programs.txt
index f85123769..12d94d3db 100644
--- a/deps/mpi/licensing/third-party-programs.txt
+++ b/deps/mpi/licensing/third-party-programs.txt
@@ -1,4 +1,4 @@
-Intel(R) MPI Library 2021.4 Third Party Programs File
+Intel(R) MPI Library 2021.5 Third Party Programs File
 
 This file is the "third-party-programs.txt" file specified in the associated 
 Intel end user license agreement for the Intel software you are licensing.
@@ -270,87 +270,82 @@ terms are listed below.
   
 -------------------------------------------------------------------------------  
 
-5. Intel® Distribution for Python
+5. Intel® Distribution for Python*
 
-   Intel Simplified Software License (Version February 2020)
+   Intel Simplified Software License (Version August 2021)
 
-  Use and Redistribution. You may use and redistribute the software (the 
+  Use and Redistribution. You may use and redistribute the software (the
   "Software"), without modification, provided the following conditions are met:
 
-  * Redistributions must reproduce the above copyright notice and the following 
-  terms of use in the Software and in the documentation and/or other materials 
-  provided with the distribution.
-  * Neither the name of Intel nor the names of its suppliers may be used to 
-  endorse or promote products derived from this Software without specific prior 
-  written permission.
-  * No reverse engineering, decompilation, or disassembly of this Software is 
-  permitted.
-
-  Limited patent license. Intel grants you a world-wide, royalty-free, 
-  non-exclusive license under patents it now or hereafter owns or controls to 
-  make, have made, use, import, offer to sell and sell ("Utilize") this Software, 
-  but solely to the extent that any such patent is necessary to Utilize the 
-  Software alone. The patent license shall not apply to any combinations which 
-  include this software. No hardware per se is licensed hereunder.
-
-  Third party programs. The Software may contain Third Party Programs. "Third 
-  Party Programs" are third party software, open source software or other Intel 
-  software listed in the "third-party-programs.txt"  or other similarly named text 
-  file that is included with the Software. Third Party Programs, even if included 
-  with the distribution of the Software, may be governed by separate license 
-  terms, including without limitation, third party license terms, open source 
-  software notices and terms, and/or other Intel software license terms. These 
-  separate license terms may govern your use of the Third Party Programs.
-
-  DISCLAIMER. THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED 
-  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 
-  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT ARE 
-  DISCLAIMED. THIS SOFTWARE IS NOT INTENDED FOR USE IN SYSTEMS OR APPLICATIONS 
-  WHERE FAILURE OF THE SOFTWARE MAY CAUSE PERSONAL INJURY OR DEATH AND YOU AGREE 
-  THAT YOU ARE FULLY RESPONSIBLE FOR ANY CLAIMS, COSTS, DAMAGES, EXPENSES, AND 
-  ATTORNEYS' FEES ARISING OUT OF ANY SUCH USE, EVEN IF ANY CLAIM ALLEGES THAT 
-  INTEL WAS NEGLIGENT REGARDING THE DESIGN OR MANUFACTURE OF THE MATERIALS.
-
-  LIMITATION OF LIABILITY. IN NO EVENT WILL INTEL BE LIABLE FOR ANY DIRECT, 
-  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
-  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
-  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 
-  OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
-  ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. YOU AGREE TO INDEMNIFY AND HOLD 
-  INTEL HARMLESS AGAINST ANY CLAIMS AND EXPENSES RESULTING FROM YOUR USE OR 
-  UNAUTHORIZED USE OF THE SOFTWARE.
-
-  No support. Intel may make changes to the Software, at any time without notice, 
-  and is not obligated to support, update or provide training for the Software. 
-
-  Termination. Intel may terminate your right to use the Software in the event of 
-  your breach of this Agreement and you fail to cure the breach within a 
-  reasonable period of time.
-
-  Feedback. Should you provide Intel with comments, modifications, corrections, 
-  enhancements or other input ("Feedback") related to the Software Intel will be 
-  free to use, disclose, reproduce, license or otherwise distribute or exploit the 
-  Feedback in its sole discretion without any obligations or restrictions of any 
-  kind, including without limitation, intellectual property rights or licensing 
+ * Redistributions must reproduce the above copyright notice and the following
+   terms of use in the Software and in the documentation and/or other materials
+   provided with the distribution.
+ * Neither the name of Intel nor the names of its suppliers may be used to 
+   endorse or promote products derived from this Software without specific  
+   prior written permission.
+ * No reverse engineering, decompilation, or disassembly of this Software is
+   permitted.
+
+  No other licenses. Except as provided in the preceding section, Intel grants no
+  licenses or other rights by implication, estoppel or otherwise to, patent,
+  copyright, trademark, trade name, service mark or other intellectual property
+  licenses or rights of Intel.
+
+  Third party software. The Software may contain Third Party Software. "Third
+  Party Software" is open source software, third party software, or other Intel
+  software that may be identified in the Software itself or in the files (if any)
+  listed in the "third-party-software.txt" or similarly named text file included
+  with the Software. Third Party Software, even if included with the distribution
+  of the Software, may be governed by separate license terms, including without
+  limitation, open source software license terms, third party software license
+  terms, and other Intel software license terms. Those separate license terms
+  solely govern your use of the Third Party Software, and nothing in this license
+  limits any rights under, or grants rights that supersede, the terms of the
+  applicable license terms.
+
+  DISCLAIMER. THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
+  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT ARE
+  DISCLAIMED. THIS SOFTWARE IS NOT INTENDED FOR USE IN SYSTEMS OR APPLICATIONS
+  WHERE FAILURE OF THE SOFTWARE MAY CAUSE PERSONAL INJURY OR DEATH AND YOU AGREE
+  THAT YOU ARE FULLY RESPONSIBLE FOR ANY CLAIMS, COSTS, DAMAGES, EXPENSES, AND
+  ATTORNEYS' FEES ARISING OUT OF ANY SUCH USE, EVEN IF ANY CLAIM ALLEGES THAT
+  INTEL WAS NEGLIGENT REGARDING THE DESIGN OR MANUFACTURE OF THE SOFTWARE.
+
+  LIMITATION OF LIABILITY. IN NO EVENT WILL INTEL BE LIABLE FOR ANY DIRECT,
+  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+  OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+  ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  No support. Intel may make changes to the Software, at any time without notice,
+  and is not obligated to support, update or provide training for the Software.
+
+  Termination. Your right to use the Software is terminated in the event of your
+  breach of this license.
+
+  Feedback. Should you provide Intel with comments, modifications, corrections,
+  enhancements or other input ("Feedback") related to the Software, Intel will be
+  free to use, disclose, reproduce, license or otherwise distribute or exploit the
+  Feedback in its sole discretion without any obligations or restrictions of any
+  kind, including without limitation, intellectual property rights or licensing
   obligations.
 
-  Compliance with laws. You agree to comply with all relevant laws and regulations 
-  governing your use, transfer, import or export (or prohibition thereof) of the 
+  Compliance with laws. You agree to comply with all relevant laws and regulations
+  governing your use, transfer, import or export (or prohibition thereof) of the
   Software.
 
-  Governing law. All disputes will be governed by the laws of the United States of 
-  America and the State of Delaware without reference to conflict of law 
-  principles and subject to the exclusive jurisdiction of the state or federal 
-  courts sitting in the State of Delaware, and each party agrees that it submits 
-  to the personal jurisdiction and venue of those courts and waives any 
-  objections. The United Nations Convention on Contracts for the International 
-  Sale of Goods (1980) is specifically excluded and will not apply to the 
+  Governing law. All disputes will be governed by the laws of the United States of
+  America and the State of Delaware without reference to conflict of law
+  principles and subject to the exclusive jurisdiction of the state or federal
+  courts sitting in the State of Delaware, and each party agrees that it submits
+  to the personal jurisdiction and venue of those courts and waives any
+  objections. The United Nations Convention on Contracts for the International
+  Sale of Goods (1980) is specifically excluded and will not apply to the
   Software.
-
-  *Other names and brands may be claimed as the property of others. 
-
-
+  
 -------------------------------------------------------------------------------
 
 6. uthash
@@ -481,42 +476,110 @@ terms are listed below.
 
 -------------------------------------------------------------------------------
 
-10.  PMIx
-    Copyright (c) 2019, PMIx
-    All rights reserved.
+10.  OpenPMIx
+   Most files in this release are marked with the copyrights of the
+organizations who have edited them.  The copyrights below are in no
+particular order and generally reflect members of the Open MPI core
+team who have contributed code that may or may not have been ported
+to PMIx. Per the terms of that LICENSE, we include the list here.
+The copyrights for code used under license from other parties
+are included in the corresponding files.
 
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions are met:
+Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
+                        University Research and Technology
+                        Corporation.  All rights reserved.
+Copyright (c) 2004-2010 The University of Tennessee and The University
+                        of Tennessee Research Foundation.  All rights
+                        reserved.
+Copyright (c) 2004-2010 High Performance Computing Center Stuttgart,
+                        University of Stuttgart.  All rights reserved.
+Copyright (c) 2004-2008 The Regents of the University of California.
+                        All rights reserved.
+Copyright (c) 2006-2010 Los Alamos National Security, LLC.  All rights
+                        reserved.
+Copyright (c) 2006-2010 Cisco Systems, Inc.  All rights reserved.
+Copyright (c) 2006-2010 Voltaire, Inc. All rights reserved.
+Copyright (c) 2006-2011 Sandia National Laboratories. All rights reserved.
+Copyright (c) 2006-2010 Sun Microsystems, Inc.  All rights reserved.
+                        Use is subject to license terms.
+Copyright (c) 2006-2010 The University of Houston. All rights reserved.
+Copyright (c) 2006-2009 Myricom, Inc.  All rights reserved.
+Copyright (c) 2007-2008 UT-Battelle, LLC. All rights reserved.
+Copyright (c) 2007-2019 IBM Corporation.  All rights reserved.
+Copyright (c) 1998-2005 Forschungszentrum Juelich, Juelich Supercomputing
+                        Centre, Federal Republic of Germany
+Copyright (c) 2005-2008 ZIH, TU Dresden, Federal Republic of Germany
+Copyright (c) 2007      Evergrid, Inc. All rights reserved.
+Copyright (c) 2008      Chelsio, Inc.  All rights reserved.
+Copyright (c) 2008-2009 Institut National de Recherche en
+                        Informatique.  All rights reserved.
+Copyright (c) 2007      Lawrence Livermore National Security, LLC.
+                        All rights reserved.
+Copyright (c) 2007-2019 Mellanox Technologies.  All rights reserved.
+Copyright (c) 2006-2010 QLogic Corporation.  All rights reserved.
+Copyright (c) 2008-2010 Oak Ridge National Labs.  All rights reserved.
+Copyright (c) 2006-2010 Oracle and/or its affiliates.  All rights reserved.
+Copyright (c) 2009      Bull SAS.  All rights reserved.
+Copyright (c) 2010      ARM ltd.  All rights reserved.
+Copyright (c) 2010-2011 Alex Brick <bricka@ccs.neu.edu>.  All rights reserved.
+Copyright (c) 2012      The University of Wisconsin-La Crosse. All rights
+                        reserved.
+Copyright (c) 2013-2019 Intel, Inc. All rights reserved.
+Copyright (c) 2011-2014 NVIDIA Corporation.  All rights reserved.
+Copyright (c) 2019      Amazon.com, Inc. or its affiliates.  All Rights
+                        reserved.
 
-  1. Redistributions of source code must retain the above copyright notice, this
-    list of conditions and the following disclaimer.
-
-  2. Redistributions in binary form must reproduce the above copyright notice,
-    this list of conditions and the following disclaimer in the documentation
-    and/or other materials provided with the distribution.
-
-  3. Neither the name of the copyright holder nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+$COPYRIGHT$
+
+Additional copyrights may follow
+
+$HEADER$
+
+The following LICENSE pertains to both PMIx and any code ported
+from Open MPI.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+- Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer listed
+  in this license in the documentation and/or other materials
+  provided with the distribution.
+
+- Neither the name of the copyright holders nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+The copyright holders provide no reassurances that the source code
+provided does not infringe any patent, copyright, or any other
+intellectual property rights of third parties.  The copyright holders
+disclaim any liability to any recipient for claims brought against
+recipient by any third party for infringement of that parties
+intellectual property rights.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 -------------------------------------------------------------------------------
  
   The following third party programs have their own third party programs. These
   additional third party program files are as follows:
-  1. Intel(R) MPI Benchmarks https://raw.githubusercontent.com/intel/mpi-benchmarks/master/license/third-party-programs.txt
-  2. Intel(R) Distribution for Python: third-party-programs-python.txt file
+  1. Intel(R) MPI Benchmarks <install_dir>/mpi/latest/benchmarks/imb/license/third-party-programs.txt
+  2. Intel(R) Distribution for Python* <install_dir>/intelpython/latest/licensing/third-party-programs.txt
   
 -------------------------------------------------------------------------------
   
-Other names and brands may be claimed as the property of others.
\ No newline at end of file
+* Other names and brands may be claimed as the property of others.
\ No newline at end of file
diff --git a/deps/ofi/bin/fi_info b/deps/ofi/bin/fi_info
index b4df1a8e6..711ae57b3 100755
Binary files a/deps/ofi/bin/fi_info and b/deps/ofi/bin/fi_info differ
diff --git a/deps/ofi/include/rdma/fabric.h b/deps/ofi/include/rdma/fabric.h
index cdfa11e8d..21bffa1d6 100644
--- a/deps/ofi/include/rdma/fabric.h
+++ b/deps/ofi/include/rdma/fabric.h
@@ -80,7 +80,7 @@ extern "C" {
 
 #define FI_MAJOR_VERSION 1
 #define FI_MINOR_VERSION 13
-#define FI_REVISION_VERSION 0
+#define FI_REVISION_VERSION 2
 
 enum {
 	FI_PATH_MAX		= 256,
diff --git a/deps/ofi/lib/libfabric.so b/deps/ofi/lib/libfabric.so
index da151da5d..cf435ab98 100755
Binary files a/deps/ofi/lib/libfabric.so and b/deps/ofi/lib/libfabric.so differ
diff --git a/deps/ofi/lib/libfabric.so.1 b/deps/ofi/lib/libfabric.so.1
index da151da5d..cf435ab98 100755
Binary files a/deps/ofi/lib/libfabric.so.1 and b/deps/ofi/lib/libfabric.so.1 differ
diff --git a/deps/ofi/lib/prov/libpsm3-fi.so b/deps/ofi/lib/prov/libpsm3-fi.so
index fab9b8d4e..875cedbc8 100755
Binary files a/deps/ofi/lib/prov/libpsm3-fi.so and b/deps/ofi/lib/prov/libpsm3-fi.so differ
diff --git a/deps/ofi/lib/prov/libpsmx2-fi.so b/deps/ofi/lib/prov/libpsmx2-fi.so
index 28235ef3a..edb03e004 100755
Binary files a/deps/ofi/lib/prov/libpsmx2-fi.so and b/deps/ofi/lib/prov/libpsmx2-fi.so differ
diff --git a/deps/ofi/lib/prov/librxm-fi.so b/deps/ofi/lib/prov/librxm-fi.so
index 99a542183..211edb301 100755
Binary files a/deps/ofi/lib/prov/librxm-fi.so and b/deps/ofi/lib/prov/librxm-fi.so differ
diff --git a/deps/ofi/lib/prov/libshm-fi.so b/deps/ofi/lib/prov/libshm-fi.so
index 73ec980df..9394b7be7 100755
Binary files a/deps/ofi/lib/prov/libshm-fi.so and b/deps/ofi/lib/prov/libshm-fi.so differ
diff --git a/deps/ofi/lib/prov/libsockets-fi.so b/deps/ofi/lib/prov/libsockets-fi.so
index 83d743b77..7145739c7 100755
Binary files a/deps/ofi/lib/prov/libsockets-fi.so and b/deps/ofi/lib/prov/libsockets-fi.so differ
diff --git a/deps/ofi/lib/prov/libtcp-fi.so b/deps/ofi/lib/prov/libtcp-fi.so
index 89b2c7a01..6861c2533 100755
Binary files a/deps/ofi/lib/prov/libtcp-fi.so and b/deps/ofi/lib/prov/libtcp-fi.so differ
diff --git a/deps/ofi/lib/prov/libverbs-1.1-fi.so b/deps/ofi/lib/prov/libverbs-1.1-fi.so
new file mode 100755
index 000000000..14f00726c
Binary files /dev/null and b/deps/ofi/lib/prov/libverbs-1.1-fi.so differ
diff --git a/deps/ofi/lib/prov/libverbs-1.12-fi.so b/deps/ofi/lib/prov/libverbs-1.12-fi.so
new file mode 100755
index 000000000..1998f3ecf
Binary files /dev/null and b/deps/ofi/lib/prov/libverbs-1.12-fi.so differ
diff --git a/deps/ofi/lib/prov/libverbs-fi.so b/deps/ofi/lib/prov/libverbs-fi.so
deleted file mode 100755
index 91c41bce2..000000000
Binary files a/deps/ofi/lib/prov/libverbs-fi.so and /dev/null differ
diff --git a/doc/rst/source/advanced-configuration/dmabuf.rst b/doc/rst/source/advanced-configuration/dmabuf.rst
index 4201d2704..694892495 100644
--- a/doc/rst/source/advanced-configuration/dmabuf.rst
+++ b/doc/rst/source/advanced-configuration/dmabuf.rst
@@ -1,12 +1,11 @@
-.. _`here`: https://github.com/ofiwg/libfabric/releases/tag/v1.13.1
+.. _`here`: https://github.com/ofiwg/libfabric/releases/tag/v1.13.2
 .. _`documentation`: https://one-api.gitlab-pages.devtools.intel.com/level_zero/core/PROG.html#affinity-mask
 
 =====================================
 Enabling OFI/verbs dmabuf support
 =====================================
 
-|product_short| provides experimental support for device memory transfers using Linux dmabuf,
-which is exposed through OFI API for verbs provider.
+|product_short| provides experimental support for data transfers between Intel GPU memory and the NIC using Linux dmabuf, which is exposed through the OFI API for the verbs provider.
 
 
 Requirements
@@ -17,12 +16,12 @@ Requirements
 - level-zero-devel package
 
 
-Limitations
-###########
+Usage
+#####
 
-- Only first tile should be used from each GPU card.
-  For example, if GPU with 2 tiles is used then set ZE_AFFINITY_MASK=0.0.
-  More information about GPU selection can be found in level-zero `documentation`_.
+|product_short|, OFI and OFI/verbs from |base_tk| support device memory transfers. Refer to `Run instructions`_ for usage.
+
+If you want to build software components from sources, refer to `Build instructions`_.
 
 
 Build instructions
@@ -33,10 +32,10 @@ OFI
 
 ::
 
-    git clone --single-branch --branch v1.13.1 https://github.com/ofiwg/libfabric.git
+    git clone --single-branch --branch v1.13.2 https://github.com/ofiwg/libfabric.git
     cd libfabric
     ./autogen.sh
-    ./configure --prefix=<ofi_install_dir> --enable-verbs=<rdma_core_install_dir> --enable-ze-dlopen=yes
+    ./configure --prefix=<ofi_install_dir> --enable-verbs=<rdma_core_install_dir> --with-ze=<level_zero_install_dir> --enable-ze-dlopen=yes
     make -j install
 
 .. note::
@@ -48,17 +47,34 @@ OFI
 
 ::
 
-    cmake -DCMAKE_INSTALL_PREFIX=<ccl_install_dir> -DLIBFABRIC_DIR=<ofi_install_dir> -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=dpcpp -DCOMPUTE_BACKEND=dpcpp_level_zero -DENABLE_OFI_HMEM=1 ..
+    cmake -DCMAKE_INSTALL_PREFIX=<ccl_install_dir> -DLIBFABRIC_DIR=<ofi_install_dir> -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=dpcpp -DCOMPUTE_BACKEND=dpcpp_level_zero -DENABLE_OFI_HMEM=1 ..
     make -j install
 
 
 Run instructions
 ################
 
-Run allreduce test with ring algorithm and SYCL USM device buffers.
+1. Set the environment.
 
-::
+   If |base_tk| is used:
+
+   ::
+
+       source <toolkit_install_dir>/setvars.sh
+
+   If software components are built from sources:
+
+   ::
+
+       source <ccl_install_dir>/env/setvars.sh
+       export LD_LIBRARY_PATH=<ofi_install_dir>/lib:${LD_LIBRARY_PATH}
+
+2. Run allreduce test with ring algorithm and SYCL USM device buffers:
+
+   ::
 
-    source <ccl_install_dir>/env/setvars.sh
-    export LD_LIBRARY_PATH=<ofi_install_path>/lib:${LD_LIBRARY_PATH}
-    CCL_ATL_TRANSPORT=ofi CCL_ATL_HMEM=1 CCL_ALLREDUCE=ring FI_PROVIDER=verbs mpiexec -n 2 <ccl_install_dir>/examples/sycl/sycl_allreduce_usm_test gpu device
+       export CCL_ATL_TRANSPORT=ofi
+       export CCL_ATL_HMEM=1
+       export CCL_ALLREDUCE=ring
+       export FI_PROVIDER=verbs
+       mpiexec -n 2 <ccl_install_dir>/examples/sycl/sycl_allreduce_usm_test gpu device
diff --git a/doc/rst/source/env-variables.rst b/doc/rst/source/env-variables.rst
index 393839863..fa48e4dde 100644
--- a/doc/rst/source/env-variables.rst
+++ b/doc/rst/source/env-variables.rst
@@ -86,14 +86,12 @@ Available algorithms for each collective operation (``<algo_name>``):
      - Based on ``MPI_Iallreduce``
    * - ``rabenseifner``
      - Rabenseifner’s algorithm
-   * - ``starlike``
+   * - ``nreduce``
      - May be beneficial for imbalanced workloads
    * - ``ring`` 
      - reduce_scatter + allgather ring.
        Use ``CCL_RS_CHUNK_COUNT`` and ``CCL_RS_MIN_CHUNK_SIZE``
        to control pipelining on reduce_scatter phase.
-   * - ``ring_rma``
-     - reduce_scatter+allgather ring using RMA communications
    * - ``double_tree``
      - Double-tree algorithm
    * - ``recursive_doubling``
@@ -713,3 +711,32 @@ CCL_MNIC_COUNT
 
 Set this environment variable to specify the maximum number of NICs to be selected.
 The actual number of NICs selected may be smaller due to limitations on transport level or system configuration.
+
+
+CCL_SYCL_OUTPUT_EVENT
+#####################
+**Syntax**
+
+::
+
+  CCL_SYCL_OUTPUT_EVENT=<value>
+
+**Arguments**
+
+.. list-table::
+   :widths: 25 50
+   :header-rows: 1
+   :align: left
+
+   * - <value>
+     - Description
+   * - ``1``
+     - Enable support for SYCL output event.
+   * - ``0``
+     - Disable support for SYCL output event (**default**).
+
+**Description**
+
+Set this environment variable to control support for the SYCL output event.
+Once the support is enabled, you can retrieve the SYCL output event from a oneCCL event using the ``get_native()`` method.
+The oneCCL event must be associated with a oneCCL communication operation.
diff --git a/doc/rst/source/introduction/installation.rst b/doc/rst/source/introduction/installation.rst
index a3d905dd7..1731f8b3f 100644
--- a/doc/rst/source/introduction/installation.rst
+++ b/doc/rst/source/introduction/installation.rst
@@ -78,7 +78,7 @@ You can customize CLI-based installation (for example, specify directory, compil
 
   ::
 
-     cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=dpcpp -DCOMPUTE_BACKEND=dpcpp_level_zero
+     cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=dpcpp -DCOMPUTE_BACKEND=dpcpp_level_zero
 
 * To specify the **build type**, modify the ``cmake`` command:
 
@@ -104,11 +104,11 @@ There are two ways to set up the environment:
 
     .. prompt:: bash
 
-        source <ccl_install_dir>/setvars.sh
+        source <ccl_install_dir>/env/setvars.sh
 
 
 - Using |product_short| from |base_tk| installed into ``<toolkit_install_dir>`` (``/opt/intel/inteloneapi`` by default):
 
     .. prompt:: bash
 
-        source <toolkit_install_dir>/setvars.sh
\ No newline at end of file
+        source <toolkit_install_dir>/setvars.sh
diff --git a/doc/rst/source/introduction/sample.rst b/doc/rst/source/introduction/sample.rst
index ed2c4eb0b..d97c10053 100644
--- a/doc/rst/source/introduction/sample.rst
+++ b/doc/rst/source/introduction/sample.rst
@@ -24,11 +24,11 @@ Build details
 
 #. :ref:`Set up <prerequisites>` the library environment.
 
-#. Use ``clang++`` compiler to build the sample:
+#. Use the ``dpcpp`` compiler to build the sample:
 
    ::
 
-      clang++ -I${CCL_ROOT}/include -L${CCL_ROOT}/lib/ -lsycl -lccl -o sample sample.cpp
+      dpcpp -I${CCL_ROOT}/examples/include -I${CCL_ROOT}/include/ -L${CCL_ROOT}/lib/ -lccl -lmpi -o sample sample.cpp
 
 
 Run the sample
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index a5186efaf..4e4a99104 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -30,7 +30,6 @@ endif()
 
 if (DEFINED ENV{I_MPI_ROOT})
     set(I_MPI_ROOT "$ENV{I_MPI_ROOT}")
-    set(CMAKE_INSTALL_RPATH "${I_MPI_ROOT}/lib/release_mt/")
 endif()
 
 message(STATUS "CCL_ROOT: ${CCL_ROOT}")
@@ -52,25 +51,28 @@ if (${CMAKE_VERSION} VERSION_LESS 3.1)
 endif()
 
 #common release/debug compilation settings
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_COMPILER_FLAGS} -Wall -Werror -D_GNU_SOURCE -fvisibility=internal")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_COMPILER_FLAGS} -Wall -Wextra -Wno-unused-parameter -Werror -D_GNU_SOURCE -fvisibility=internal")
 set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} ${C_COMPILER_FLAGS} -O0 -g -DENABLE_DEBUG")
 set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} ${C_COMPILER_FLAGS} -O3")
 set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} ${C_COMPILER_FLAGS} -O2 -g")
 set(CMAKE_C_STANDARD 99)
 set(CMAKE_C_STANDARD_REQUIRED ON)
 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_COMPILER_FLAGS} -Wall -Werror -D_GNU_SOURCE -fvisibility=internal")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_COMPILER_FLAGS} -Wall -Wextra -Wno-unused-parameter -Werror -D_GNU_SOURCE -fvisibility=internal")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${CXX_COMPILER_FLAGS} -O0 -g -DENABLE_DEBUG")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${CXX_COMPILER_FLAGS} -O3")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} ${CXX_COMPILER_FLAGS} -O2 -g")
 set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
-if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" OR(${CMAKE_CXX_COMPILER_ID} STREQUAL "IntelLLVM"))
+if ("${COMPUTE_BACKEND}" STREQUAL "dpcpp_level_zero")
     set(CMAKE_CLANG_FLAGS "-fsycl")
     set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -lsycl")
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_CLANG_FLAGS}")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CLANG_FLAGS}")
+endif()
+
+if ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang" AND "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM"))
     # Use c++17 to be aligned with the compiler
     set(CMAKE_CXX_STANDARD 17)
 endif()
@@ -98,7 +100,7 @@ endif()
 include_directories(include)
 
 add_subdirectory(cpu)
-if (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_CXX_COMPILER_ID} STREQUAL "IntelLLVM")
+if ("${COMPUTE_BACKEND}" STREQUAL "dpcpp_level_zero")
     add_subdirectory(sycl)
 endif()
 add_subdirectory(common)
diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt
index 2a6c4199d..4b5738642 100644
--- a/examples/benchmark/CMakeLists.txt
+++ b/examples/benchmark/CMakeLists.txt
@@ -42,7 +42,7 @@ foreach(src ${sources})
     target_link_libraries(${executable} PUBLIC rt)
     target_link_libraries(${executable} PUBLIC m)
     target_link_libraries(${executable} PUBLIC dl)
-    target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release_mt/)
+    target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release/)
     target_link_libraries(${executable} PUBLIC mpi)
     install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/benchmark OPTIONAL)
 endforeach()
diff --git a/examples/benchmark/include/benchmark.hpp b/examples/benchmark/include/benchmark.hpp
index aafa9684a..5b98c7dae 100644
--- a/examples/benchmark/include/benchmark.hpp
+++ b/examples/benchmark/include/benchmark.hpp
@@ -274,12 +274,7 @@ int set_datatypes(std::string option_value,
                   std::list<std::string>& datatypes) {
     datatypes.clear();
     if (option_value == "all") {
-        if (is_check_values_enabled(check_values)) {
-            datatypes = tokenize<std::string>(ALL_DTYPES_LIST_WITH_CHECK, ',');
-        }
-        else {
-            datatypes = tokenize<std::string>(ALL_DTYPES_LIST, ',');
-        }
+        datatypes = tokenize<std::string>(ALL_DTYPES_LIST, ',');
     }
     else {
         datatypes = tokenize<std::string>(option_value, ',');
@@ -288,19 +283,12 @@ int set_datatypes(std::string option_value,
         std::set<std::string> supported_option_values;
 
         for (auto p : dtype_names) {
-            if ((p.first == ccl::datatype::float16 || p.first == ccl::datatype::bfloat16) &&
-                is_check_values_enabled(check_values))
-                continue;
             supported_option_values.insert(p.second);
         }
 
         for (auto dt : datatypes) {
             if (check_supported_options(option_name, dt, supported_option_values)) {
-                if ((dt == dtype_names[ccl::datatype::float16] ||
-                     dt == dtype_names[ccl::datatype::bfloat16]) &&
-                    is_check_values_enabled(check_values)) {
-                    PRINT("WARN: correctness checking is not implemented for '%s'", dt.c_str());
-                }
+                return -1;
             }
         }
     }
@@ -835,7 +823,7 @@ void print_user_options(const user_options_t& options, const ccl::communicator&
 #endif
 
     PRINT_BY_ROOT(comm,
-                  "options:"
+                  "\noptions:"
                   "\n  processes:      %d"
                   "\n  backend:        %s"
                   "\n  loop:           %s"
diff --git a/examples/benchmark/include/coll.hpp b/examples/benchmark/include/coll.hpp
index 9a8c5d4c8..053b31721 100644
--- a/examples/benchmark/include/coll.hpp
+++ b/examples/benchmark/include/coll.hpp
@@ -26,6 +26,9 @@ using sycl_buffer_t = cl::sycl::buffer<Dtype, 1>;
 
 #define COLL_ROOT (0)
 
+#define BF16_COEF 0.00001
+#define FP16_COEF 0.0001
+
 struct base_coll;
 
 using coll_list_t = std::vector<std::shared_ptr<base_coll>>;
@@ -97,6 +100,21 @@ typedef struct bench_init_attr {
 #endif
 } bench_init_attr;
 
+template <class OutDtype, class InDtype = OutDtype> // converts value to OutDtype
+inline OutDtype get_val(InDtype value) {
+    return value; // generic case: relies on the implicit InDtype -> OutDtype conversion
+}
+
+template <>
+inline ccl::bfloat16 get_val<ccl::bfloat16, float>(float value) {
+    return fp32_to_bf16(BF16_COEF * value); // value is scaled by BF16_COEF before conversion
+}
+
+template <>
+inline ccl::float16 get_val<ccl::float16, float>(float value) {
+    return fp32_to_fp16(FP16_COEF * value); // value is scaled by FP16_COEF before conversion
+}
+
 /* base polymorph collective wrapper class */
 struct base_coll {
     base_coll(bench_init_attr init_attr) : init_attr(init_attr) {
@@ -116,6 +134,30 @@ struct base_coll {
         return nullptr;
     };
 
+    // Returns elem_count copies of fill_value encoded for T; for bfloat16/float16 the
+    // stored value is get_data() of fill_value scaled by BF16_COEF/FP16_COEF.
+#ifdef CCL_ENABLE_SYCL
+    template <class T, class vector_t = aligned_vector<T>>
+#else // CCL_ENABLE_SYCL
+    template <class T, class vector_t = std::vector<T>>
+#endif // CCL_ENABLE_SYCL
+    vector_t get_initial_values(size_t elem_count, int fill_value) {
+        vector_t res(elem_count);
+        ccl::datatype dt = ccl::native_type_info<typename std::remove_pointer<T>::type>::dtype;
+        // The encoded value is the same for every element, so convert it once per call
+        // and broadcast it with std::fill instead of re-converting inside a loop.
+        if (dt == ccl::datatype::bfloat16) {
+            std::fill(res.begin(), res.end(), fp32_to_bf16(BF16_COEF * fill_value).get_data());
+        }
+        else if (dt == ccl::datatype::float16) {
+            std::fill(res.begin(), res.end(), fp32_to_fp16(FP16_COEF * fill_value).get_data());
+        }
+        else {
+            std::fill(res.begin(), res.end(), fill_value);
+        }
+        return res;
+    }
+
     virtual void prepare(size_t elem_count) {
         auto& transport = transport_data::instance();
         auto& comms = transport.get_comms();
@@ -128,10 +170,6 @@ struct base_coll {
     }
 
     virtual void finalize(size_t elem_count) {
-        auto dtype = get_dtype();
-        if (dtype == ccl::datatype::float16 || dtype == ccl::datatype::bfloat16)
-            return;
-
         auto& transport = transport_data::instance();
         auto& comms = transport.get_comms();
         auto streams = transport.get_bench_streams();
diff --git a/examples/benchmark/include/config.hpp b/examples/benchmark/include/config.hpp
index fbd981fa7..e2db1e518 100644
--- a/examples/benchmark/include/config.hpp
+++ b/examples/benchmark/include/config.hpp
@@ -21,8 +21,7 @@
 
 #define ALL_COLLS_LIST "allgatherv,allreduce,alltoall,alltoallv,bcast,reduce,reduce_scatter"
 
-#define ALL_DTYPES_LIST            "int8,int32,int64,uint64,float16,float32,float64,bfloat16"
-#define ALL_DTYPES_LIST_WITH_CHECK "int8,int32,int64,uint64,float32,float64"
+#define ALL_DTYPES_LIST "int8,int32,int64,uint64,float16,float32,float64,bfloat16"
 
 #define ALL_REDUCTIONS_LIST            "sum,prod,min,max"
 #define ALL_REDUCTIONS_LIST_WITH_CHECK "sum"
diff --git a/examples/benchmark/include/cpu_coll.hpp b/examples/benchmark/include/cpu_coll.hpp
index 4287bab01..38f873ee7 100644
--- a/examples/benchmark/include/cpu_coll.hpp
+++ b/examples/benchmark/include/cpu_coll.hpp
@@ -96,7 +96,7 @@ struct cpu_base_coll : base_coll, protected strategy {
                                   ccl::communicator& comm,
                                   ccl::stream& stream,
                                   size_t rank_idx) override {
-        int local_rank = comm.rank();
+        int comm_rank = comm.rank();
 
         size_t send_count = coll_strategy::get_send_multiplier() * elem_count;
         size_t recv_count = coll_strategy::get_recv_multiplier() * elem_count;
@@ -104,13 +104,12 @@ struct cpu_base_coll : base_coll, protected strategy {
         size_t send_bytes = send_count * base_coll::get_dtype_size();
         size_t recv_bytes = recv_count * base_coll::get_dtype_size();
 
-        std::vector<Dtype> fill_vector(send_count);
-        std::fill(fill_vector.begin(), fill_vector.end(), local_rank);
+        auto fill_vector = get_initial_values<Dtype>(send_count, comm_rank);
 
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
             memcpy(send_bufs[b_idx][rank_idx], fill_vector.data(), send_bytes);
             if (!base_coll::get_inplace()) {
-                memset(recv_bufs[b_idx][rank_idx], 0, recv_bytes);
+                memset(recv_bufs[b_idx][rank_idx], -1, recv_bytes);
             }
         }
     }
diff --git a/examples/benchmark/include/lp.hpp b/examples/benchmark/include/lp.hpp
new file mode 100644
index 000000000..5d278e521
--- /dev/null
+++ b/examples/benchmark/include/lp.hpp
@@ -0,0 +1,96 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "oneapi/ccl/lp_types.hpp"
+
+// Converts an IEEE-754 binary32 value to ccl::float16, truncating the mantissa.
+// Magnitudes below the smallest normal half become signed zero and magnitudes
+// above the half range become signed infinity, so the input sign is always kept.
+inline ccl::float16 fp32_to_fp16(float val) {
+    uint32_t bits = 0;
+    std::memcpy(&bits, &val, sizeof(bits)); // bit copy instead of an aliasing pointer cast
+    const uint32_t sign_bit = (bits & 0x80000000) >> 16;
+    const uint32_t exp_bits = bits & 0x7F800000;
+    const uint32_t significand_bits = bits & 0x007FFFFF;
+    uint32_t ans;
+    if (exp_bits == 0x7F800000) {
+        // Inf has an empty mantissa; NaN gets a non-zero one so it stays NaN.
+        ans = sign_bit | (significand_bits != 0 ? 0x00007C01 : 0x00007C00);
+    }
+    else if (exp_bits < 0x38800000) {
+        // fp32 zero/subnormal, or exponent below -14: flush to signed zero.
+        ans = sign_bit;
+    }
+    else if (exp_bits > 0x47000000) {
+        // Exponent above 15 does not fit in a half: saturate to signed infinity.
+        ans = sign_bit | 0x7C00;
+    }
+    else {
+        // Re-bias the exponent (127 -> 15) and keep the top 10 mantissa bits.
+        ans = sign_bit | (((exp_bits >> 23) - 112) << 10) | (significand_bits >> 13);
+    }
+    return ccl::float16(ans);
+}
+
+// Widens a ccl::float16 to binary32. Zero and subnormal halves both map to a
+// signed zero (subnormals are flushed); Inf/NaN and normal values are widened.
+inline float fp16_to_fp32(ccl::float16 val) {
+    const uint16_t val_data = val.get_data();
+    // Widen before shifting: 0x8000 << 16 overflows a signed int (undefined in C++11).
+    const uint32_t sign_bits = static_cast<uint32_t>(val_data & 0x8000) << 16;
+    const uint32_t exp_bits = val_data & 0x7C00;
+    const uint32_t significand_bits = val_data & 0x03FF;
+    uint32_t ans_bits = 0;
+    if (exp_bits == 0x7C00) {
+        // Inf or NaN: all-ones exponent, mantissa moved to the top of the field.
+        ans_bits = sign_bits | 0x7F800000 | (significand_bits << 13);
+    }
+    else if (exp_bits == 0x0000) {
+        ans_bits = sign_bits;
+    }
+    else {
+        // Normal: re-bias the exponent by 112 (127 - 15) and widen the mantissa.
+        ans_bits = sign_bits | ((exp_bits + 0x1C000) << 13) | (significand_bits << 13);
+    }
+    float ans = 0.0f;
+    std::memcpy(&ans, &ans_bits, sizeof(ans));
+    return ans;
+}
+
+// Truncates a binary32 to bfloat16 (its upper 16 bits); shifting is endian-neutral.
+inline ccl::bfloat16 fp32_to_bf16(float val) {
+    uint32_t bits = 0;
+    memcpy(&bits, &val, sizeof(bits));
+    return ccl::bfloat16(static_cast<uint16_t>(bits >> 16));
+}
+
+inline float bf16_to_fp32(ccl::bfloat16 val) { // stored bits become the float's upper half
+    float ret = 0;
+    const uint32_t temp = static_cast<uint32_t>(val.get_data()) << 16;
+    memcpy(&ret, &temp, sizeof(ret)); // size follows the destination object
+    return ret;
+}
+
+inline std::ostream& operator<<(std::ostream& out, const ccl::float16& v) {
+    // Prints the value widened by fp16_to_fp32, then "|", then the get_data() value.
+    return out << fp16_to_fp32(v) << "|" << v.get_data();
+}
+
+inline std::ostream& operator<<(std::ostream& out, const ccl::bfloat16& v) {
+    // Prints the value widened by bf16_to_fp32, then "|", then the get_data() value.
+    return out << bf16_to_fp32(v) << "|" << v.get_data();
+}
diff --git a/examples/benchmark/include/sycl_coll.hpp b/examples/benchmark/include/sycl_coll.hpp
index a605af700..26cec4117 100644
--- a/examples/benchmark/include/sycl_coll.hpp
+++ b/examples/benchmark/include/sycl_coll.hpp
@@ -24,7 +24,6 @@
 #include "sycl_base.hpp" /* from examples/include */
 
 #ifdef CCL_ENABLE_SYCL
-
 #include <CL/sycl.hpp>
 
 using namespace sycl;
@@ -159,7 +158,7 @@ struct sycl_base_coll : base_coll, private strategy {
         size_t send_bytes = send_count * base_coll::get_dtype_size();
         size_t recv_bytes = recv_count * base_coll::get_dtype_size();
 
-        std::fill(host_send_buf.begin(), host_send_buf.end(), comm_rank);
+        host_send_buf = get_initial_values<Dtype>(send_count, comm_rank);
 
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
             if (base_coll::get_sycl_mem_type() == SYCL_MEM_USM) {
@@ -168,7 +167,7 @@ struct sycl_base_coll : base_coll, private strategy {
                     .wait();
 
                 if (!base_coll::get_inplace()) {
-                    stream.get_native().memset(recv_bufs[b_idx][rank_idx], 0, recv_bytes).wait();
+                    stream.get_native().memset(recv_bufs[b_idx][rank_idx], -1, recv_bytes).wait();
                 }
             }
             else {
@@ -188,7 +187,7 @@ struct sycl_base_coll : base_coll, private strategy {
                             (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx][rank_idx]));
                         auto recv_buf_acc =
                             recv_buf->template get_access<mode::write>(h, recv_count);
-                        h.fill(recv_buf_acc, static_cast<Dtype>(0));
+                        h.fill(recv_buf_acc, static_cast<Dtype>(-1));
                     })
                     .wait();
             }
@@ -200,8 +199,8 @@ struct sycl_base_coll : base_coll, private strategy {
     }
 
     /* used on fill/check phases */
-    std::vector<Dtype> host_send_buf;
-    std::vector<Dtype> host_recv_buf;
+    aligned_vector<Dtype> host_send_buf;
+    aligned_vector<Dtype> host_recv_buf;
 
 private:
     std::vector<buf_allocator<Dtype>> allocators;
diff --git a/examples/benchmark/include/types.hpp b/examples/benchmark/include/types.hpp
index 2c12cdc67..6a90d037c 100644
--- a/examples/benchmark/include/types.hpp
+++ b/examples/benchmark/include/types.hpp
@@ -16,6 +16,7 @@
 #pragma once
 
 #include "oneapi/ccl.hpp"
+#include "lp.hpp"
 
 #define PRINT(fmt, ...) printf(fmt "\n", ##__VA_ARGS__);
 
@@ -174,16 +175,6 @@ typedef struct user_options_t {
     }
 } user_options_t;
 
-std::ostream& operator<<(std::ostream& out, const ccl::bfloat16& v) {
-    out << v.get_data();
-    return out;
-}
-
-std::ostream& operator<<(std::ostream& out, const ccl::float16& v) {
-    out << v.get_data();
-    return out;
-}
-
 template <class Dtype>
 ccl::datatype get_ccl_dtype() {
     return ccl::native_type_info<typename std::remove_pointer<Dtype>::type>::dtype;
diff --git a/examples/benchmark/src/allgatherv/cpu_allgatherv_coll.hpp b/examples/benchmark/src/allgatherv/cpu_allgatherv_coll.hpp
index 795e3259a..cef8d7288 100644
--- a/examples/benchmark/src/allgatherv/cpu_allgatherv_coll.hpp
+++ b/examples/benchmark/src/allgatherv/cpu_allgatherv_coll.hpp
@@ -30,7 +30,7 @@ struct cpu_allgatherv_coll : cpu_base_coll<Dtype, allgatherv_strategy_impl> {
                                    ccl::communicator& comm,
                                    ccl::stream& stream,
                                    size_t rank_idx) override {
-        Dtype sbuf_expected = comm.rank();
+        Dtype sbuf_expected = get_val<Dtype>(static_cast<float>(comm.rank()));
         Dtype value;
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
@@ -44,7 +44,7 @@ struct cpu_allgatherv_coll : cpu_base_coll<Dtype, allgatherv_strategy_impl> {
             }
 
             for (int idx = 0; idx < comm.size(); idx++) {
-                Dtype rbuf_expected = idx;
+                Dtype rbuf_expected = get_val<Dtype>(static_cast<float>(idx));
                 for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
                     value = ((Dtype*)recv_bufs[b_idx][rank_idx])[idx * elem_count + e_idx];
                     if (value != rbuf_expected) {
diff --git a/examples/benchmark/src/allgatherv/sycl_allgatherv_coll.hpp b/examples/benchmark/src/allgatherv/sycl_allgatherv_coll.hpp
index 1d99ac63e..ddf40bd42 100644
--- a/examples/benchmark/src/allgatherv/sycl_allgatherv_coll.hpp
+++ b/examples/benchmark/src/allgatherv/sycl_allgatherv_coll.hpp
@@ -35,7 +35,7 @@ struct sycl_allgatherv_coll : sycl_base_coll<Dtype, allgatherv_strategy_impl> {
                                    ccl::stream& stream,
                                    size_t rank_idx) override {
         int comm_size = comm.size();
-        Dtype sbuf_expected = comm.rank();
+        Dtype sbuf_expected = get_val<Dtype>(static_cast<float>(comm.rank()));
 
         size_t send_bytes = elem_count * base_coll::get_dtype_size();
         size_t recv_bytes = comm_size * elem_count * base_coll::get_dtype_size();
@@ -78,7 +78,7 @@ struct sycl_allgatherv_coll : sycl_base_coll<Dtype, allgatherv_strategy_impl> {
             }
 
             for (int idx = 0; idx < comm.size(); idx++) {
-                Dtype rbuf_expected = idx;
+                Dtype rbuf_expected = get_val<Dtype>(static_cast<float>(idx));
                 for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
                     value = host_recv_buf[idx * elem_count + e_idx];
                     if (value != rbuf_expected) {
diff --git a/examples/benchmark/src/allreduce/cpu_allreduce_coll.hpp b/examples/benchmark/src/allreduce/cpu_allreduce_coll.hpp
index a0d289aef..536ca34c5 100644
--- a/examples/benchmark/src/allreduce/cpu_allreduce_coll.hpp
+++ b/examples/benchmark/src/allreduce/cpu_allreduce_coll.hpp
@@ -30,9 +30,10 @@ struct cpu_allreduce_coll : cpu_base_coll<Dtype, allreduce_strategy_impl> {
                                    ccl::communicator& comm,
                                    ccl::stream& stream,
                                    size_t rank_idx) override {
-        Dtype sbuf_expected = comm.rank();
         /* TODO: handle PROD, MIN, MAX */
-        Dtype rbuf_expected = (comm.size() - 1) * ((float)comm.size() / 2);
+        Dtype sbuf_expected = get_val<Dtype>(static_cast<float>(comm.rank()));
+        Dtype rbuf_expected = get_val<Dtype>((comm.size() - 1) * ((float)comm.size() / 2));
+
         Dtype value;
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
diff --git a/examples/benchmark/src/allreduce/sycl_allreduce_coll.hpp b/examples/benchmark/src/allreduce/sycl_allreduce_coll.hpp
index cd79face6..e0f0fa5d8 100644
--- a/examples/benchmark/src/allreduce/sycl_allreduce_coll.hpp
+++ b/examples/benchmark/src/allreduce/sycl_allreduce_coll.hpp
@@ -34,8 +34,8 @@ struct sycl_allreduce_coll : sycl_base_coll<Dtype, allreduce_strategy_impl> {
                                    ccl::communicator& comm,
                                    ccl::stream& stream,
                                    size_t rank_idx) override {
-        Dtype sbuf_expected = comm.rank();
-        Dtype rbuf_expected = (comm.size() - 1) * ((float)comm.size() / 2);
+        Dtype sbuf_expected = get_val<Dtype>(static_cast<float>(comm.rank()));
+        Dtype rbuf_expected = get_val<Dtype>((comm.size() - 1) * ((float)comm.size() / 2));
 
         size_t send_bytes = elem_count * base_coll::get_dtype_size();
         size_t recv_bytes = elem_count * base_coll::get_dtype_size();
@@ -66,7 +66,6 @@ struct sycl_allreduce_coll : sycl_base_coll<Dtype, allreduce_strategy_impl> {
             }
 
             Dtype value;
-
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
                 value = host_send_buf[e_idx];
                 if (!base_coll::get_inplace() && (value != sbuf_expected)) {
diff --git a/examples/benchmark/src/alltoall/cpu_alltoall_coll.hpp b/examples/benchmark/src/alltoall/cpu_alltoall_coll.hpp
index 6e4458ca2..836ef7893 100644
--- a/examples/benchmark/src/alltoall/cpu_alltoall_coll.hpp
+++ b/examples/benchmark/src/alltoall/cpu_alltoall_coll.hpp
@@ -30,15 +30,14 @@ struct cpu_alltoall_coll : cpu_base_coll<Dtype, alltoall_strategy_impl> {
                                    ccl::communicator& comm,
                                    ccl::stream& stream,
                                    size_t rank_idx) override {
-        Dtype sbuf_expected = comm.rank();
-        Dtype rbuf_expected;
+        Dtype sbuf_expected = get_val<Dtype>(static_cast<float>(comm.rank()));
         Dtype value;
         int comm_size = comm.size();
 
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
             for (size_t e_idx = 0; e_idx < elem_count * comm_size; e_idx++) {
                 value = ((Dtype*)send_bufs[b_idx][rank_idx])[e_idx];
-                rbuf_expected = e_idx / elem_count;
+                Dtype rbuf_expected = get_val<Dtype>(static_cast<float>(e_idx / elem_count));
                 if (value != sbuf_expected) {
                     std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx "
                               << rank_idx << ", elem_idx " << e_idx << ", expected "
diff --git a/examples/benchmark/src/alltoall/sycl_alltoall_coll.hpp b/examples/benchmark/src/alltoall/sycl_alltoall_coll.hpp
index 5d51be30e..96df23d27 100644
--- a/examples/benchmark/src/alltoall/sycl_alltoall_coll.hpp
+++ b/examples/benchmark/src/alltoall/sycl_alltoall_coll.hpp
@@ -34,7 +34,7 @@ struct sycl_alltoall_coll : sycl_base_coll<Dtype, alltoall_strategy_impl> {
                                    ccl::communicator& comm,
                                    ccl::stream& stream,
                                    size_t rank_idx) override {
-        Dtype sbuf_expected = comm.rank();
+        Dtype sbuf_expected = get_val<Dtype>(static_cast<float>(comm.rank()));
         int comm_size = comm.size();
 
         size_t send_bytes = comm_size * elem_count * base_coll::get_dtype_size();
@@ -69,7 +69,7 @@ struct sycl_alltoall_coll : sycl_base_coll<Dtype, alltoall_strategy_impl> {
 
             for (size_t e_idx = 0; e_idx < elem_count * comm_size; e_idx++) {
                 value = host_send_buf[e_idx];
-                Dtype rbuf_expected = e_idx / elem_count;
+                Dtype rbuf_expected = get_val<Dtype>(static_cast<float>(e_idx / elem_count));
                 if (value != sbuf_expected) {
                     std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx "
                               << rank_idx << ", elem_idx " << e_idx << ", expected "
diff --git a/examples/benchmark/src/alltoallv/cpu_alltoallv_coll.hpp b/examples/benchmark/src/alltoallv/cpu_alltoallv_coll.hpp
index 58eea5922..87f21b56a 100644
--- a/examples/benchmark/src/alltoallv/cpu_alltoallv_coll.hpp
+++ b/examples/benchmark/src/alltoallv/cpu_alltoallv_coll.hpp
@@ -30,14 +30,13 @@ struct cpu_alltoallv_coll : cpu_base_coll<Dtype, alltoallv_strategy_impl> {
                                    ccl::communicator& comm,
                                    ccl::stream& stream,
                                    size_t rank_idx) override {
-        Dtype sbuf_expected = comm.rank();
-        Dtype rbuf_expected;
+        Dtype sbuf_expected = get_val<Dtype>(static_cast<float>(comm.rank()));
         Dtype value;
         int comm_size = comm.size();
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
             for (size_t e_idx = 0; e_idx < elem_count * comm_size; e_idx++) {
                 value = ((Dtype*)send_bufs[b_idx][rank_idx])[e_idx];
-                rbuf_expected = e_idx / elem_count;
+                Dtype rbuf_expected = get_val<Dtype>(static_cast<float>(e_idx / elem_count));
                 if (value != sbuf_expected) {
                     std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx "
                               << rank_idx << ", elem_idx " << e_idx << ", expected "
diff --git a/examples/benchmark/src/alltoallv/sycl_alltoallv_coll.hpp b/examples/benchmark/src/alltoallv/sycl_alltoallv_coll.hpp
index 4e1a31af2..6db2d0160 100644
--- a/examples/benchmark/src/alltoallv/sycl_alltoallv_coll.hpp
+++ b/examples/benchmark/src/alltoallv/sycl_alltoallv_coll.hpp
@@ -34,7 +34,7 @@ struct sycl_alltoallv_coll : sycl_base_coll<Dtype, alltoallv_strategy_impl> {
                                    ccl::communicator& comm,
                                    ccl::stream& stream,
                                    size_t rank_idx) override {
-        Dtype sbuf_expected = comm.rank();
+        Dtype sbuf_expected = get_val<Dtype>(static_cast<float>(comm.rank()));
         int comm_size = comm.size();
 
         size_t send_bytes = comm_size * elem_count * base_coll::get_dtype_size();
@@ -69,7 +69,7 @@ struct sycl_alltoallv_coll : sycl_base_coll<Dtype, alltoallv_strategy_impl> {
 
             for (size_t e_idx = 0; e_idx < elem_count * comm_size; e_idx++) {
                 value = host_send_buf[e_idx];
-                Dtype rbuf_expected = e_idx / elem_count;
+                Dtype rbuf_expected = get_val<Dtype>(static_cast<float>(e_idx / elem_count));
                 if (value != sbuf_expected) {
                     std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx "
                               << rank_idx << ", elem_idx " << e_idx << ", expected "
diff --git a/examples/benchmark/src/bcast/cpu_bcast_coll.hpp b/examples/benchmark/src/bcast/cpu_bcast_coll.hpp
index dfd23566e..0d64d0cbb 100644
--- a/examples/benchmark/src/bcast/cpu_bcast_coll.hpp
+++ b/examples/benchmark/src/bcast/cpu_bcast_coll.hpp
@@ -31,10 +31,13 @@ struct cpu_bcast_coll : cpu_base_coll<Dtype, bcast_strategy_impl> {
                                   size_t rank_idx) override {
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-                if (comm.rank() == COLL_ROOT)
-                    ((Dtype*)recv_bufs[b_idx][rank_idx])[e_idx] = b_idx;
-                else
+                if (comm.rank() == COLL_ROOT) {
+                    ((Dtype*)recv_bufs[b_idx][rank_idx])[e_idx] =
+                        get_val<Dtype>(static_cast<float>(b_idx));
+                }
+                else {
                     ((Dtype*)recv_bufs[b_idx][rank_idx])[e_idx] = 0;
+                }
             }
         }
     }
@@ -47,9 +50,10 @@ struct cpu_bcast_coll : cpu_base_coll<Dtype, bcast_strategy_impl> {
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
                 value = ((Dtype*)recv_bufs[b_idx][rank_idx])[e_idx];
-                if (cast_to_size_t(value) != b_idx) {
+                Dtype expected = get_val<Dtype>(static_cast<float>(b_idx));
+                if (value != expected) {
                     std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", rank_idx "
-                              << rank_idx << ", elem_idx " << e_idx << ", expected " << b_idx
+                              << rank_idx << ", elem_idx " << e_idx << ", expected " << expected
                               << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
diff --git a/examples/benchmark/src/bcast/sycl_bcast_coll.hpp b/examples/benchmark/src/bcast/sycl_bcast_coll.hpp
index f0a06af50..cd9742ab9 100644
--- a/examples/benchmark/src/bcast/sycl_bcast_coll.hpp
+++ b/examples/benchmark/src/bcast/sycl_bcast_coll.hpp
@@ -38,7 +38,7 @@ struct sycl_bcast_coll : sycl_base_coll<Dtype, bcast_strategy_impl> {
         size_t bytes = count * base_coll::get_dtype_size();
 
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            std::fill(host_recv_buf.begin(), host_recv_buf.end(), b_idx);
+            host_recv_buf = base_coll::get_initial_values<Dtype>(count, static_cast<int>(b_idx));
 
             if (base_coll::get_sycl_mem_type() == SYCL_MEM_USM) {
                 if (comm_rank == COLL_ROOT)
@@ -88,12 +88,12 @@ struct sycl_bcast_coll : sycl_base_coll<Dtype, bcast_strategy_impl> {
             }
 
             Dtype value;
-
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
                 value = host_recv_buf[e_idx];
-                if (value != static_cast<Dtype>(b_idx)) { // comparison float16 with size_t ??
+                Dtype expected = get_val<Dtype>(static_cast<float>(b_idx));
+                if (value != expected) {
                     std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", rank_idx "
-                              << rank_idx << ", elem_idx " << e_idx << ", expected " << (Dtype)b_idx
+                              << rank_idx << ", elem_idx " << e_idx << ", expected " << expected
                               << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
diff --git a/examples/benchmark/src/reduce/cpu_reduce_coll.hpp b/examples/benchmark/src/reduce/cpu_reduce_coll.hpp
index 0a0c9d445..197dc2b94 100644
--- a/examples/benchmark/src/reduce/cpu_reduce_coll.hpp
+++ b/examples/benchmark/src/reduce/cpu_reduce_coll.hpp
@@ -30,8 +30,8 @@ struct cpu_reduce_coll : cpu_base_coll<Dtype, reduce_strategy_impl> {
                                    ccl::communicator& comm,
                                    ccl::stream& stream,
                                    size_t rank_idx) override {
-        Dtype sbuf_expected = comm.rank();
-        Dtype rbuf_expected = (comm.size() - 1) * ((float)comm.size() / 2);
+        Dtype sbuf_expected = get_val<Dtype>(static_cast<float>(comm.rank()));
+        Dtype rbuf_expected = get_val<Dtype>((comm.size() - 1) * ((float)comm.size() / 2));
         Dtype value;
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
diff --git a/examples/benchmark/src/reduce/sycl_reduce_coll.hpp b/examples/benchmark/src/reduce/sycl_reduce_coll.hpp
index b9ac0ce95..47059af9e 100644
--- a/examples/benchmark/src/reduce/sycl_reduce_coll.hpp
+++ b/examples/benchmark/src/reduce/sycl_reduce_coll.hpp
@@ -34,8 +34,8 @@ struct sycl_reduce_coll : sycl_base_coll<Dtype, reduce_strategy_impl> {
                                    ccl::communicator& comm,
                                    ccl::stream& stream,
                                    size_t rank_idx) override {
-        Dtype sbuf_expected = comm.rank();
-        Dtype rbuf_expected = (comm.size() - 1) * ((float)comm.size() / 2);
+        Dtype sbuf_expected = get_val<Dtype>(static_cast<float>(comm.rank()));
+        Dtype rbuf_expected = get_val<Dtype>((comm.size() - 1) * ((float)comm.size() / 2));
 
         int comm_rank = comm.rank();
 
diff --git a/examples/benchmark/src/reduce_scatter/cpu_reduce_scatter_coll.hpp b/examples/benchmark/src/reduce_scatter/cpu_reduce_scatter_coll.hpp
index f9bf0107a..ce1121c6b 100644
--- a/examples/benchmark/src/reduce_scatter/cpu_reduce_scatter_coll.hpp
+++ b/examples/benchmark/src/reduce_scatter/cpu_reduce_scatter_coll.hpp
@@ -30,8 +30,8 @@ struct cpu_reduce_scatter_coll : cpu_base_coll<Dtype, reduce_scatter_strategy_im
                                    ccl::communicator& comm,
                                    ccl::stream& stream,
                                    size_t rank_idx) override {
-        Dtype sbuf_expected = comm.rank();
-        Dtype rbuf_expected = (comm.size() - 1) * ((float)comm.size() / 2);
+        Dtype sbuf_expected = get_val<Dtype>(static_cast<float>(comm.rank()));
+        Dtype rbuf_expected = get_val<Dtype>((comm.size() - 1) * ((float)comm.size() / 2));
         Dtype value;
 
         size_t recv_elem_count = elem_count / comm.size();
diff --git a/examples/benchmark/src/reduce_scatter/sycl_reduce_scatter_coll.hpp b/examples/benchmark/src/reduce_scatter/sycl_reduce_scatter_coll.hpp
index 0013ec1cb..d57b7126a 100644
--- a/examples/benchmark/src/reduce_scatter/sycl_reduce_scatter_coll.hpp
+++ b/examples/benchmark/src/reduce_scatter/sycl_reduce_scatter_coll.hpp
@@ -34,8 +34,8 @@ struct sycl_reduce_scatter_coll : sycl_base_coll<Dtype, reduce_scatter_strategy_
                                    ccl::communicator& comm,
                                    ccl::stream& stream,
                                    size_t rank_idx) override {
-        Dtype sbuf_expected = comm.rank();
-        Dtype rbuf_expected = (comm.size() - 1) * ((float)comm.size() / 2);
+        Dtype sbuf_expected = get_val<Dtype>(static_cast<float>(comm.rank()));
+        Dtype rbuf_expected = get_val<Dtype>((comm.size() - 1) * ((float)comm.size() / 2));
 
         size_t recv_elem_count = elem_count / comm.size();
 
diff --git a/examples/common/CMakeLists.txt b/examples/common/CMakeLists.txt
index 296a83adb..9edb0dd4e 100644
--- a/examples/common/CMakeLists.txt
+++ b/examples/common/CMakeLists.txt
@@ -27,7 +27,7 @@ foreach(src ${sources})
     target_link_libraries(${executable} PUBLIC rt)
     target_link_libraries(${executable} PUBLIC m)
     target_link_libraries(${executable} PUBLIC dl)
-    target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release_mt/)
+    target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release/)
     target_link_libraries(${executable} PUBLIC mpi)
     install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/common OPTIONAL)
 endforeach()
diff --git a/examples/cpu/CMakeLists.txt b/examples/cpu/CMakeLists.txt
index 77a2c0342..ac50a4e05 100644
--- a/examples/cpu/CMakeLists.txt
+++ b/examples/cpu/CMakeLists.txt
@@ -27,7 +27,7 @@ foreach(src ${sources})
     target_link_libraries(${executable} PUBLIC dl)
     target_link_libraries(${executable} PUBLIC pthread)
     target_link_libraries(${executable} PUBLIC stdc++)
-    target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release_mt/)
+    target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release/)
     target_link_libraries(${executable} PUBLIC mpi)
     install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/cpu OPTIONAL)
 endforeach()
diff --git a/examples/cpu/communicator.cpp b/examples/cpu/communicator.cpp
index 44b7dc291..6d3b82cc5 100644
--- a/examples/cpu/communicator.cpp
+++ b/examples/cpu/communicator.cpp
@@ -184,11 +184,6 @@ void check_comm_split_identical_color(ccl::communicator& comm) {
 }
 
 int main() {
-    /**
-     * The example only works with CCL_ATL_TRANSPORT=ofi
-     */
-    setenv("CCL_ATL_TRANSPORT", "ofi", 0);
-
     ccl::init();
 
     int mpi_size, mpi_rank;
diff --git a/examples/cpu/external_kvs.cpp b/examples/cpu/external_kvs.cpp
index c22ca77f3..55a94696f 100644
--- a/examples/cpu/external_kvs.cpp
+++ b/examples/cpu/external_kvs.cpp
@@ -88,7 +88,7 @@ int main() {
         kvs = ccl::create_kvs(main_addr);
     }
 
-    auto ext_kvs = std::make_shared<external_kvs>(kvs);
+    auto ext_kvs = std::shared_ptr<external_kvs>(new external_kvs(kvs));
 
     auto comm = ccl::create_communicator(size, rank, ext_kvs);
     auto attr = ccl::create_operation_attr<ccl::allreduce_attr>();
diff --git a/examples/external_launcher/run_binary.sh b/examples/external_launcher/run_binary.sh
index 56e430d6c..732c15748 100755
--- a/examples/external_launcher/run_binary.sh
+++ b/examples/external_launcher/run_binary.sh
@@ -147,10 +147,10 @@ function run()
     elif [[ $CCL_VARS == *"vars.sh"* ]];
     then
         echo "Use oneAPI CCL variables script"
-        source ${MPI_VARS} -i_mpi_library_kind=release_mt
+        source ${MPI_VARS}
     fi
 
-    export CCL_CONFIGURATION="cpu_icc"
+    export CCL_CONFIGURATION="cpu"
     source ${CCL_VARS} --ccl-configuration="${CCL_CONFIGURATION}"
 
     eval `echo $binary_env $binary_path $binary_arg ;` &> $LOG_FILE
diff --git a/examples/include/base_utils.hpp b/examples/include/base_utils.hpp
index 5dd68a1dc..ca6c8e370 100644
--- a/examples/include/base_utils.hpp
+++ b/examples/include/base_utils.hpp
@@ -16,10 +16,13 @@
 #pragma once
 
 #include <algorithm>
+#include <cstdlib>
 #include <iterator>
+#include <new>
 #include <sstream>
 #include <tuple>
 #include <utility>
+#include <vector>
 
 template <int CurIndex, class T, class U, class... Args>
 struct get_tuple_elem_index {
@@ -109,6 +112,66 @@ void ccl_tuple_for_each_indexed(functor f, const FunctionArgs&... args) {
         f, is_tuple_finished_t{}, args...);
 }
 
+/**
+ * Minimal allocator returning storage aligned to `align` bytes.
+ * Memory is obtained with aligned_alloc() and released with free(); the allocator is stateless.
+ */
+template <class T, size_t align>
+struct aligned_allocator {
+    static_assert(align != 0 && (align & (align - 1)) == 0, "align must be a power of two");
+
+    using value_type = T;
+    using pointer = T*;
+
+    template <class U>
+    struct rebind {
+        using other = aligned_allocator<U, align>;
+    };
+
+    aligned_allocator() = default;
+    ~aligned_allocator() = default;
+
+    template <class U, size_t Ualign>
+    constexpr aligned_allocator(const aligned_allocator<U, Ualign>&) noexcept {}
+
+    /* Allocates storage for n objects of type T; throws std::bad_alloc on failure. */
+    pointer allocate(size_t n) {
+        /* reject requests whose padded byte size would not fit into size_t */
+        if (n > (static_cast<size_t>(-1) - align) / sizeof(value_type)) {
+            throw std::bad_alloc();
+        }
+
+        /* C11 aligned_alloc() expects the size to be a non-zero multiple of the alignment */
+        size_t bytes = sizeof(value_type) * n;
+        bytes = ((bytes ? bytes : 1) + align - 1) / align * align;
+
+        void* ptr = aligned_alloc(align, bytes);
+        if (!ptr) {
+            throw std::bad_alloc();
+        }
+        return reinterpret_cast<pointer>(ptr);
+    }
+
+    /* Releases storage obtained from allocate(); the element count is not needed by free(). */
+    void deallocate(pointer ptr, size_t /* n */) noexcept {
+        free(ptr);
+    }
+
+    /* Stateless: every instance can free memory allocated by any other instance. */
+    template <class U, size_t Ualign>
+    bool operator==(const aligned_allocator<U, Ualign>&) const noexcept {
+        return true;
+    }
+
+    template <class U, size_t Ualign>
+    bool operator!=(const aligned_allocator<U, Ualign>&) const noexcept {
+        return false;
+    }
+};
+
+template <class T, size_t align = 4 * 1024>
+using aligned_vector = std::vector<T, aligned_allocator<T, align>>;
+
 namespace utils {
 
 template <typename T>
diff --git a/examples/include/bf16.hpp b/examples/include/bf16.hpp
index 3b8039d24..c05bba421 100644
--- a/examples/include/bf16.hpp
+++ b/examples/include/bf16.hpp
@@ -45,8 +45,8 @@ int is_bf16_enabled() {
     __asm__ __volatile__("cpuid"
                          : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
                          : "a"(7), "c"(0));
-    is_avx512f_enabled =
-        ((reg[1] & (1 << 16)) >> 16) & ((reg[1] & (1 << 30)) >> 30) & ((reg[1] & (1 << 31)) >> 31);
+    is_avx512f_enabled = ((reg[1] & (1u << 16)) >> 16) & ((reg[1] & (1u << 30)) >> 30) &
+                         ((reg[1] & (1u << 31)) >> 31);
 
     return (is_avx512f_enabled) ? 1 : 0;
 #else
diff --git a/examples/include/sycl_base.hpp b/examples/include/sycl_base.hpp
index f944018f2..12948129f 100644
--- a/examples/include/sycl_base.hpp
+++ b/examples/include/sycl_base.hpp
@@ -74,22 +74,19 @@ inline bool check_sycl_usm(queue& q, usm::alloc alloc_type) {
 }
 
 inline std::string get_preferred_gpu_platform_name() {
-    std::string filter;
     std::string result;
 
-    if (getenv("SYCL_DEVICE_FILTER") == nullptr) {
-        filter = "level-zero";
-    }
-    else if (getenv("SYCL_DEVICE_FILTER") != nullptr) {
-        if (std::strstr(getenv("SYCL_DEVICE_FILTER"), "level_zero") != NULL) {
+    std::string filter = "level-zero";
+    char* env = getenv("SYCL_DEVICE_FILTER");
+    if (env) {
+        if (std::strstr(env, "level_zero")) {
             filter = "level-zero";
         }
-        else if (std::strstr(getenv("SYCL_DEVICE_FILTER"), "opencl") != NULL) {
+        else if (std::strstr(env, "opencl")) {
             filter = "opencl";
         }
         else {
-            throw std::runtime_error("invalid device filter: " +
-                                     std::string(getenv("SYCL_DEVICE_FILTER")));
+            throw std::runtime_error("invalid device filter: " + std::string(env));
         }
     }
 
@@ -131,31 +128,27 @@ inline std::string get_preferred_gpu_platform_name() {
 }
 
 inline std::vector<sycl::device> create_sycl_gpu_devices() {
-    constexpr char dev_prefix[] = "-- ";
-    constexpr char sub_dev_prefix[] = "---- ";
+    constexpr char prefix[] = "-- ";
 
     std::vector<sycl::device> result;
     auto plaform_list = sycl::platform::get_platforms();
     auto preferred_platform_name = get_preferred_gpu_platform_name();
 
     std::stringstream ss;
-    ss << "preferred platform: [" << preferred_platform_name << "]\n";
+    std::stringstream ss_warn;
 
     for (const auto& platform : plaform_list) {
         auto platform_name = platform.get_info<sycl::info::platform::name>();
-
-        if (platform_name.compare(preferred_platform_name) != 0)
+        if (platform_name.compare(preferred_platform_name) != 0) {
             continue;
-
-        ss << "platform: [" << platform_name << "]\n";
+        }
 
         auto device_list = platform.get_devices();
-
         for (const auto& device : device_list) {
             auto device_name = device.get_info<cl::sycl::info::device::name>();
 
             if (!device.is_gpu()) {
-                ss << dev_prefix << "device [" << device_name << "] is not GPU, skipping\n";
+                ss_warn << prefix << "device [" << device_name << "] is not GPU, skipping\n";
                 continue;
             }
 
@@ -165,9 +158,9 @@ inline std::vector<sycl::device> create_sycl_gpu_devices() {
                           part_props.end(),
                           info::partition_property::partition_by_affinity_domain) ==
                 part_props.end()) {
-                ss << dev_prefix << "device [" << device_name
-                   << "] does not support partition by affinity domain"
-                   << ", use root device\n";
+                ss_warn << prefix << "device [" << device_name
+                        << "] does not support partition by affinity domain"
+                        << ", use root device\n";
                 result.push_back(device);
                 continue;
             }
@@ -179,37 +172,32 @@ inline std::vector<sycl::device> create_sycl_gpu_devices() {
                           part_affinity_domains.end(),
                           info::partition_affinity_domain::next_partitionable) ==
                 part_affinity_domains.end()) {
-                ss << dev_prefix << "device [" << device_name
-                   << "] does not support next_partitionable affinity domain"
-                   << ", use root device\n";
+                ss_warn << prefix << "device [" << device_name
+                        << "] does not support next_partitionable affinity domain"
+                        << ", use root device\n";
                 result.push_back(device);
                 continue;
             }
 
-            ss << dev_prefix << "device [" << device_name << "] should provide "
-               << device.template get_info<info::device::partition_max_sub_devices>()
-               << " sub-devices\n";
-
             auto sub_devices =
                 device.create_sub_devices<info::partition_property::partition_by_affinity_domain>(
                     info::partition_affinity_domain::next_partitionable);
 
+            size_t sub_devices_max =
+                device.template get_info<info::device::partition_max_sub_devices>();
+            if (sub_devices.size() != sub_devices_max) { // reported only, not fatal
+                ss_warn << prefix << "device [" << device_name << "] expected " << sub_devices_max
+                        << " sub-devices, but got " << sub_devices.size() << "\n";
+            }
+
             if (sub_devices.empty()) {
-                /* TODO: remove when SYCL/L0 sub-devices will be supported */
-                ss << dev_prefix << "device [" << device_name << "] does not provide sub-devices"
-                   << ", use root device\n";
+                ss_warn << prefix << "device [" << device_name << "] does not provide sub-devices"
+                        << ", use root device\n";
                 result.push_back(device);
                 continue;
             }
 
-            ss << dev_prefix << "device [" << device_name << "] provides " << sub_devices.size()
-               << " sub-devices\n";
             result.insert(result.end(), sub_devices.begin(), sub_devices.end());
-
-            for (size_t idx = 0; idx < sub_devices.size(); idx++) {
-                ss << sub_dev_prefix << "sub-device " << idx << ": ["
-                   << sub_devices[idx].get_info<cl::sycl::info::device::name>() << "]\n";
-            }
         }
     }
 
@@ -217,7 +205,9 @@ inline std::vector<sycl::device> create_sycl_gpu_devices() {
         throw std::runtime_error("no GPU devices found");
     }
 
-    ss << "found: " << result.size() << " GPU device(s)\n";
+    ss << "preferred platform: " << preferred_platform_name << ", found: " << result.size()
+       << " GPU device(s)\n";
+    ss << ss_warn.str();
     printf("%s", ss.str().c_str());
 
     return result;
@@ -442,95 +432,3 @@ struct buf_allocator {
     queue q;
     set<T*> memory_storage;
 };
-
-template <class data_native_type, usm::alloc... types>
-struct usm_polymorphic_allocator {
-    using native_type = data_native_type;
-    using allocator_types = tuple<usm_allocator<native_type, types>...>;
-    using integer_usm_type = typename underlying_type<usm::alloc>::type;
-    using self_t = usm_polymorphic_allocator<data_native_type, types...>;
-
-    usm_polymorphic_allocator(queue& q)
-            : allocators{ make_tuple(usm_allocator<native_type, types>(q)...) } {}
-
-    ~usm_polymorphic_allocator() {
-        for (auto& v : memory_storage) {
-            data_native_type* mem = v.first;
-            deallocate(mem, v.second.size, v.second.type);
-        }
-    }
-
-private:
-    struct alloc_info {
-        size_t size;
-        usm::alloc type;
-    };
-    map<data_native_type*, alloc_info> memory_storage;
-
-    struct alloc_impl {
-        alloc_impl(native_type** out_ptr, size_t count, usm::alloc type, self_t* parent)
-                : out_usm_memory_pointer(out_ptr),
-                  size(count),
-                  alloc_index(0),
-                  requested_alloc_type(type),
-                  owner(parent) {}
-
-        template <class specific_allocator>
-        void operator()(specific_allocator& al) {
-            if (alloc_index++ == static_cast<integer_usm_type>(requested_alloc_type)) {
-                *out_usm_memory_pointer = al.allocate(size);
-
-                alloc_info info{ size, requested_alloc_type };
-                owner->memory_storage.emplace(*out_usm_memory_pointer, info);
-            }
-        }
-        native_type** out_usm_memory_pointer;
-        size_t size{};
-        int alloc_index{};
-        usm::alloc requested_alloc_type;
-        self_t* owner;
-    };
-
-    struct dealloc_impl {
-        dealloc_impl(native_type** in_ptr, size_t count, usm::alloc type, self_t* parent)
-                : in_usm_memory_pointer(in_ptr),
-                  size(count),
-                  alloc_index(0),
-                  requested_alloc_type(type),
-                  owner(parent) {}
-
-        template <class specific_allocator>
-        void operator()(specific_allocator& al) {
-            if (alloc_index++ == static_cast<integer_usm_type>(requested_alloc_type)) {
-                auto it = owner->memory_storage.find(*in_usm_memory_pointer);
-                if (it == owner->memory_storage.end()) {
-                    throw std::runtime_error(string(__PRETTY_FUNCTION__) +
-                                             " - not owns memory object");
-                }
-
-                al.deallocate(*in_usm_memory_pointer, size);
-                *in_usm_memory_pointer = nullptr;
-
-                owner->memory_storage.erase(it);
-            }
-        }
-        native_type** in_usm_memory_pointer;
-        size_t size;
-        int alloc_index;
-        usm::alloc requested_alloc_type;
-        self_t* owner;
-    };
-
-public:
-    allocator_types allocators;
-
-    native_type* allocate(size_t size, usm::alloc type) {
-        native_type* ret = nullptr;
-        ccl_tuple_for_each(allocators, alloc_impl{ &ret, size, type, this });
-        return ret;
-    }
-
-    void deallocate(native_type* in_ptr, size_t size, usm::alloc type) {
-        ccl_tuple_for_each(allocators, dealloc_impl{ &in_ptr, size, type, this });
-    }
-};
diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt
index ff4fc8b5b..7426280bc 100644
--- a/examples/sycl/CMakeLists.txt
+++ b/examples/sycl/CMakeLists.txt
@@ -27,7 +27,7 @@ foreach(src ${sources})
     target_link_libraries(${executable} PUBLIC rt)
     target_link_libraries(${executable} PUBLIC m)
     target_link_libraries(${executable} PRIVATE ccl)
-    target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release_mt/)
+    target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release/)
     target_link_libraries(${executable} PUBLIC mpi)
     target_link_libraries(${executable} PRIVATE ${COMPUTE_BACKEND_TARGET_NAME})
     install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/sycl OPTIONAL)
diff --git a/examples/sycl/sycl_allgatherv_inplace_usm_test.cpp b/examples/sycl/sycl_allgatherv_inplace_usm_test.cpp
new file mode 100644
index 000000000..856c675d0
--- /dev/null
+++ b/examples/sycl/sycl_allgatherv_inplace_usm_test.cpp
@@ -0,0 +1,128 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "sycl_base.hpp"
+
+using namespace std;
+using namespace sycl;
+
+int main(int argc, char *argv[]) {
+    const size_t count = 10 * 1024 * 1024;
+
+    int size = 0;
+    int rank = 0;
+
+    ccl::init();
+
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    atexit(mpi_finalize);
+
+    queue q;
+    if (!create_sycl_queue(argc, argv, rank, q)) {
+        return -1;
+    }
+
+    buf_allocator<int> allocator(q);
+
+    auto usm_alloc_type = usm::alloc::shared;
+    if (argc > 2) {
+        usm_alloc_type = usm_alloc_type_from_string(argv[2]);
+    }
+
+    if (!check_sycl_usm(q, usm_alloc_type)) {
+        return -1;
+    }
+
+    /* create kvs */
+    ccl::shared_ptr_class<ccl::kvs> kvs;
+    ccl::kvs::address_type main_addr;
+    if (rank == 0) {
+        kvs = ccl::create_main_kvs();
+        main_addr = kvs->get_address();
+        MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+    }
+    else {
+        MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+        kvs = ccl::create_kvs(main_addr);
+    }
+
+    /* create communicator */
+    auto dev = ccl::create_device(q.get_device());
+    auto ctx = ccl::create_context(q.get_context());
+    auto comm = ccl::create_communicator(size, rank, dev, ctx, kvs);
+
+    /* create stream */
+    auto stream = ccl::create_stream(q);
+
+    /* create buffers */
+    auto recv_buf = allocator.allocate(count * size, usm_alloc_type);
+
+    buffer<int> expected_buf(count * size);
+    buffer<int> check_buf(count * size);
+    vector<size_t> recv_counts(size, count);
+
+    /* open buffers and modify them on the device side */
+    auto e = q.submit([&](auto &h) {
+        accessor expected_buf_acc(expected_buf, h, write_only);
+        h.parallel_for(count, [=](auto id) {
+            recv_buf[rank * count + id] = rank + 1;
+            for (int i = 0; i < size; i++) {
+                expected_buf_acc[i * count + id] = i + 1;
+            }
+        });
+    });
+
+    /* do not wait completion of kernel and provide it as dependency for operation */
+    vector<ccl::event> deps;
+    deps.push_back(ccl::create_event(e));
+
+    /* invoke allgatherv */
+    auto attr = ccl::create_operation_attr<ccl::allgatherv_attr>();
+    ccl::allgatherv(recv_buf, count, recv_buf, recv_counts, comm, stream, attr, deps).wait();
+
+    /* open recv_buf and check its correctness on the device side */
+    q.submit([&](auto &h) {
+        accessor expected_buf_acc(expected_buf, h, read_only);
+        accessor check_buf_acc(check_buf, h, write_only);
+        h.parallel_for(size * count, [=](auto id) {
+            /* store a verdict for every element: check_buf is never pre-initialized */
+            bool is_ok = (recv_buf[id] == expected_buf_acc[id]);
+            check_buf_acc[id] = is_ok ? 0 : -1;
+        });
+    });
+
+    if (!handle_exception(q))
+        return -1;
+
+    /* print out the result of the test on the host side */
+    {
+        host_accessor check_buf_acc(check_buf, read_only);
+        size_t i;
+        for (i = 0; i < size * count; i++) {
+            if (check_buf_acc[i] == -1) {
+                cout << "FAILED\n";
+                break;
+            }
+        }
+        if (i == size * count) {
+            cout << "PASSED\n";
+        }
+    }
+
+    return 0;
+}
diff --git a/examples/sycl/sycl_allgatherv_test.cpp b/examples/sycl/sycl_allgatherv_test.cpp
index e176b240b..bb76dcddb 100644
--- a/examples/sycl/sycl_allgatherv_test.cpp
+++ b/examples/sycl/sycl_allgatherv_test.cpp
@@ -95,7 +95,7 @@ int main(int argc, char *argv[]) {
     if (!handle_exception(q))
         return -1;
 
-    /* invoke allagtherv */
+    /* invoke allgatherv */
     ccl::allgatherv(send_buf, count, recv_buf, recv_counts, comm, stream).wait();
 
     /* open recv_buf and check its correctness on the device side */
diff --git a/examples/sycl/sycl_allgatherv_usm_test.cpp b/examples/sycl/sycl_allgatherv_usm_test.cpp
index 895bbd31b..59c96dff3 100644
--- a/examples/sycl/sycl_allgatherv_usm_test.cpp
+++ b/examples/sycl/sycl_allgatherv_usm_test.cpp
@@ -82,7 +82,6 @@ int main(int argc, char *argv[]) {
         accessor expected_buf_acc(expected_buf, h, write_only);
         h.parallel_for(count, [=](auto id) {
             send_buf[id] = rank + 1;
-            recv_buf[id] = -1;
             for (int i = 0; i < size; i++) {
                 expected_buf_acc[i * count + id] = i + 1;
             }
@@ -93,7 +92,7 @@ int main(int argc, char *argv[]) {
     vector<ccl::event> deps;
     deps.push_back(ccl::create_event(e));
 
-    /* invoke allagtherv */
+    /* invoke allgatherv */
     auto attr = ccl::create_operation_attr<ccl::allgatherv_attr>();
     ccl::allgatherv(send_buf, count, recv_buf, recv_counts, comm, stream, attr, deps).wait();
 
@@ -105,6 +104,9 @@ int main(int argc, char *argv[]) {
             if (recv_buf[id] != expected_buf_acc[id]) {
                 check_buf_acc[id] = -1;
             }
+            else {
+                check_buf_acc[id] = 0;
+            }
         });
     });
 
diff --git a/examples/sycl/sycl_allreduce_inplace_usm_test.cpp b/examples/sycl/sycl_allreduce_inplace_usm_test.cpp
index ab2de50d6..6624b75fd 100644
--- a/examples/sycl/sycl_allreduce_inplace_usm_test.cpp
+++ b/examples/sycl/sycl_allreduce_inplace_usm_test.cpp
@@ -87,7 +87,7 @@ int main(int argc, char *argv[]) {
     auto attr = ccl::create_operation_attr<ccl::allreduce_attr>();
     ccl::allreduce(buf, buf, count, ccl::reduction::sum, comm, stream, attr, deps).wait();
 
-    /* open recv_buf and check its correctness on the device side */
+    /* open buf and check its correctness on the device side */
     buffer<int> check_buf(count);
     q.submit([&](auto &h) {
         accessor check_buf_acc(check_buf, h, write_only);
diff --git a/examples/sycl/sycl_allreduce_usm_test.cpp b/examples/sycl/sycl_allreduce_usm_test.cpp
index 26f2ce4f4..9b27b5759 100644
--- a/examples/sycl/sycl_allreduce_usm_test.cpp
+++ b/examples/sycl/sycl_allreduce_usm_test.cpp
@@ -18,7 +18,7 @@
 using namespace std;
 using namespace sycl;
 
-int main(int argc, char *argv[]) {
+int main(int argc, char* argv[]) {
     const size_t count = 10 * 1024 * 1024;
 
     int size = 0;
@@ -54,10 +54,10 @@ int main(int argc, char *argv[]) {
     if (rank == 0) {
         kvs = ccl::create_main_kvs();
         main_addr = kvs->get_address();
-        MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+        MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
     }
     else {
-        MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+        MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
         kvs = ccl::create_kvs(main_addr);
     }
 
@@ -74,13 +74,18 @@ int main(int argc, char *argv[]) {
     auto recv_buf = allocator.allocate(count, usm_alloc_type);
 
     /* open buffers and modify them on the device side */
-    auto e = q.submit([&](auto &h) {
+    auto e = q.submit([&](auto& h) {
         h.parallel_for(count, [=](auto id) {
-            send_buf[id] = rank + 1;
+            send_buf[id] = rank + id + 1;
             recv_buf[id] = -1;
         });
     });
 
+    int check_sum = 0;
+    for (int i = 1; i <= size; ++i) {
+        check_sum += i;
+    }
+
     /* do not wait completion of kernel and provide it as dependency for operation */
     vector<ccl::event> deps;
     deps.push_back(ccl::create_event(e));
@@ -91,10 +96,10 @@ int main(int argc, char *argv[]) {
 
     /* open recv_buf and check its correctness on the device side */
     buffer<int> check_buf(count);
-    q.submit([&](auto &h) {
+    q.submit([&](auto& h) {
         accessor check_buf_acc(check_buf, h, write_only);
         h.parallel_for(count, [=](auto id) {
-            if (recv_buf[id] != size * (size + 1) / 2) {
-                check_buf_acc[id] = -1;
-            }
+            /* store a verdict for every element: check_buf is never pre-initialized */
+            bool is_ok = (recv_buf[id] == static_cast<int>(check_sum + size * id));
+            check_buf_acc[id] = is_ok ? 0 : -1;
         });
diff --git a/examples/sycl/sycl_alltoallv_test.cpp b/examples/sycl/sycl_alltoallv_test.cpp
index 2905235bc..be447e15f 100644
--- a/examples/sycl/sycl_alltoallv_test.cpp
+++ b/examples/sycl/sycl_alltoallv_test.cpp
@@ -89,7 +89,7 @@ int main(int argc, char *argv[]) {
     if (!handle_exception(q))
         return -1;
 
-    /* invoke alltoall */
+    /* invoke alltoallv */
     ccl::alltoallv(send_buf, send_counts, recv_buf, recv_counts, comm, stream).wait();
 
     /* open recv_buf and check its correctness on the device side */
diff --git a/examples/sycl/sycl_alltoallv_usm_test.cpp b/examples/sycl/sycl_alltoallv_usm_test.cpp
index 211b548d1..89b884f13 100644
--- a/examples/sycl/sycl_alltoallv_usm_test.cpp
+++ b/examples/sycl/sycl_alltoallv_usm_test.cpp
@@ -88,7 +88,7 @@ int main(int argc, char *argv[]) {
     vector<ccl::event> deps;
     deps.push_back(ccl::create_event(e));
 
-    /* invoke alltoall */
+    /* invoke alltoallv */
     auto attr = ccl::create_operation_attr<ccl::alltoallv_attr>();
     ccl::alltoallv(send_buf, send_counts, recv_buf, recv_counts, comm, stream, attr, deps).wait();
 
diff --git a/examples/sycl/sycl_reduce_inplace_usm_test.cpp b/examples/sycl/sycl_reduce_inplace_usm_test.cpp
new file mode 100644
index 000000000..c28770120
--- /dev/null
+++ b/examples/sycl/sycl_reduce_inplace_usm_test.cpp
@@ -0,0 +1,137 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "sycl_base.hpp"
+
+using namespace std;
+using namespace sycl;
+
+int main(int argc, char* argv[]) {
+    const size_t count = 10 * 1024 * 1024;
+    int root_rank = 1;
+
+    int size = 0;
+    int rank = 0;
+
+    ccl::init();
+
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    atexit(mpi_finalize);
+
+    queue q;
+    if (!create_sycl_queue(argc, argv, rank, q)) {
+        return -1;
+    }
+
+    buf_allocator<int> allocator(q);
+
+    auto usm_alloc_type = usm::alloc::shared;
+    if (argc > 2) {
+        usm_alloc_type = usm_alloc_type_from_string(argv[2]);
+    }
+    /* root must be a valid rank; the default (1) is not when size == 1, so fall back to 0 */
+    root_rank = (argc > 3) ? atoi(argv[3]) : root_rank;
+    root_rank = (root_rank >= 0 && root_rank < size) ? root_rank : 0;
+    if (rank == root_rank) {
+        printf("root rank: %d\n", root_rank);
+    }
+
+    if (!check_sycl_usm(q, usm_alloc_type)) {
+        return -1;
+    }
+
+    /* create kvs */
+    ccl::shared_ptr_class<ccl::kvs> kvs;
+    ccl::kvs::address_type main_addr;
+    if (rank == 0) {
+        kvs = ccl::create_main_kvs();
+        main_addr = kvs->get_address();
+        MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+    }
+    else {
+        MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+        kvs = ccl::create_kvs(main_addr);
+    }
+
+    /* create communicator */
+    auto dev = ccl::create_device(q.get_device());
+    auto ctx = ccl::create_context(q.get_context());
+    auto comm = ccl::create_communicator(size, rank, dev, ctx, kvs);
+
+    /* create stream */
+    auto stream = ccl::create_stream(q);
+
+    /* create buffers */
+    auto buf = allocator.allocate(count, usm_alloc_type);
+
+    /* open buffers and modify them on the device side */
+    auto e = q.submit([&](auto& h) {
+        h.parallel_for(count, [=](auto id) {
+            buf[id] = rank + id + 1;
+        });
+    });
+
+    int check_sum = 0;
+    for (int i = 1; i <= size; ++i) {
+        check_sum += i;
+    }
+
+    /* do not wait completion of kernel and provide it as dependency for operation */
+    vector<ccl::event> deps;
+    deps.push_back(ccl::create_event(e));
+
+    /* invoke reduce */
+    auto attr = ccl::create_operation_attr<ccl::reduce_attr>();
+    ccl::reduce(buf, buf, count, ccl::reduction::sum, root_rank, comm, stream, attr, deps).wait();
+
+    /* open buf and check its correctness on the device side */
+    buffer<int> check_buf(count);
+
+    q.submit([&](auto& h) {
+        accessor check_buf_acc(check_buf, h, write_only);
+        h.parallel_for(count, [=](auto id) {
+            int expected = (rank == root_rank) ? (check_sum + size * id) : (rank + id + 1);
+            if (buf[id] != expected) {
+                check_buf_acc[id] = -1;
+            }
+            else {
+                check_buf_acc[id] = 0;
+            }
+        });
+    });
+
+    if (!handle_exception(q))
+        return -1;
+
+    /* print out the result of the test on the host side */
+    {
+        host_accessor check_buf_acc(check_buf, read_only);
+        size_t i;
+        for (i = 0; i < count; i++) {
+            if (check_buf_acc[i] == -1) {
+                cout << "FAILED\n";
+                break;
+            }
+        }
+        if (i == count) {
+            cout << "PASSED\n";
+        }
+    }
+
+    return 0;
+}
diff --git a/examples/sycl/sycl_reduce_scatter_test.cpp b/examples/sycl/sycl_reduce_scatter_test.cpp
new file mode 100644
index 000000000..e91df88f5
--- /dev/null
+++ b/examples/sycl/sycl_reduce_scatter_test.cpp
@@ -0,0 +1,128 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "sycl_base.hpp"
+
+using namespace std;
+using namespace sycl;
+
+int main(int argc, char *argv[]) {
+    const size_t count = 10 * 1024 * 1024;
+
+    int size = 0;
+    int rank = 0;
+
+    ccl::init();
+
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    atexit(mpi_finalize);
+
+    queue q;
+    if (!create_sycl_queue(argc, argv, rank, q)) {
+        return -1;
+    }
+
+    /* create kvs */
+    ccl::shared_ptr_class<ccl::kvs> kvs;
+    ccl::kvs::address_type main_addr;
+    if (rank == 0) {
+        kvs = ccl::create_main_kvs();
+        main_addr = kvs->get_address();
+        MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+    }
+    else {
+        MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+        kvs = ccl::create_kvs(main_addr);
+    }
+
+    /* create communicator */
+    auto dev = ccl::create_device(q.get_device());
+    auto ctx = ccl::create_context(q.get_context());
+    auto comm = ccl::create_communicator(size, rank, dev, ctx, kvs);
+
+    /* create stream */
+    auto stream = ccl::create_stream(q);
+
+    /* create buffers */
+    buffer<int> send_buf(count * size);
+    buffer<int> expected_buf(count);
+    buffer<int> recv_buf(count);
+
+    {
+        /* open buffers and initialize them on the host side */
+        host_accessor send_buf_acc(send_buf, write_only);
+        host_accessor recv_buf_acc(recv_buf, write_only);
+        host_accessor expected_acc_buf(expected_buf, write_only);
+
+        for (size_t i = 0; i < count * size; i++) {
+            send_buf_acc[i] = rank;
+        }
+        for (size_t i = 0; i < count; i++) {
+            recv_buf_acc[i] = -1;
+        }
+
+        for (size_t i = 0; i < count; i++) {
+            expected_acc_buf[i] = size * (size + 1) / 2;
+        }
+    }
+
+    /* open send_buf for read-write access (+= reads it) and modify it on the device side */
+    q.submit([&](auto &h) {
+        accessor send_buf_acc(send_buf, h, read_write);
+        h.parallel_for(count * size, [=](auto id) {
+            send_buf_acc[id] += 1;
+        });
+    });
+
+    if (!handle_exception(q))
+        return -1;
+
+    /* invoke reduce_scatter */
+    ccl::reduce_scatter(send_buf, recv_buf, count, ccl::reduction::sum, comm, stream).wait();
+
+    /* open recv_buf for read-write access (compared, then flagged) on the device side */
+    q.submit([&](auto &h) {
+        accessor recv_buf_acc(recv_buf, h, read_write);
+        accessor expected_buf_acc(expected_buf, h, read_only);
+        h.parallel_for(count, [=](auto id) {
+            if (recv_buf_acc[id] != expected_buf_acc[id]) {
+                recv_buf_acc[id] = -1;
+            }
+        });
+    });
+
+    if (!handle_exception(q))
+        return -1;
+
+    /* print out the result of the test on the host side */
+    {
+        host_accessor recv_buf_acc(recv_buf, read_only);
+        size_t i;
+        for (i = 0; i < count; i++) {
+            if (recv_buf_acc[i] == -1) {
+                cout << "FAILED\n";
+                break;
+            }
+        }
+        if (i == count) {
+            cout << "PASSED\n";
+        }
+    }
+
+    return 0;
+}
diff --git a/examples/sycl/sycl_reduce_scatter_usm_test.cpp b/examples/sycl/sycl_reduce_scatter_usm_test.cpp
new file mode 100644
index 000000000..fd953d314
--- /dev/null
+++ b/examples/sycl/sycl_reduce_scatter_usm_test.cpp
@@ -0,0 +1,130 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "sycl_base.hpp"
+
+using namespace std;
+using namespace sycl;
+
+int main(int argc, char *argv[]) {
+    const size_t count = 10 * 1024 * 1024;
+
+    int size = 0;
+    int rank = 0;
+
+    ccl::init();
+
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    atexit(mpi_finalize);
+
+    queue q;
+    if (!create_sycl_queue(argc, argv, rank, q)) {
+        return -1;
+    }
+
+    buf_allocator<int> allocator(q);
+
+    auto usm_alloc_type = usm::alloc::shared;
+    if (argc > 2) {
+        usm_alloc_type = usm_alloc_type_from_string(argv[2]);
+    }
+
+    if (!check_sycl_usm(q, usm_alloc_type)) {
+        return -1;
+    }
+
+    /* create kvs */
+    ccl::shared_ptr_class<ccl::kvs> kvs;
+    ccl::kvs::address_type main_addr;
+    if (rank == 0) {
+        kvs = ccl::create_main_kvs();
+        main_addr = kvs->get_address();
+        MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+    }
+    else {
+        MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+        kvs = ccl::create_kvs(main_addr);
+    }
+
+    /* create communicator */
+    auto dev = ccl::create_device(q.get_device());
+    auto ctx = ccl::create_context(q.get_context());
+    auto comm = ccl::create_communicator(size, rank, dev, ctx, kvs);
+
+    /* create stream */
+    auto stream = ccl::create_stream(q);
+
+    /* create buffers */
+    auto send_buf = allocator.allocate(count * size, usm_alloc_type);
+    auto recv_buf = allocator.allocate(count, usm_alloc_type);
+
+    buffer<int> expected_buf(count);
+    buffer<int> check_buf(count);
+
+    /* open buffers and modify them on the device side */
+    auto e = q.submit([&](auto &h) {
+        accessor expected_buf_acc(expected_buf, h, write_only);
+        h.parallel_for(count, [=](auto id) {
+            recv_buf[id] = -1;
+            expected_buf_acc[id] = size * (size - 1) / 2;
+            for (int i = 0; i < size; i++) {
+                send_buf[i * count + id] = rank;
+            }
+        });
+    });
+
+    /* do not wait completion of kernel and provide it as dependency for operation */
+    vector<ccl::event> deps;
+    deps.push_back(ccl::create_event(e));
+
+    /* invoke reduce_scatter */
+    auto attr = ccl::create_operation_attr<ccl::reduce_scatter_attr>();
+    ccl::reduce_scatter(send_buf, recv_buf, count, ccl::reduction::sum, comm, stream, attr, deps)
+        .wait();
+
+    /* open recv_buf and check its correctness on the device side */
+    q.submit([&](auto &h) {
+        accessor expected_buf_acc(expected_buf, h, read_only);
+        accessor check_buf_acc(check_buf, h, write_only);
+        h.parallel_for(count, [=](auto id) {
+            /* store a verdict for every element: check_buf is never pre-initialized */
+            bool is_ok = (recv_buf[id] == expected_buf_acc[id]);
+            check_buf_acc[id] = is_ok ? 0 : -1;
+        });
+    });
+
+    if (!handle_exception(q))
+        return -1;
+
+    /* print out the result of the test on the host side */
+    {
+        host_accessor check_buf_acc(check_buf, read_only);
+        size_t i;
+        for (i = 0; i < count; i++) {
+            if (check_buf_acc[i] == -1) {
+                cout << "FAILED\n";
+                break;
+            }
+        }
+        if (i == count) {
+            cout << "PASSED\n";
+        }
+    }
+
+    return 0;
+}
diff --git a/examples/sycl/sycl_reduce_usm_test.cpp b/examples/sycl/sycl_reduce_usm_test.cpp
new file mode 100644
index 000000000..dc9dbd169
--- /dev/null
+++ b/examples/sycl/sycl_reduce_usm_test.cpp
@@ -0,0 +1,140 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "sycl_base.hpp"
+
+using namespace std;
+using namespace sycl;
+
+int main(int argc, char* argv[]) {
+    const size_t count = 10 * 1024 * 1024;
+    int root_rank = 1;
+
+    int size = 0;
+    int rank = 0;
+
+    ccl::init();
+
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    atexit(mpi_finalize);
+
+    queue q;
+    if (!create_sycl_queue(argc, argv, rank, q)) {
+        return -1;
+    }
+
+    buf_allocator<int> allocator(q);
+
+    auto usm_alloc_type = usm::alloc::shared;
+    if (argc > 2) {
+        usm_alloc_type = usm_alloc_type_from_string(argv[2]);
+    }
+    /* root must be a valid rank; the default (1) is not when size == 1, so fall back to 0 */
+    root_rank = (argc > 3) ? atoi(argv[3]) : root_rank;
+    root_rank = (root_rank >= 0 && root_rank < size) ? root_rank : 0;
+    if (rank == root_rank) {
+        printf("root rank: %d\n", root_rank);
+    }
+
+    if (!check_sycl_usm(q, usm_alloc_type)) {
+        return -1;
+    }
+
+    /* create kvs */
+    ccl::shared_ptr_class<ccl::kvs> kvs;
+    ccl::kvs::address_type main_addr;
+    if (rank == 0) {
+        kvs = ccl::create_main_kvs();
+        main_addr = kvs->get_address();
+        MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+    }
+    else {
+        MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+        kvs = ccl::create_kvs(main_addr);
+    }
+
+    /* create communicator */
+    auto dev = ccl::create_device(q.get_device());
+    auto ctx = ccl::create_context(q.get_context());
+    auto comm = ccl::create_communicator(size, rank, dev, ctx, kvs);
+
+    /* create stream */
+    auto stream = ccl::create_stream(q);
+
+    /* create buffers */
+    auto send_buf = allocator.allocate(count, usm_alloc_type);
+    auto recv_buf = allocator.allocate(count, usm_alloc_type);
+
+    /* open buffers and modify them on the device side */
+    auto e = q.submit([&](auto& h) {
+        h.parallel_for(count, [=](auto id) {
+            send_buf[id] = rank + id + 1;
+            recv_buf[id] = -1;
+        });
+    });
+
+    int check_sum = 0;
+    for (int i = 1; i <= size; ++i) {
+        check_sum += i;
+    }
+
+    /* do not wait completion of kernel and provide it as dependency for operation */
+    vector<ccl::event> deps;
+    deps.push_back(ccl::create_event(e));
+
+    /* invoke reduce */
+    auto attr = ccl::create_operation_attr<ccl::reduce_attr>();
+    ccl::reduce(send_buf, recv_buf, count, ccl::reduction::sum, root_rank, comm, stream, attr, deps)
+        .wait();
+
+    /* open recv_buf and check its correctness on the device side */
+    buffer<int> check_buf(count);
+
+    q.submit([&](auto& h) {
+        accessor check_buf_acc(check_buf, h, write_only);
+        h.parallel_for(count, [=](auto id) {
+            int expected = (rank == root_rank) ? (check_sum + size * id) : -1;
+            if (recv_buf[id] != expected) {
+                check_buf_acc[id] = -1;
+            }
+            else {
+                check_buf_acc[id] = 0;
+            }
+        });
+    });
+
+    if (!handle_exception(q))
+        return -1;
+
+    /* print out the result of the test on the host side */
+    {
+        host_accessor check_buf_acc(check_buf, read_only);
+        size_t i;
+        for (i = 0; i < count; i++) {
+            if (check_buf_acc[i] == -1) {
+                cout << "FAILED\n";
+                break;
+            }
+        }
+        if (i == count) {
+            cout << "PASSED\n";
+        }
+    }
+
+    return 0;
+}
diff --git a/include/oneapi/ccl/coll_attr.hpp b/include/oneapi/ccl/coll_attr.hpp
index a99454024..43d68e9b8 100644
--- a/include/oneapi/ccl/coll_attr.hpp
+++ b/include/oneapi/ccl/coll_attr.hpp
@@ -333,7 +333,6 @@ class barrier_attr : public ccl_api_base_copyable<barrier_attr,
     barrier_attr(
         const typename detail::ccl_api_type_attr_traits<operation_attr_id,
                                                         operation_attr_id::version>::type& version);
-    ;
 };
 
 /**
@@ -513,7 +512,6 @@ class reduce_scatter_attr : public ccl_api_base_copyable<reduce_scatter_attr,
     reduce_scatter_attr(
         const typename detail::ccl_api_type_attr_traits<operation_attr_id,
                                                         operation_attr_id::version>::type& version);
-    ;
 };
 
 /**
@@ -574,7 +572,6 @@ class sparse_allreduce_attr : public ccl_api_base_copyable<sparse_allreduce_attr
     sparse_allreduce_attr(
         const typename detail::ccl_api_type_attr_traits<operation_attr_id,
                                                         operation_attr_id::version>::type& version);
-    ;
 };
 
 /**
diff --git a/include/oneapi/ccl/config.h.in b/include/oneapi/ccl/config.h.in
index 1f2ad4e5f..6fe6015ea 100644
--- a/include/oneapi/ccl/config.h.in
+++ b/include/oneapi/ccl/config.h.in
@@ -30,19 +30,15 @@
 
 #define ONECCL_SPEC_VERSION "1.0"
 
-#define CCL_MAJOR_VERSION                   @CCL_MAJOR_VERSION@
-#define CCL_MINOR_VERSION                   @CCL_MINOR_VERSION@
-#define CCL_UPDATE_VERSION                  @CCL_UPDATE_VERSION@
-#cmakedefine CCL_PRODUCT_STATUS             "@CCL_PRODUCT_STATUS@"
-#cmakedefine CCL_PRODUCT_BUILD_DATE         "@CCL_PRODUCT_BUILD_DATE@"
-#cmakedefine CCL_PRODUCT_FULL               "@CCL_PRODUCT_FULL@"
+#define CCL_MAJOR_VERSION           @CCL_MAJOR_VERSION@
+#define CCL_MINOR_VERSION           @CCL_MINOR_VERSION@
+#define CCL_UPDATE_VERSION          @CCL_UPDATE_VERSION@
+#cmakedefine CCL_PRODUCT_STATUS     "@CCL_PRODUCT_STATUS@"
+#cmakedefine CCL_PRODUCT_BUILD_DATE "@CCL_PRODUCT_BUILD_DATE@"
+#cmakedefine CCL_PRODUCT_FULL       "@CCL_PRODUCT_FULL@"
 
 /* Auto-generated configuration settings for SYCL support */
 #cmakedefine CCL_ENABLE_SYCL
 
-#ifdef CCL_ENABLE_SYCL
-@CCL_ENABLE_SYCL_CHECK_CONTRACT@
-#endif
-
-/* Auto-generated configuration settings for multi GPU support*/
-#cmakedefine MULTI_GPU_SUPPORT
+/* Auto-generated configuration settings for Level Zero support */
+#cmakedefine CCL_ENABLE_ZE
diff --git a/include/oneapi/ccl/device_types.hpp b/include/oneapi/ccl/device_types.hpp
index bcf807348..35367af55 100644
--- a/include/oneapi/ccl/device_types.hpp
+++ b/include/oneapi/ccl/device_types.hpp
@@ -27,7 +27,7 @@ namespace ccl {
 using process_id = size_t;
 using host_id = std::string;
 
-#ifdef MULTI_GPU_SUPPORT
+#ifdef CCL_ENABLE_ZE
 constexpr size_t CCL_GPU_DEVICES_AFFINITY_MASK_SIZE = 4;
 using device_mask_t = std::bitset<CCL_GPU_DEVICES_AFFINITY_MASK_SIZE>;
 using process_aggregated_device_mask_t = std::map<process_id, device_mask_t>;
diff --git a/include/oneapi/ccl/native_device_api/empty/export.hpp b/include/oneapi/ccl/native_device_api/empty/export.hpp
index 44a239c0b..bf6190a3d 100644
--- a/include/oneapi/ccl/native_device_api/empty/export.hpp
+++ b/include/oneapi/ccl/native_device_api/empty/export.hpp
@@ -34,7 +34,7 @@ struct backend_info<CL_BACKEND_TYPE> {
         return CL_BACKEND_TYPE;
     }
     static constexpr const char* name() {
-        return "BACKEND_UNAVAILABLE";
+        return "EMPTY";
     }
 };
 
@@ -59,6 +59,7 @@ struct generic_context_type<CL_BACKEND_TYPE> {
     using impl_t = native::ccl_context;
     using ccl_native_t = std::shared_ptr<impl_t>;
 
+    generic_context_type() = default;
     template <class T>
     generic_context_type(T&& not_used) {
         (void)not_used;
diff --git a/include/oneapi/ccl/native_device_api/export_api.hpp b/include/oneapi/ccl/native_device_api/export_api.hpp
index f4b01b859..17347caae 100644
--- a/include/oneapi/ccl/native_device_api/export_api.hpp
+++ b/include/oneapi/ccl/native_device_api/export_api.hpp
@@ -17,13 +17,13 @@
 #include "oneapi/ccl/config.h"
 
 #ifdef CCL_ENABLE_SYCL
-#ifdef MULTI_GPU_SUPPORT
+#ifdef CCL_ENABLE_ZE
 #include "sycl_l0/export.hpp"
 #else
 #include "sycl/export.hpp"
 #endif
 #else
-#ifdef MULTI_GPU_SUPPORT
+#ifdef CCL_ENABLE_ZE
 #include "l0/export.hpp"
 #else
 #include "empty/export.hpp"
diff --git a/include/oneapi/ccl/native_device_api/l0/export.hpp b/include/oneapi/ccl/native_device_api/l0/export.hpp
index c56d9d9c4..1e7921ae6 100644
--- a/include/oneapi/ccl/native_device_api/l0/export.hpp
+++ b/include/oneapi/ccl/native_device_api/l0/export.hpp
@@ -31,7 +31,7 @@ struct backend_info<CL_BACKEND_TYPE> {
         return CL_BACKEND_TYPE;
     }
     static constexpr const char* name() {
-        return "LEVEL_ZERO_BACKEND";
+        return "LEVEL_ZERO";
     }
 };
 
diff --git a/include/oneapi/ccl/native_device_api/sycl/export.hpp b/include/oneapi/ccl/native_device_api/sycl/export.hpp
index ba3ce9a34..f205e825b 100644
--- a/include/oneapi/ccl/native_device_api/sycl/export.hpp
+++ b/include/oneapi/ccl/native_device_api/sycl/export.hpp
@@ -29,7 +29,7 @@ struct backend_info<CL_BACKEND_TYPE> {
         return CL_BACKEND_TYPE;
     }
     static constexpr const char* name() {
-        return "DPCPP_BACKEND";
+        return "DPCPP";
     }
 };
 
diff --git a/include/oneapi/ccl/native_device_api/sycl_l0/export.hpp b/include/oneapi/ccl/native_device_api/sycl_l0/export.hpp
index ae36064a3..4bae8ed52 100644
--- a/include/oneapi/ccl/native_device_api/sycl_l0/export.hpp
+++ b/include/oneapi/ccl/native_device_api/sycl_l0/export.hpp
@@ -30,7 +30,7 @@ struct backend_info<CL_BACKEND_TYPE> {
         return CL_BACKEND_TYPE;
     }
     static constexpr const char* name() {
-        return "DPCPP_LEVEL_ZERO_BACKEND";
+        return "DPCPP_LEVEL_ZERO";
     }
 };
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b262ba7bc..72e1eda33 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -19,112 +19,63 @@ set (EXTENSIONS_SRC)
 
 if (CCL_ENABLE_SYCL)
     list (APPEND EXTENSIONS_SRC
-                    native_device_api/l0/utils.cpp
-                    native_device_api/sycl/export.cpp
-                    native_device_api/interop_utils.cpp
+        native_device_api/l0/utils.cpp
+        native_device_api/sycl/export.cpp
+        native_device_api/interop_utils.cpp
         )
 endif(CCL_ENABLE_SYCL)
 
-if (MULTI_GPU_SUPPORT)
-list (APPEND EXTENSIONS_SRC
-                    ccl_cpp_utils.cpp
-
-                    native_device_api/l0/base.cpp
-                    native_device_api/l0/device.cpp
-                    native_device_api/l0/context.cpp
-                    native_device_api/l0/event_pool.cpp
-                    native_device_api/l0/subdevice.cpp
-                    native_device_api/l0/driver.cpp
-                    native_device_api/l0/export.cpp
-                    native_device_api/l0/platform.cpp
-                    native_device_api/l0/utils.cpp
-                    native_device_api/l0/primitives.cpp
-                    native_device_api/interop_utils.cpp
-
-                    common/comm/l0/comm_context.cpp
-                    common/comm/l0/comm_context_storage.cpp
-                    common/comm/l0/context_comm_addr.cpp
-
-                    common/comm/l0/devices/ccl_gpu_base_comm.cpp
-                    common/comm/l0/devices/ccl_gpu_comm.cpp
-                    common/comm/l0/devices/ccl_virtual_gpu_comm.cpp
-                    common/comm/l0/devices/ccl_ipc_gpu_comm.cpp
-
-                    common/comm/l0/devices/communication_structs/connection.cpp
-                    common/comm/l0/devices/communication_structs/ipc_connection.cpp
-                    common/comm/l0/devices/communication_structs/ipc_server.cpp
-                    common/comm/l0/devices/communication_structs/ipc_client.cpp
-
-                    common/comm/l0/context/process_group_ctx.cpp
-                    common/comm/l0/context/thread_group_ctx.cpp
-                    common/comm/l0/context/device_group_ctx.cpp
-                    common/comm/l0/context/device_storage.cpp
-
-                    common/comm/l0/topology/topology_serializer.cpp
-                    common/comm/l0/topology/ring/device_group_ring_creator.cpp
-                    common/comm/l0/topology/ring/thread_group_ring_creator.cpp
-                    common/comm/l0/topology/ring/process_group_ring_creator.cpp
-                    common/comm/l0/topology/topology_construction_utils.cpp
-
-                    common/comm/l0/context/scale/ipc/ipc_ctx_session.cpp
-                    common/comm/l0/context/scale/ipc/ipc_ctx_utils.cpp
-                    common/comm/l0/context/scale/ipc/ipc_session_key.cpp
-
-                    common/comm/l0/context/scale/base/base_session.cpp
-                    common/comm/l0/context/scale/scale_out/scale_out_session.cpp
-
-                    common/comm/l0/gpu_comm_attr.cpp
-                    common/comm/l0/modules/base_entry_module.cpp
-                    common/comm/l0/modules/modules_source_data.cpp
-                    common/comm/l0/modules/kernel_utils.cpp
-
-                    sched/gpu_sched.cpp
-                    sched/gpu_concurrent_sched.cpp
-                    sched/entry/gpu/ze_cache.cpp
-                    sched/entry/gpu/ze_call.cpp
-                    sched/entry/gpu/ze_primitives.cpp)
-endif(MULTI_GPU_SUPPORT)
-
-if (CCL_ENABLE_SYCL AND MULTI_GPU_SUPPORT)
+if (CCL_ENABLE_SYCL AND CCL_ENABLE_ZE)
     list (APPEND EXTENSIONS_SRC
-                    sched/entry/gpu/ze_base_entry.cpp
-                    sched/entry/gpu/ze_allreduce_entry.cpp
-                    sched/entry/gpu/ze_copy_entry.cpp
-                    sched/entry/gpu/ze_handle_exchange_entry.cpp
-                    sched/entry/gpu/ze_event_signal_entry.cpp
-                    sched/entry/gpu/ze_event_wait_entry.cpp
-                    sched/entry/gpu/ze_reduce_entry.cpp
-                    sched/entry/reduce_local_entry.cpp
-                    sched/ze_handle_manager.cpp
+
+        ccl_cpp_utils.cpp
+
+        native_device_api/l0/base.cpp
+        native_device_api/l0/device.cpp
+        native_device_api/l0/context.cpp
+        native_device_api/l0/event_pool.cpp
+        native_device_api/l0/subdevice.cpp
+        native_device_api/l0/driver.cpp
+        native_device_api/l0/export.cpp
+        native_device_api/l0/platform.cpp
+        native_device_api/l0/utils.cpp
+        native_device_api/l0/primitives.cpp
+        native_device_api/interop_utils.cpp
+
+        sched/entry/ze/allreduce/ze_a2a_allreduce_entry.cpp
+        sched/entry/ze/allreduce/ze_onesided_allreduce_entry.cpp
+        sched/entry/ze/allreduce/ze_ring_allreduce_entry.cpp
+        sched/entry/ze/ze_a2a_allgatherv_entry.cpp
+        sched/entry/ze/ze_a2a_gatherv_entry.cpp
+        sched/entry/ze/ze_a2a_reduce_scatter_entry.cpp
+        sched/entry/ze/ze_base_entry.cpp
+        sched/entry/ze/ze_barrier_entry.cpp
+        sched/entry/ze/ze_cache.cpp
+        sched/entry/ze/ze_call.cpp
+        sched/entry/ze/ze_copy_entry.cpp
+        sched/entry/ze/ze_handle_exchange_entry.cpp
+        sched/entry/ze/ze_event_signal_entry.cpp
+        sched/entry/ze/ze_event_wait_entry.cpp
+        sched/entry/ze/ze_onesided_reduce_entry.cpp
+        sched/entry/ze/ze_primitives.cpp
+        sched/entry/ze/ze_reduce_local_entry.cpp
+
+        sched/ze/ze_event_manager.cpp
+        sched/ze/ze_handle_manager.cpp
+        sched/ze/ze_ipc_event_pool_manager.cpp
+        sched/ze/ze_list_manager.cpp
+
+        coll/coll_util.cpp
         )
-endif(CCL_ENABLE_SYCL AND MULTI_GPU_SUPPORT)
+endif(CCL_ENABLE_SYCL AND CCL_ENABLE_ZE)
 
 set(CCL_SRC
-    ccl_cpp_communicator.cpp
-    ccl_cpp_environment.cpp
-    ccl_api_functions.cpp
-    ccl_app_api_coll_attr.cpp
-    ccl_app_api_comm_attr.cpp
-    ccl_app_api_comm_split_attr.cpp
-    ccl_app_api_datatype_attr.cpp
-    ccl_app_api_kvs_attr.cpp
-    ccl_app_api_event.cpp
-    ccl_app_api_init_attr.cpp
-    ccl_cpp_kvs.cpp
-    ccl_cpp_device.cpp
-    ccl_cpp_stream.cpp
-    ccl_cpp_context.cpp
-    ccl_cpp_utils.cpp
-    ccl_empty_attr.cpp
-    ccl_empty_coll_attr.cpp
-    ccl_empty_comm_attr.cpp
-    ccl_empty_init_attr.cpp
-    ccl_empty_comm_split_attr.cpp
-    ccl_empty_kvs_attr.cpp
-    ccl_empty_stream.cpp
-    native_device_api/sycl_l0/export.cpp
-    native_device_api/empty/export.cpp
-    atl/atl_wrapper.cpp
+
+    atl/atl_base_comm.cpp
+    atl/atl_def.cpp
+    atl/ofi/atl_ofi_comm.cpp
+    atl/mpi/atl_mpi_comm.cpp
+    atl/mpi/atl_mpi_global_data.cpp
     atl/mpi/atl_mpi.cpp
     atl/ofi/atl_ofi.cpp
     atl/ofi/atl_ofi_helper.cpp
@@ -134,16 +85,15 @@ set(CCL_SRC
     atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.cpp
     atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs_keeper.cpp
     atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp
-    atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.cpp
     atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.cpp
     atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp
-    atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.cpp
     atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.cpp
     atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.cpp
     atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.cpp
     atl/util/pm/pmi_rt/pmi_simple.cpp
     atl/util/pm/pmi_rt/pmi/simple_pmi.c
     atl/util/pm/pmi_rt/pmi/simple_pmiutil.c
+
     coll/coll_common_attributes.cpp
     coll/ccl_allgather_op_attr.cpp
     coll/ccl_allreduce_op_attr.cpp
@@ -179,59 +129,93 @@ set(CCL_SRC
     coll/selection/selector_reduce.cpp
     coll/selection/selector_reduce_scatter.cpp
     coll/selection/selector_sparse_allreduce.cpp
+
+    common/comm/atl_tag.cpp
+    common/comm/comm.cpp
+    common/comm/compiler_comm_interface_dispatcher.cpp
+    common/context/context.cpp
+    common/datatype/datatype.cpp
+    common/device/device.cpp
+    common/env/env.cpp
+    common/event/ccl_event.cpp
+    common/event/impls/host_event.cpp
+    common/event/impls/native_event.cpp
+    common/framework/framework.cpp
+    common/global/global.cpp
+    common/log/log.cpp
+    common/request/request.cpp
+    common/stream/stream.cpp
+    common/utils/memcpy.cpp
+    common/utils/spinlock.cpp
+    common/utils/version.cpp
+    common/utils/yield.cpp
+
     comp/bf16/bf16.cpp
     comp/bf16/bf16_intrisics.cpp
     comp/comp.cpp
     comp/fp16/fp16.cpp
     comp/fp16/fp16_intrisics.cpp
+
+    exec/exec.cpp
+    exec/thread/base_thread.cpp
+    exec/thread/listener.cpp
+    exec/thread/service_worker.cpp
+    exec/thread/worker.cpp
+
+    fusion/fusion.cpp
+
     hwloc/hwloc_wrapper.cpp
-    sched/sched.cpp
-    sched/buffer_cache.cpp
-    sched/extra_sched.cpp
-    sched/master_sched.cpp
-    sched/sched_base.cpp
-    sched/sched_timer.cpp
+
+    native_device_api/sycl_l0/export.cpp
+    native_device_api/empty/export.cpp
+
+    parallelizer/parallelizer.cpp
+
+    sched/buffer/buffer_cache.cpp
+    sched/buffer/buffer_manager.cpp
     sched/cache/cache.cpp
     sched/cache/key.cpp
-    sched/queue/flow_control.cpp
-    sched/queue/strict_queue.cpp
-    sched/queue/queue.cpp
     sched/entry/coll/coll_entry.cpp
     sched/entry/coll/coll_entry_helper.cpp
     sched/entry/copy/copy_entry.cpp
     sched/entry/copy/copy_helper.cpp
     sched/entry/entry.cpp
     sched/entry/factory/chunked_entry_factory.cpp
-    exec/exec.cpp
-    exec/thread/base_thread.cpp
-    exec/thread/listener.cpp
-    exec/thread/service_worker.cpp
-    exec/thread/worker.cpp
-    fusion/fusion.cpp
-    parallelizer/parallelizer.cpp
-    unordered_coll/unordered_coll.cpp
-
-    common/comm/atl_tag.cpp
-    common/comm/comm.cpp
-    common/comm/compiler_comm_interface_dispatcher.cpp
-    common/comm/host_communicator/host_communicator.cpp
+    sched/entry/recv_copy_entry.cpp
+    sched/entry/reduce_local_entry.cpp
+    sched/queue/flow_control.cpp
+    sched/queue/queue.cpp
+    sched/queue/strict_queue.cpp
+    sched/extra_sched.cpp
+    sched/master_sched.cpp
+    sched/sched.cpp
+    sched/sched_base.cpp
+    sched/sched_timer.cpp
 
-    common/context/context.cpp
-    common/datatype/datatype.cpp
-    common/device/device.cpp
-    common/event/ccl_event.cpp
-    common/stream/stream.cpp
+    unordered_coll/unordered_coll.cpp
 
-    common/env/env.cpp
-    common/global/global.cpp
-    common/log/log.cpp
-    common/event/impls/host_event.cpp
-    common/event/impls/native_event.cpp
-    common/framework/framework.cpp
-    common/request/request.cpp
-    common/utils/spinlock.cpp
-    common/utils/version.cpp
-    common/utils/yield.cpp
+    ccl_api_functions.cpp
+    ccl_app_api_coll_attr.cpp
+    ccl_app_api_comm_attr.cpp
+    ccl_app_api_comm_split_attr.cpp
+    ccl_app_api_datatype_attr.cpp
+    ccl_app_api_event.cpp
+    ccl_app_api_init_attr.cpp
+    ccl_app_api_kvs_attr.cpp
+    ccl_cpp_communicator.cpp
+    ccl_cpp_context.cpp
+    ccl_cpp_device.cpp
+    ccl_cpp_environment.cpp
+    ccl_cpp_kvs.cpp
+    ccl_cpp_stream.cpp
+    ccl_cpp_utils.cpp
+    ccl_empty_attr.cpp
+    ccl_empty_coll_attr.cpp
+    ccl_empty_comm_attr.cpp
+    ccl_empty_comm_split_attr.cpp
+    ccl_empty_init_attr.cpp
+    ccl_empty_kvs_attr.cpp
+    ccl_empty_stream.cpp
 
     ${EXTENSIONS_SRC})
 
@@ -262,6 +246,8 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR ${CMAKE_CXX_COMPILER_ID} STREQUAL
         set(SRC_C_FLAGS "${SRC_C_FLAGS} -prof-gen=srcpos -prof-src-root-cwd")
         set(SRC_CXX_FLAGS "${SRC_CXX_FLAGS} -prof-gen=srcpos -prof-src-root-cwd")
     endif()
+    # Suppress diagnostic 1875: 'offsetof applied to non-POD (Plain Old Data) types is nonstandard'
+    set(SRC_CXX_FLAGS "${SRC_CXX_FLAGS} -diag-disable=1875")
 endif()
 
 list(APPEND SRC_INCLUDE_DIRS
@@ -308,11 +294,6 @@ endif()
 add_library(ccl SHARED $<TARGET_OBJECTS:ccl-objects>)
 target_include_directories(ccl PUBLIC ${SRC_INCLUDE_DIRS})
 
-# link with release_mt libmpi.so for oneAPI Base toolkit
-# libccl.so -> cpu_icc/cpu_gpu_dpcpp -> lib -> latest -> ccl -> mpi -> ...
-set(ONEAPI_IMPI_RPATH "'$ORIGIN'/../../../../mpi/latest/lib/release_mt/")
-set_target_properties(ccl PROPERTIES LINK_FLAGS "-Wl,-rpath,${ONEAPI_IMPI_RPATH}")
-
 target_link_libraries(ccl PUBLIC ${SRC_LINK_LIBS})
 
 if (NOT LIB_SO_VERSION AND NOT LIB_MAJOR_VERSION)
@@ -330,7 +311,6 @@ message(STATUS "SRC LINK_LIBS: ${SRC_LINK_LIBS}")
 
 install(TARGETS ccl LIBRARY DESTINATION ${CCL_INSTALL_LIB})
 install(FILES
-    "${PROJECT_SOURCE_DIR}/cmake/FindComputeCpp.cmake"
     "${PROJECT_SOURCE_DIR}/cmake/FindIntelSYCL.cmake"
     "${PROJECT_SOURCE_DIR}/cmake/FindIntelSYCL_level_zero.cmake"
     "${PROJECT_SOURCE_DIR}/cmake/Findlevel_zero.cmake"
diff --git a/src/atl/atl.h b/src/atl/atl.h
deleted file mode 100644
index c515fa483..000000000
--- a/src/atl/atl.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include <stddef.h>
-#include <stdint.h>
-#include <memory>
-
-#include "atl_def.h"
-#include "common/log/log.hpp"
-#include "util/pm/pm_rt.h"
-
-#ifdef __cplusplus
-class iatl {
-public:
-    virtual ~iatl() = default;
-
-    virtual atl_status_t atl_init(int* argc,
-                                  char*** argv,
-                                  atl_attr_t* attr,
-                                  const char* main_addr,
-                                  std::unique_ptr<ipmi>& pmi) = 0;
-
-    virtual atl_status_t atl_finalize() = 0;
-
-    virtual atl_status_t atl_update(std::unique_ptr<ipmi>& pmi) = 0;
-
-    virtual atl_ep_t** atl_get_eps() = 0;
-
-    virtual atl_proc_coord_t* atl_get_proc_coord() = 0;
-
-    virtual atl_status_t atl_mr_reg(const void* buf, size_t len, atl_mr_t** mr) = 0;
-
-    virtual atl_status_t atl_mr_dereg(atl_mr_t* mr) = 0;
-
-    virtual atl_status_t atl_ep_send(atl_ep_t* ep,
-                                     const void* buf,
-                                     size_t len,
-                                     int dst_proc_idx,
-                                     uint64_t tag,
-                                     atl_req_t* req) = 0;
-
-    virtual atl_status_t atl_ep_recv(atl_ep_t* ep,
-                                     void* buf,
-                                     size_t len,
-                                     int src_proc_idx,
-                                     uint64_t tag,
-                                     atl_req_t* req) = 0;
-
-    virtual atl_status_t atl_ep_probe(atl_ep_t* ep,
-                                      int src_proc_idx,
-                                      uint64_t tag,
-                                      int* found,
-                                      size_t* recv_len) = 0;
-
-    virtual atl_status_t atl_ep_allgatherv(atl_ep_t* ep,
-                                           const void* send_buf,
-                                           size_t send_len,
-                                           void* recv_buf,
-                                           const int* recv_lens,
-                                           const int* offsets,
-                                           atl_req_t* req) = 0;
-
-    virtual atl_status_t atl_ep_allreduce(atl_ep_t* ep,
-                                          const void* send_buf,
-                                          void* recv_buf,
-                                          size_t len,
-                                          atl_datatype_t dtype,
-                                          atl_reduction_t op,
-                                          atl_req_t* req) = 0;
-
-    virtual atl_status_t atl_ep_alltoall(atl_ep_t* ep,
-                                         const void* send_buf,
-                                         void* recv_buf,
-                                         int len,
-                                         atl_req_t* req) = 0;
-
-    virtual atl_status_t atl_ep_alltoallv(atl_ep_t* ep,
-                                          const void* send_buf,
-                                          const int* send_lens,
-                                          const int* send_offsets,
-                                          void* recv_buf,
-                                          const int* recv_lens,
-                                          const int* recv_offsets,
-                                          atl_req_t* req) = 0;
-
-    virtual atl_status_t atl_ep_barrier(atl_ep_t* ep, atl_req_t* req) = 0;
-
-    virtual atl_status_t atl_ep_bcast(atl_ep_t* ep,
-                                      void* buf,
-                                      size_t len,
-                                      int root,
-                                      atl_req_t* req) = 0;
-
-    virtual atl_status_t atl_ep_reduce(atl_ep_t* ep,
-                                       const void* send_buf,
-                                       void* recv_buf,
-                                       size_t len,
-                                       int root,
-                                       atl_datatype_t dtype,
-                                       atl_reduction_t op,
-                                       atl_req_t* req) = 0;
-
-    virtual atl_status_t atl_ep_reduce_scatter(atl_ep_t* ep,
-                                               const void* send_buf,
-                                               void* recv_buf,
-                                               size_t recv_len,
-                                               atl_datatype_t dtype,
-                                               atl_reduction_t op,
-                                               atl_req_t* req) = 0;
-
-    virtual atl_status_t atl_ep_read(atl_ep_t* ep,
-                                     void* buf,
-                                     size_t len,
-                                     atl_mr_t* mr,
-                                     uint64_t addr,
-                                     uintptr_t remote_key,
-                                     int dst_proc_idx,
-                                     atl_req_t* req) = 0;
-
-    virtual atl_status_t atl_ep_write(atl_ep_t* ep,
-                                      const void* buf,
-                                      size_t len,
-                                      atl_mr_t* mr,
-                                      uint64_t addr,
-                                      uintptr_t remote_key,
-                                      int dst_proc_idx,
-                                      atl_req_t* req) = 0;
-
-    virtual atl_status_t atl_ep_wait(atl_ep_t* ep, atl_req_t* req) = 0;
-
-    virtual atl_status_t atl_ep_wait_all(atl_ep_t* ep, atl_req_t* req, size_t count) = 0;
-
-    virtual atl_status_t atl_ep_cancel(atl_ep_t* ep, atl_req_t* req) = 0;
-
-    virtual atl_status_t atl_ep_poll(atl_ep_t* ep) = 0;
-
-    virtual atl_status_t atl_ep_check(atl_ep_t* ep, int* is_completed, atl_req_t* req) = 0;
-    virtual bool is_inited() = 0;
-};
-#endif
diff --git a/src/atl/atl_base_comm.cpp b/src/atl/atl_base_comm.cpp
new file mode 100644
index 000000000..3388da829
--- /dev/null
+++ b/src/atl/atl_base_comm.cpp
@@ -0,0 +1,157 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#ifdef CCL_ENABLE_MPI
+#include "atl/mpi/atl_mpi.hpp"
+#include "atl/mpi/atl_mpi_comm.hpp"
+#endif // CCL_ENABLE_MPI
+
+#include "atl/atl_base_comm.hpp"
+#include "atl/ofi/atl_ofi_comm.hpp"
+#include "atl/ofi/atl_ofi.hpp"
+#include "atl/util/pm/pm_rt.h"
+#include "exec/exec.hpp"
+
+atl_attr_t atl_base_comm::attr = { // defaults; overwritten by atl_comm_manager::set_internal_env()
+    /* in */
+    {
+        0, /* enable_shm */
+        0, /* enable_rma */
+        0, /* enable_hmem */
+        0, /* enable_sync_coll */
+        0, /* enable_extra_ep */
+        1, /* ep_count */
+        ATL_MNIC_NONE, /* mnic_type */
+        "", /* mnic_name */
+        1, /* mnic_count */
+        ATL_MNIC_OFFSET_NONE /* mnic_offset */
+    },
+
+    /* out */
+    {
+        0, /* enable_shm */
+        0, /* enable_rma */
+        0, /* enable_hmem */
+        ATL_MNIC_NONE, /* mnic_type */
+        0, /* mnic_count */
+        0, /* tag_bits */
+        0, /* max_tag */
+        0, /* max_order_waw_size */
+    }
+};
+
+ccl_executor* atl_base_comm::executor = nullptr; // assigned by atl_comm_manager::set_exec()
+
+void atl_base_comm::init_tag() { // (re)creates `tag` from attr.out.tag_bits and attr.out.max_tag
+    tag = std::unique_ptr<ccl_atl_tag>(new ccl_atl_tag(attr.out.tag_bits, attr.out.max_tag));
+    if (rank == 0) { // only rank 0 logs the tag description
+        LOG_DEBUG("atl tag: ", tag->to_string());
+    }
+}
+
+void atl_base_comm::print_atl_attrs() { // formats attr.in / attr.out and emits them via LOG_INFO
+    std::stringstream ss;
+
+    ss << "atl attrs:\n{\n"
+       << "  in: { "
+       << "shm: " << attr.in.enable_shm << ", hmem: " << attr.in.enable_hmem
+       << ", sync_coll: " << attr.in.enable_sync_coll << ", extra_ep: " << attr.in.enable_extra_ep
+       << ", ep_count: " << attr.in.ep_count << ", mnic_type: " << to_string(attr.in.mnic_type)
+       << ", mnic_count: " << attr.in.mnic_count
+       << ", mnic_offset: " << to_string(attr.in.mnic_offset) << " }\n"
+       << "  out: { "
+       << "shm: " << attr.out.enable_shm << ", hmem: " << attr.out.enable_hmem
+       << ", mnic_type: " << to_string(attr.out.mnic_type)
+       << ", mnic_count: " << attr.out.mnic_count << ", tag_bits: " << attr.out.tag_bits
+       << ", max_tag: " << attr.out.max_tag << " }\n}";
+
+    LOG_INFO(ss.str()); // NOTE(review): enable_rma, mnic_name, max_order_waw_size are not dumped
+}
+
+void atl_base_comm::executor_update() {
+    if (executor->are_workers_started())
+        return; // workers are already running, nothing to start
+    if (rank < coord.local_count)
+        LOG_INFO(
+            "start workers for local process [", coord.local_idx, ":", coord.local_count, "]");
+    executor->start_workers(coord.local_idx, coord.local_count);
+}
+
+std::shared_ptr<atl_base_comm> atl_comm_manager::create_comm() {
+    // stays empty (null) when the configured transport has no matching case below
+    std::shared_ptr<atl_base_comm> comm;
+
+    switch (ccl::global_data::env().atl_transport) {
+        case ccl_atl_ofi: comm.reset(new atl_ofi_comm()); break;
+#ifdef CCL_ENABLE_MPI
+        case ccl_atl_mpi: comm.reset(new atl_mpi_comm()); break;
+#endif // CCL_ENABLE_MPI
+        default: LOG_ERROR("Unsupported yet"); break;
+    }
+
+    return comm;
+}
+
+std::shared_ptr<atl_base_comm> atl_comm_manager::create_comm(std::shared_ptr<ikvs_wrapper> k) {
+    // stays empty (null) when the configured transport has no matching case below
+    std::shared_ptr<atl_base_comm> comm;
+
+    switch (ccl::global_data::env().atl_transport) {
+        case ccl_atl_ofi: comm.reset(new atl_ofi_comm(k)); break;
+#ifdef CCL_ENABLE_MPI
+        case ccl_atl_mpi: comm.reset(new atl_mpi_comm(k)); break;
+#endif // CCL_ENABLE_MPI
+        default: LOG_ERROR("Unsupported yet"); break;
+    }
+
+    return comm;
+}
+
+std::shared_ptr<atl_base_comm> atl_comm_manager::create_comm(int total_rank_count,
+                                                             const std::vector<int>& ranks,
+                                                             std::shared_ptr<ikvs_wrapper> k) {
+    // stays empty (null) when the configured transport has no matching case below
+    std::shared_ptr<atl_base_comm> comm;
+
+    switch (ccl::global_data::env().atl_transport) {
+        case ccl_atl_ofi:
+            comm.reset(new atl_ofi_comm(total_rank_count, ranks, k));
+            break;
+#ifdef CCL_ENABLE_MPI
+        case ccl_atl_mpi:
+            comm.reset(new atl_mpi_comm(total_rank_count, ranks, k));
+            break;
+#endif // CCL_ENABLE_MPI
+        default: LOG_ERROR("Unsupported yet"); break;
+    }
+
+    return comm;
+}
+
+void atl_comm_manager::set_internal_env(const atl_attr_t& attr) {
+    atl_base_comm::attr = attr; // keep a copy in the static attribute set
+    // hand the same attributes to the transport selected by the environment, if any
+    switch (ccl::global_data::env().atl_transport) {
+        case ccl_atl_ofi: atl_ofi::atl_set_env(attr); break;
+#ifdef CCL_ENABLE_MPI
+        case ccl_atl_mpi: atl_mpi::set_env(attr); break;
+#endif // CCL_ENABLE_MPI
+        default: break;
+    }
+}
+
+void atl_comm_manager::set_exec(ccl_executor* exec) {
+    atl_base_comm::executor = exec; // static pointer; dereferenced by executor_update()
+}
diff --git a/src/atl/atl_base_comm.hpp b/src/atl/atl_base_comm.hpp
new file mode 100644
index 000000000..189dbcf65
--- /dev/null
+++ b/src/atl/atl_base_comm.hpp
@@ -0,0 +1,209 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include <memory>
+#include <mutex>
+#include <list>
+#include <vector>
+
+#include "atl/atl_def.h"
+#include "common/comm/atl_tag.hpp"
+#include "util/pm/pmi_resizable_rt/pmi_resizable/kvs/ikvs_wrapper.h"
+#include "util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.h"
+#include "util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.h"
+
+class ccl_executor;
+
+// Abstract ATL communicator interface; atl_comm_manager::create_comm() hands out instances.
+class atl_base_comm {
+protected:
+    atl_base_comm() = default;
+
+public:
+    virtual ~atl_base_comm() = default;
+    // Everything below is pure virtual; the concrete transport communicator provides the behavior.
+    virtual atl_status_t main_addr_reserve(char* main_addr) = 0;
+
+    virtual atl_status_t finalize() = 0;
+
+    virtual atl_status_t update() = 0;
+
+    virtual atl_status_t wait_notification() = 0;
+
+    virtual atl_status_t set_resize_function(atl_resize_fn_t fn) = 0;
+    // Memory-region registration pair; mr_reg presumably hands the region back via *mr - confirm.
+    virtual atl_status_t mr_reg(const void* buf, size_t len, atl_mr_t** mr) = 0;
+
+    virtual atl_status_t mr_dereg(atl_mr_t* mr) = 0;
+    // Point-to-point calls; req is the request handle that wait()/check()/cancel() accept.
+    virtual atl_status_t send(size_t ep_idx,
+                              const void* buf,
+                              size_t len,
+                              int dst_proc_idx,
+                              uint64_t tag,
+                              atl_req_t* req) = 0;
+
+    virtual atl_status_t recv(size_t ep_idx,
+                              void* buf,
+                              size_t len,
+                              int src_proc_idx,
+                              uint64_t tag,
+                              atl_req_t* req) = 0;
+
+    virtual atl_status_t probe(size_t ep_idx,
+                               int src_proc_idx,
+                               uint64_t tag,
+                               int* found,
+                               size_t* recv_len) = 0;
+    // Collectives; like the calls above they take an endpoint index and a request handle.
+    virtual atl_status_t allgatherv(size_t ep_idx,
+                                    const void* send_buf,
+                                    size_t send_len,
+                                    void* recv_buf,
+                                    const int* recv_lens,
+                                    const int* offsets,
+                                    atl_req_t* req) = 0;
+
+    virtual atl_status_t allreduce(size_t ep_idx,
+                                   const void* send_buf,
+                                   void* recv_buf,
+                                   size_t len,
+                                   atl_datatype_t dtype,
+                                   atl_reduction_t op,
+                                   atl_req_t* req) = 0;
+
+    virtual atl_status_t alltoall(size_t ep_idx,
+                                  const void* send_buf,
+                                  void* recv_buf,
+                                  int len,
+                                  atl_req_t* req) = 0;
+
+    virtual atl_status_t alltoallv(size_t ep_idx,
+                                   const void* send_buf,
+                                   const int* send_lens,
+                                   const int* send_offsets,
+                                   void* recv_buf,
+                                   const int* recv_lens,
+                                   const int* recv_offsets,
+                                   atl_req_t* req) = 0;
+
+    virtual atl_status_t barrier(size_t ep_idx, atl_req_t* req) = 0;
+
+    virtual atl_status_t bcast(size_t ep_idx, void* buf, size_t len, int root, atl_req_t* req) = 0;
+
+    virtual atl_status_t reduce(size_t ep_idx,
+                                const void* send_buf,
+                                void* recv_buf,
+                                size_t len,
+                                int root,
+                                atl_datatype_t dtype,
+                                atl_reduction_t op,
+                                atl_req_t* req) = 0;
+
+    virtual atl_status_t reduce_scatter(size_t ep_idx,
+                                        const void* send_buf,
+                                        void* recv_buf,
+                                        size_t recv_len,
+                                        atl_datatype_t dtype,
+                                        atl_reduction_t op,
+                                        atl_req_t* req) = 0;
+    // read/write target (addr, remote_key) on dst_proc_idx; presumably one-sided RMA - confirm.
+    virtual atl_status_t read(size_t ep_idx,
+                              void* buf,
+                              size_t len,
+                              atl_mr_t* mr,
+                              uint64_t addr,
+                              uintptr_t remote_key,
+                              int dst_proc_idx,
+                              atl_req_t* req) = 0;
+
+    virtual atl_status_t write(size_t ep_idx,
+                               const void* buf,
+                               size_t len,
+                               atl_mr_t* mr,
+                               uint64_t addr,
+                               uintptr_t remote_key,
+                               int dst_proc_idx,
+                               atl_req_t* req) = 0;
+    // Request completion: wait/wait_all/cancel/check take requests, poll takes only the endpoint.
+    virtual atl_status_t wait(size_t ep_idx, atl_req_t* req) = 0;
+
+    virtual atl_status_t wait_all(size_t ep_idx, atl_req_t* req, size_t count) = 0;
+
+    virtual atl_status_t cancel(size_t ep_idx, atl_req_t* req) = 0;
+
+    virtual atl_status_t poll(size_t ep_idx) = 0;
+
+    virtual atl_status_t check(size_t ep_idx, atl_req_t* req) = 0;
+    // Topology/identity queries; matching protected fields (rank, size, ...) are declared below.
+    virtual size_t get_threads_per_process() = 0;
+    virtual size_t get_ranks_per_process() = 0;
+
+    virtual int get_rank() = 0;
+
+    virtual int get_size() = 0;
+
+    virtual int get_r2r_color() = 0;
+
+    virtual int get_host_color() = 0;
+
+    virtual std::shared_ptr<atl_base_comm> comm_split(int color) = 0;
+
+    virtual std::vector<int> get_rank2rank_map() = 0;
+
+    /*
+     * TODO: temporary interface.
+     * A correct unique id still needs to be defined.
+     */
+    virtual size_t get_id() = 0;
+    std::unique_ptr<ccl_atl_tag> tag; // presumably created by init_tag() - TODO confirm
+    static atl_attr_t attr; // assigned by atl_comm_manager::set_internal_env()
+
+protected:
+    void init_tag();
+    void print_atl_attrs();
+    void executor_update();
+    // atl_comm_manager::set_exec() writes the protected static `executor`, hence the friend.
+    friend class atl_comm_manager;
+    static ccl_executor* executor;
+
+    int rank;
+    int size;
+
+    size_t threads_per_process;
+    size_t ranks_per_process;
+
+    std::vector<int> rank2rank_map;
+    atl_proc_coord_t coord;
+    int parent_rank;
+    int parent_size;
+
+    std::shared_ptr<ipmi> pmi;
+};
+
+// Static factory and configuration entry points for atl_base_comm (declared a friend there).
+class atl_comm_manager {
+public:
+    // The k / ranks overloads build atl_ofi_comm or atl_mpi_comm per the env's atl_transport.
+    static std::shared_ptr<atl_base_comm> create_comm();
+    static std::shared_ptr<atl_base_comm> create_comm(std::shared_ptr<ikvs_wrapper> k);
+    static std::shared_ptr<atl_base_comm> create_comm(int total_rank_count,
+                                                      const std::vector<int>& ranks,
+                                                      std::shared_ptr<ikvs_wrapper> k);
+    static void set_internal_env(const atl_attr_t& attr); // stores attr in atl_base_comm::attr
+    static void set_exec(ccl_executor* exec); // sets atl_base_comm::executor
+};
diff --git a/src/atl/atl_def.cpp b/src/atl/atl_def.cpp
new file mode 100644
index 000000000..16335b70d
--- /dev/null
+++ b/src/atl/atl_def.cpp
@@ -0,0 +1,45 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "atl/atl_def.h"
+
+// Printable names for atl_mnic_t / atl_mnic_offset_t values; read by the to_string() overloads.
+std::map<atl_mnic_t, std::string> mnic_type_names = { { ATL_MNIC_NONE, "none" },
+                                                      { ATL_MNIC_LOCAL, "local" },
+                                                      { ATL_MNIC_GLOBAL, "global" } };
+
+std::map<atl_mnic_offset_t, std::string> mnic_offset_names = {
+    { ATL_MNIC_OFFSET_NONE, "none" }, { ATL_MNIC_OFFSET_LOCAL_PROC_IDX, "local_proc_idx" }
+};
+
+// Name of an atl_mnic_t value as listed in mnic_type_names; "unknown" if it is not listed.
+std::string to_string(atl_mnic_t type) {
+    const auto entry = mnic_type_names.find(type);
+    if (entry == mnic_type_names.end()) {
+        // no registered name for this value
+        return "unknown";
+    }
+    return entry->second;
+}
+
+// Name of an atl_mnic_offset_t value from mnic_offset_names; "unknown" if it is not listed.
+std::string to_string(atl_mnic_offset_t offset) {
+    const auto entry = mnic_offset_names.find(offset);
+    if (entry == mnic_offset_names.end()) {
+        // no registered name for this value
+        return "unknown";
+    }
+    return entry->second;
+}
diff --git a/src/atl/atl_def.h b/src/atl/atl_def.h
index 3e02a0aa8..557c164f2 100644
--- a/src/atl/atl_def.h
+++ b/src/atl/atl_def.h
@@ -15,10 +15,16 @@
 */
 #pragma once
 
+#include <cstring>
+#include <map>
+#include <memory>
+#include <vector>
 #include <stddef.h>
 #include <stdint.h>
 #include <string>
 
+#include "common/log/log.hpp"
+
 #ifndef container_of
 #define container_of(ptr, type, field) ((type*)((char*)ptr - offsetof(type, field)))
 #endif
@@ -45,11 +51,36 @@
  * This is invoked by the ATL framework when the transport library is loaded.
  */
 
+#define ATL_CHECK_STATUS(expr, str) \
+    do { /* if expr is not ATL_STATUS_SUCCESS: log str, return failure from the caller */ \
+        if ((expr) != ATL_STATUS_SUCCESS) { /* parenthesized: expr may hold low-precedence ops */ \
+            LOG_ERROR(str); \
+            return ATL_STATUS_FAILURE; \
+        } \
+    } while (0)
+
+#define KVS_2_ATL_CHECK_STATUS(expr, str) \
+    do { /* if expr is not KVS_STATUS_SUCCESS: log str, return ATL_STATUS_FAILURE from caller */ \
+        if ((expr) != KVS_STATUS_SUCCESS) { /* parenthesized: expr may hold low-precedence ops */ \
+            LOG_ERROR(str); \
+            return ATL_STATUS_FAILURE; \
+        } \
+    } while (0)
+
+#define ATL_SET_STR(dst, size, ...) \
+    do { /* fail on truncation (ret >= size) or error (negative ret wraps to a huge size_t) */ \
+        if ((size_t)snprintf(dst, size, __VA_ARGS__) >= (size_t)(size)) { \
+            printf("line too long (must be shorter %zu)\n", (size_t)(size)); \
+            printf(__VA_ARGS__); \
+            return ATL_STATUS_FAILURE; \
+        } \
+    } while (0)
+
 #define ATL_CALL(func, err_action) \
     do { \
         atl_status_t status = func; \
         if (status != FI_SUCCESS) { \
-            CCL_THROW(#func "\n fails with status: ", status); \
+            LOG_ERROR(#func "\n fails with status: ", status); \
             err_action; \
         } \
     } while (0)
@@ -107,6 +138,13 @@ typedef enum {
 } atl_reduction_t;
 
 typedef enum { ATL_MNIC_NONE, ATL_MNIC_LOCAL, ATL_MNIC_GLOBAL } atl_mnic_t;
+typedef enum { ATL_MNIC_OFFSET_NONE, ATL_MNIC_OFFSET_LOCAL_PROC_IDX } atl_mnic_offset_t;
+// Enum-to-name tables (defined in atl_def.cpp).
+extern std::map<atl_mnic_t, std::string> mnic_type_names;
+extern std::map<atl_mnic_offset_t, std::string> mnic_offset_names;
+// Return the table entry for the value, or "unknown" if the table has none.
+std::string to_string(atl_mnic_t type);
+std::string to_string(atl_mnic_offset_t offset);
 
 typedef struct {
     struct {
@@ -119,6 +157,7 @@ typedef struct {
         atl_mnic_t mnic_type;
         std::string mnic_name;
         size_t mnic_count;
+        atl_mnic_offset_t mnic_offset;
     } in;
     struct {
         int enable_shm;
@@ -147,17 +186,18 @@ typedef struct {
     size_t hostname_hash;
 } atl_proc_coord_t;
 
-typedef struct {
-    uint64_t tag;
-    size_t remote_proc_idx;
+typedef struct atl_req {
+    int is_completed;
     void* internal[ATL_REQ_SIZE];
-} atl_req_t __attribute__((aligned(ATL_CACHELINE_LEN)));
+    atl_req() : is_completed(0) {
+        memset(internal, 0, ATL_REQ_SIZE * sizeof(void*));
+    }
+} atl_req_t;
 
 struct atl_ctx {
     atl_proc_coord_t coord;
 
     size_t ep_count;
-    atl_ep_t** eps;
 };
 
 /*
diff --git a/src/atl/atl_wrapper.cpp b/src/atl/atl_wrapper.cpp
deleted file mode 100644
index 00eba6f9d..000000000
--- a/src/atl/atl_wrapper.cpp
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.h"
-#include "atl/util/pm/pmi_rt/pmi_simple.h"
-#include "atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.h"
-#include "atl/util/pm/pmi_resizable_rt/pmi_resizable.h"
-#include "atl/ofi/atl_ofi.hpp"
-#ifdef CCL_ENABLE_MPI
-#include "atl/mpi/atl_mpi.hpp"
-#endif // CCL_ENABLE_MPI
-#include "atl_wrapper.h"
-#include "common/global/global.hpp"
-#include "exec/exec.hpp"
-#include "util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.h"
-
-static std::list<std::shared_ptr<iatl>> transports{};
-static ccl_executor* executor;
-
-atl_attr_t atl_wrapper::attr = {
-    /* in */
-    {
-        0, /* enable_shm */
-        0, /* enable_rma */
-        0, /* enable_hmem */
-        0, /* enable_sync_coll */
-        0, /* enable_extra_ep */
-        1, /* ep_count */
-        ATL_MNIC_NONE, /* mnic_type */
-        "", /* mnic_name */
-        1 /* mnic_count */
-    },
-
-    /* out */
-    {}
-};
-
-void atl_wrapper::set_internal_env(const atl_attr_t& attr) {
-    auto transport_type = ccl::global_data::env().atl_transport;
-
-    if (transport_type == ccl_atl_ofi)
-        atl_ofi::atl_set_env(attr);
-#ifdef CCL_ENABLE_MPI
-    else if (transport_type == ccl_atl_mpi)
-        atl_mpi::atl_set_env(attr);
-#endif // CCL_ENABLE_MPI
-}
-
-void atl_wrapper::set_exec(ccl_executor* exec) {
-    executor = exec;
-}
-
-atl_wrapper::atl_wrapper() {
-    auto transport_type = ccl::global_data::env().atl_transport;
-
-    char* pm_type_str;
-    switch (transport_type) {
-        case ccl_atl_ofi:
-            pm_type_str = getenv(PM_TYPE);
-            if (pm_type_str) {
-                if (strstr(pm_type_str, PM_RT_VAL_SIMPLE)) {
-                    pmi = std::unique_ptr<ipmi>(new pmi_simple());
-                }
-                else if (strstr(pm_type_str, PM_RT_VAL_RESIZABLE)) {
-                    std::shared_ptr<ikvs_wrapper> k(new internal_kvs());
-                    pmi = std::unique_ptr<ipmi>(new pmi_resizable(k));
-                }
-                else {
-                    LOG_ERROR("Unknown %s: %s\n", PM_TYPE, pm_type_str);
-                }
-            }
-            else {
-                pmi = std::unique_ptr<ipmi>(new pmi_simple());
-            }
-            transport = std::shared_ptr<iatl>(new atl_ofi());
-            break;
-#ifdef CCL_ENABLE_MPI
-        case ccl_atl_mpi: transport = std::shared_ptr<iatl>(new atl_mpi()); break;
-#endif // CCL_ENABLE_MPI
-        default: LOG_ERROR("Unsupported yet"); break;
-    }
-
-    init_transport();
-}
-
-atl_wrapper::atl_wrapper(std::shared_ptr<ikvs_wrapper> k) {
-    auto transport_type = ccl::global_data::env().atl_transport;
-
-    char* pm_type_str;
-    switch (transport_type) {
-        case ccl_atl_ofi:
-            pm_type_str = getenv(PM_TYPE);
-            if (pm_type_str) {
-                if (strstr(pm_type_str, PM_RT_VAL_SIMPLE)) {
-                    pmi = std::unique_ptr<ipmi>(new pmi_simple());
-                }
-                else if (strstr(pm_type_str, PM_RT_VAL_RESIZABLE)) {
-                    pmi = std::unique_ptr<ipmi>(new pmi_resizable(k));
-                }
-                else {
-                    LOG_ERROR("Unknown %s: %s\n", PM_TYPE, pm_type_str);
-                }
-            }
-            else {
-                pmi = std::unique_ptr<ipmi>(new pmi_simple());
-            }
-            transport = std::shared_ptr<iatl>(new atl_ofi());
-            break;
-#ifdef CCL_ENABLE_MPI
-        case ccl_atl_mpi: transport = std::shared_ptr<iatl>(new atl_mpi()); break;
-#endif // CCL_ENABLE_MPI
-        default: LOG_ERROR("Unsupported yet"); break;
-    }
-
-    init_transport();
-}
-
-atl_wrapper::atl_wrapper(int total_rank_count,
-                         const std::vector<int>& ranks,
-                         std::shared_ptr<ikvs_wrapper> k) {
-    auto transport_type = ccl::global_data::env().atl_transport;
-
-    switch (transport_type) {
-        case ccl_atl_ofi: {
-            size_t transorts_count = transports.size();
-            std::shared_ptr<internal_kvs> kvs;
-            if ((kvs = std::dynamic_pointer_cast<internal_kvs>(k)) != nullptr) {
-                pmi = std::unique_ptr<ipmi>(
-                    new pmi_resizable_simple_internal(total_rank_count, ranks, kvs));
-            }
-            else {
-                pmi = std::unique_ptr<ipmi>(new pmi_resizable_simple(total_rank_count, ranks, k));
-            }
-
-            if (pmi->get_local_thread_idx() == 0) {
-                transports.push_back(std::shared_ptr<iatl>(new atl_ofi()));
-            }
-            //TODO: Rework it on barrier
-            while (transorts_count == transports.size()) {
-                ccl_yield(ccl::global_data::env().yield_type);
-            }
-            static std::mutex memory_mutex;
-            {
-                std::lock_guard<std::mutex> lock(memory_mutex);
-                transport = transports.back();
-            }
-        } break;
-#ifdef CCL_ENABLE_MPI
-        case ccl_atl_mpi: transport = std::shared_ptr<iatl>(new atl_mpi()); break;
-#endif // CCL_ENABLE_MPI
-        default: LOG_ERROR("Unsupported yet"); break;
-    }
-
-    init_transport();
-}
-void atl_wrapper::init_transport() {
-    LOG_DEBUG("init ATL, requested ep_count ", attr.in.ep_count);
-    static std::mutex memory_mutex;
-    {
-        std::lock_guard<std::mutex> lock(memory_mutex);
-        if (!transport->is_inited()) {
-            CCL_THROW_IF_NOT(
-                transport->atl_init(nullptr, nullptr, &attr, nullptr, pmi) == ATL_STATUS_SUCCESS,
-                "failed to initialize ATL");
-        }
-    }
-    eps = transport->atl_get_eps();
-    tag = std::unique_ptr<ccl_atl_tag>(new ccl_atl_tag(attr.out.tag_bits, attr.out.max_tag));
-
-    if (pmi) {
-        threads_per_process = pmi->get_threads_per_process();
-        ranks_per_process = pmi->get_ranks_per_process();
-        rank = pmi->get_rank();
-        size = pmi->get_size();
-    }
-#ifdef CCL_ENABLE_MPI
-    else {
-        threads_per_process = 1;
-        ranks_per_process = 1;
-        rank = static_cast<atl_mpi*>(transport.get())->get_rank();
-        size = static_cast<atl_mpi*>(transport.get())->get_size();
-    }
-#endif // CCL_ENABLE_MPI
-
-    if (rank == 0) {
-        tag->print();
-        LOG_INFO("atl-in-attrs:");
-        LOG_INFO("  enable_shm: ", attr.in.enable_shm);
-        LOG_INFO("  enable_rma: ", attr.in.enable_rma);
-        LOG_INFO("  enable_hmem: ", attr.in.enable_hmem);
-        LOG_INFO("  enable_sync_coll: ", attr.in.enable_sync_coll);
-        LOG_INFO("  enable_extra_ep: ", attr.in.enable_extra_ep);
-        LOG_INFO("  ep_count: ", attr.in.ep_count);
-        LOG_INFO("  mnic_type: ", attr.in.mnic_type);
-        LOG_INFO("  mnic_count: ", attr.in.mnic_count);
-
-        LOG_INFO("atl-out-attrs:");
-        LOG_INFO("  enable_shm: ", attr.out.enable_shm);
-        LOG_INFO("  enable_rma: ", attr.out.enable_rma);
-        LOG_INFO("  enable_hmem: ", attr.out.enable_hmem);
-        LOG_INFO("  mnic_type: ", attr.out.mnic_type);
-        LOG_INFO("  mnic_count: ", attr.out.mnic_count);
-        LOG_INFO("  tag_bits: ", attr.out.tag_bits);
-        LOG_INFO("  max_tag: ", attr.out.max_tag);
-        LOG_INFO("  max_order_waw_size: ", attr.out.max_order_waw_size);
-    }
-
-    if ((!pmi) || (pmi && pmi->get_local_thread_idx() == 0)) {
-        if (!executor->are_workers_started()) {
-            atl_proc_coord_t* coord = atl_get_proc_coord();
-            if (rank < coord->local_count)
-                LOG_INFO("start workers for local process [",
-                         coord->local_idx,
-                         ":",
-                         coord->local_count,
-                         "]");
-            executor->start_workers(coord->local_idx, coord->local_count);
-        }
-    }
-}
-
-atl_wrapper::~atl_wrapper() {
-    static std::mutex memory_mutex;
-    std::lock_guard<std::mutex> lock(memory_mutex);
-    transports.remove(transport);
-    tag.reset();
-}
diff --git a/src/atl/atl_wrapper.h b/src/atl/atl_wrapper.h
deleted file mode 100644
index 4c5ec38e1..000000000
--- a/src/atl/atl_wrapper.h
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include <memory>
-#include <mutex>
-#include <list>
-#include <vector>
-
-#include "atl.h"
-#include "common/comm/atl_tag.hpp"
-#include "util/pm/pmi_resizable_rt/pmi_resizable/kvs/ikvs_wrapper.h"
-#include "util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.h"
-#include "util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.h"
-
-class ccl_executor;
-
-class atl_wrapper {
-public:
-    static void set_internal_env(const atl_attr_t& attr);
-    static void set_exec(ccl_executor* exec);
-
-    ~atl_wrapper();
-    atl_wrapper();
-    atl_wrapper(std::shared_ptr<ikvs_wrapper> k);
-    atl_wrapper(int total_rank_count,
-                const std::vector<int>& ranks,
-                std::shared_ptr<ikvs_wrapper> k);
-
-    atl_status_t atl_main_addr_reserve(char* main_addr) {
-        if (!pmi)
-            return ATL_STATUS_UNSUPPORTED;
-
-        return pmi->pmrt_main_addr_reserve(main_addr);
-        ;
-    }
-
-    atl_status_t atl_finalize() {
-        if (pmi)
-            pmi->pmrt_finalize();
-
-        return transport->atl_finalize();
-    }
-
-    atl_status_t atl_update() {
-        return transport->atl_update(pmi);
-    }
-
-    atl_status_t atl_wait_notification() {
-        if (!pmi)
-            return ATL_STATUS_UNSUPPORTED;
-
-        return pmi->pmrt_wait_notification();
-    }
-
-    atl_status_t atl_set_resize_function(atl_resize_fn_t fn) {
-        if (!pmi)
-            return ATL_STATUS_UNSUPPORTED;
-
-        return pmi->pmrt_set_resize_function(fn);
-    }
-
-    atl_proc_coord_t* atl_get_proc_coord() {
-        return transport->atl_get_proc_coord();
-    }
-
-    atl_status_t atl_mr_reg(const void* buf, size_t len, atl_mr_t** mr) {
-        return transport->atl_mr_reg(buf, len, mr);
-    }
-
-    atl_status_t atl_mr_dereg(atl_mr_t* mr) {
-        return transport->atl_mr_dereg(mr);
-    }
-
-    atl_status_t atl_ep_send(size_t ep_idx,
-                             const void* buf,
-                             size_t len,
-                             int dst_proc_idx,
-                             uint64_t tag,
-                             atl_req_t* req) {
-        return transport->atl_ep_send(eps[ep_idx], buf, len, dst_proc_idx, tag, req);
-    }
-
-    atl_status_t atl_ep_recv(size_t ep_idx,
-                             void* buf,
-                             size_t len,
-                             int src_proc_idx,
-                             uint64_t tag,
-                             atl_req_t* req) {
-        return transport->atl_ep_recv(eps[ep_idx], buf, len, src_proc_idx, tag, req);
-    }
-
-    atl_status_t atl_ep_probe(size_t ep_idx,
-                              int src_proc_idx,
-                              uint64_t tag,
-                              int* found,
-                              size_t* recv_len) {
-        return transport->atl_ep_probe(eps[ep_idx], src_proc_idx, tag, found, recv_len);
-    }
-
-    atl_status_t atl_ep_allgatherv(size_t ep_idx,
-                                   const void* send_buf,
-                                   size_t send_len,
-                                   void* recv_buf,
-                                   const int* recv_lens,
-                                   const int* offsets,
-                                   atl_req_t* req) {
-        return transport->atl_ep_allgatherv(
-            eps[ep_idx], send_buf, send_len, recv_buf, recv_lens, offsets, req);
-    }
-
-    atl_status_t atl_ep_allreduce(size_t ep_idx,
-                                  const void* send_buf,
-                                  void* recv_buf,
-                                  size_t len,
-                                  atl_datatype_t dtype,
-                                  atl_reduction_t op,
-                                  atl_req_t* req) {
-        return transport->atl_ep_allreduce(eps[ep_idx], send_buf, recv_buf, len, dtype, op, req);
-    }
-
-    atl_status_t atl_ep_alltoall(size_t ep_idx,
-                                 const void* send_buf,
-                                 void* recv_buf,
-                                 int len,
-                                 atl_req_t* req) {
-        return transport->atl_ep_alltoall(eps[ep_idx], send_buf, recv_buf, len, req);
-    }
-
-    atl_status_t atl_ep_alltoallv(size_t ep_idx,
-                                  const void* send_buf,
-                                  const int* send_lens,
-                                  const int* send_offsets,
-                                  void* recv_buf,
-                                  const int* recv_lens,
-                                  const int* recv_offsets,
-                                  atl_req_t* req) {
-        return transport->atl_ep_alltoallv(
-            eps[ep_idx], send_buf, send_lens, send_offsets, recv_buf, recv_lens, recv_offsets, req);
-    }
-
-    atl_status_t atl_ep_barrier(size_t ep_idx, atl_req_t* req) {
-        return transport->atl_ep_barrier(eps[ep_idx], req);
-    }
-
-    atl_status_t atl_ep_bcast(size_t ep_idx, void* buf, size_t len, int root, atl_req_t* req) {
-        return transport->atl_ep_bcast(eps[ep_idx], buf, len, root, req);
-    }
-
-    atl_status_t atl_ep_reduce(size_t ep_idx,
-                               const void* send_buf,
-                               void* recv_buf,
-                               size_t len,
-                               int root,
-                               atl_datatype_t dtype,
-                               atl_reduction_t op,
-                               atl_req_t* req) {
-        return transport->atl_ep_reduce(eps[ep_idx], send_buf, recv_buf, len, root, dtype, op, req);
-    }
-
-    atl_status_t atl_ep_reduce_scatter(size_t ep_idx,
-                                       const void* send_buf,
-                                       void* recv_buf,
-                                       size_t recv_len,
-                                       atl_datatype_t dtype,
-                                       atl_reduction_t op,
-                                       atl_req_t* req) {
-        return transport->atl_ep_reduce_scatter(
-            eps[ep_idx], send_buf, recv_buf, recv_len, dtype, op, req);
-    }
-
-    atl_status_t atl_ep_read(size_t ep_idx,
-                             void* buf,
-                             size_t len,
-                             atl_mr_t* mr,
-                             uint64_t addr,
-                             uintptr_t remote_key,
-                             int dst_proc_idx,
-                             atl_req_t* req) {
-        return transport->atl_ep_read(
-            eps[ep_idx], buf, len, mr, addr, remote_key, dst_proc_idx, req);
-    }
-
-    atl_status_t atl_ep_write(size_t ep_idx,
-                              const void* buf,
-                              size_t len,
-                              atl_mr_t* mr,
-                              uint64_t addr,
-                              uintptr_t remote_key,
-                              int dst_proc_idx,
-                              atl_req_t* req) {
-        return transport->atl_ep_write(
-            eps[ep_idx], buf, len, mr, addr, remote_key, dst_proc_idx, req);
-    }
-
-    atl_status_t atl_ep_wait(size_t ep_idx, atl_req_t* req) {
-        return transport->atl_ep_wait(eps[ep_idx], req);
-    }
-
-    atl_status_t atl_ep_wait_all(size_t ep_idx, atl_req_t* req, size_t count) {
-        return transport->atl_ep_wait_all(eps[ep_idx], req, count);
-    }
-
-    atl_status_t atl_ep_cancel(size_t ep_idx, atl_req_t* req) {
-        return transport->atl_ep_cancel(eps[ep_idx], req);
-    }
-
-    atl_status_t atl_ep_poll(size_t ep_idx) {
-        return transport->atl_ep_poll(eps[ep_idx]);
-    }
-
-    atl_status_t atl_ep_check(size_t ep_idx, int* is_completed, atl_req_t* req) {
-        return transport->atl_ep_check(eps[ep_idx], is_completed, req);
-    }
-
-    size_t get_threads_per_process() {
-        return threads_per_process;
-    }
-
-    size_t get_ranks_per_process() {
-        return ranks_per_process;
-    }
-
-    int get_rank() {
-        return rank;
-    }
-
-    int get_size() {
-        return size;
-    }
-
-    int get_r2r_color() {
-        return transport->atl_get_proc_coord()->local_idx;
-    }
-
-    int get_host_color() {
-        return transport->atl_get_proc_coord()->hostname_hash;
-    }
-
-    /*
-     * TODO: Temporary change.
-     * Need to define correct to unique id
-     */
-    size_t get_id() {
-        return 0;
-    }
-
-    /* static ATL attr for all transport instances
-       actual values generated by executor */
-    static atl_attr_t attr;
-
-    std::unique_ptr<ccl_atl_tag> tag;
-
-private:
-    int rank;
-    int size;
-
-    size_t threads_per_process;
-    size_t ranks_per_process;
-
-    std::shared_ptr<iatl> transport;
-    std::unique_ptr<ipmi> pmi;
-    atl_ep_t** eps = nullptr;
-
-    void init_transport();
-};
diff --git a/src/atl/mpi/atl_mpi.cpp b/src/atl/mpi/atl_mpi.cpp
index 2e742d80e..0670b44f6 100644
--- a/src/atl/mpi/atl_mpi.cpp
+++ b/src/atl/mpi/atl_mpi.cpp
@@ -15,187 +15,838 @@
 */
 #ifdef CCL_ENABLE_MPI
 
+#include "atl_def.h"
 #include "atl_mpi.hpp"
-#include "atl_mpi_impl.cpp"
 
-atl_status_t atl_mpi::atl_set_env(const atl_attr_t& attr) {
-    return atl_mpi_set_env(attr);
-}
+#define MPI_BFLOAT16 \
+    ({ \
+        CCL_THROW_IF_NOT(global_data.bf16.dtype != MPI_DATATYPE_NULL, \
+                         "unsupported datatype: ATL_DTYPE_BF16"); \
+        global_data.bf16.dtype; \
+    })
+
+#define MPI_FLOAT16 \
+    ({ \
+        CCL_THROW_IF_NOT(global_data.fp16.dtype != MPI_DATATYPE_NULL, \
+                         "unsupported datatype: ATL_DTYPE_FP16"); \
+        global_data.fp16.dtype; \
+    })
+
+#define RET2ATL(ret) (((ret) != MPI_SUCCESS) ? ATL_STATUS_FAILURE : ATL_STATUS_SUCCESS) /* MPI return code -> atl_status_t */
 
-atl_status_t atl_mpi::atl_init(int* argc,
-                               char*** argv,
-                               atl_attr_t* attr,
-                               const char* main_addr,
-                               std::unique_ptr<ipmi>& pmi) {
+atl_mpi_global_data atl_mpi::global_data{};
+
+atl_status_t atl_mpi::init(int* argc,
+                           char*** argv,
+                           atl_attr_t* attr,
+                           const char* main_addr,
+                           std::shared_ptr<ipmi> pmi) {
     inited = true;
-    return atl_mpi_init(argc, argv, attr, &ctx, main_addr, pmi.get());
+    CCL_THROW_IF_NOT((sizeof(atl_mpi_req_t) <= sizeof(atl_req_t) - offsetof(atl_req_t, internal)),
+                     "unexpected offset: atl_mpi_request size ",
+                     sizeof(atl_mpi_req_t),
+                     ", atl_request size ",
+                     sizeof(atl_req_t),
+                     ", expected offset ",
+                     offsetof(atl_req_t, internal));
+
+    int ret = MPI_SUCCESS;
+    int is_tag_ub_set = 0;
+    void* tag_ub_ptr = NULL;
+    int required_thread_level = MPI_THREAD_MULTIPLE, provided_thread_level;
+
+    if (global_data.ctx_count == 0) {
+        if (global_data.set_env(*attr)) {
+            goto err_init;
+        }
+
+        MPI_Initialized(&global_data.is_external_init);
+
+        if (!global_data.is_external_init) {
+            ret = MPI_Init_thread(argc, argv, required_thread_level, &provided_thread_level);
+            if (provided_thread_level < required_thread_level) {
+                LOG_ERROR("unexpected MPI thread level: required ",
+                          required_thread_level,
+                          ", provided ",
+                          provided_thread_level);
+                goto err_init;
+            }
+        }
+        else {
+            LOG_DEBUG("MPI was initialized externaly");
+            MPI_Query_thread(&provided_thread_level);
+            if (provided_thread_level < required_thread_level) {
+                LOG_WARN("MPI was initialized externaly but with unexpected thread level: "
+                         "required ",
+                         required_thread_level,
+                         ", provided ",
+                         provided_thread_level);
+            }
+        }
+
+        if (ret)
+            goto err_init;
+
+        if (global_data.update_global_data(attr) == ATL_STATUS_FAILURE) {
+            goto err_init;
+        }
+    }
+    global_data.ctx_count++;
+
+    coord_update(MPI_COMM_WORLD, global_coord);
+
+    ep_count = attr->in.ep_count;
+
+    char* progress_mode_env;
+    progress_mode_env = getenv(ATL_PROGRESS_MODE_ENV);
+    if (progress_mode_env) {
+        progress_mode = (atl_progress_mode_t)atoi(progress_mode_env);
+    }
+    else {
+        progress_mode = ATL_PROGRESS_CHECK;
+    }
+    sync_coll = attr->in.enable_sync_coll;
+
+    if (global_coord.global_idx == 0) {
+        global_data.print_log_info();
+        LOG_INFO("atl-mpi-ctx: ", (global_data.ctx_count - 1));
+        LOG_INFO("  progress_mode: ", progress_mode);
+        LOG_INFO("  sync_coll: ", sync_coll);
+    }
+
+    MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_TAG_UB, &tag_ub_ptr, &is_tag_ub_set);
+
+    /* report actual attributes back to upper level */
+    attr->out.enable_shm = 0;
+    attr->out.enable_rma = 0;
+    attr->out.enable_hmem = attr->in.enable_hmem & global_data.mpi_lib_attr.hmem;
+    attr->out.mnic_type = global_data.mnic_type;
+    attr->out.mnic_count = global_data.mnic_count;
+    attr->out.tag_bits = 32;
+    attr->out.max_tag = (is_tag_ub_set) ? *((int*)tag_ub_ptr) : 0;
+    attr->out.max_order_waw_size = 0;
+
+    return ATL_STATUS_SUCCESS;
+
+err_init:
+    return ATL_STATUS_FAILURE;
+}
+
+void atl_mpi::coord_update(MPI_Comm base_comm, atl_proc_coord_t& coord) {
+    MPI_Comm_rank(base_comm, (int*)&(coord.global_idx));
+    MPI_Comm_size(base_comm, (int*)&(coord.global_count));
+
+    MPI_Comm local_comm;
+    MPI_Comm_split_type(
+        base_comm, MPI_COMM_TYPE_SHARED, coord.global_count, MPI_INFO_NULL, &local_comm);
+    MPI_Comm_rank(local_comm, (int*)&(coord.local_idx));
+    MPI_Comm_size(local_comm, (int*)&(coord.local_count));
+    MPI_Comm_free(&local_comm);
+
+    char my_hostname[ATL_MAX_HOSTNAME_LEN] = { 0 };
+    gethostname(my_hostname, ATL_MAX_HOSTNAME_LEN - 1);
+    coord.hostname_hash = std::hash<std::string>{}(my_hostname);
+}
+
+void atl_mpi::comms_free(std::vector<atl_mpi_ep_t>& eps) { /* frees the MPI communicators owned by each EP in eps */
+    for (size_t i = 0; i < eps.size(); i++) {
+        atl_mpi_ep_t& mpi_ep = eps[i];
+        if (progress_mode == ATL_PROGRESS_POLL) { /* dummy comm/request exist only in POLL mode */
+            MPI_Cancel(&(mpi_ep.dummy_req.native_req));
+            MPI_Wait(&(mpi_ep.dummy_req.native_req), MPI_STATUS_IGNORE); /* a cancelled request must still be completed, else it leaks */
+            MPI_Comm_free(&mpi_ep.dummy_comm);
+        }
+        MPI_Comm_free(&mpi_ep.mpi_comm);
+    }
 }
 
-atl_status_t atl_mpi::atl_finalize() {
+atl_status_t atl_mpi::finalize() {
     is_finalized = true;
-    return atl_mpi_finalize(ctx);
+
+    int ret = MPI_SUCCESS;
+
+    global_data.ctx_count--;
+    if (global_coord.global_idx == 0) {
+        LOG_INFO("finalize atl-mpi ctx, remaining ctx_count ", global_data.ctx_count);
+    }
+
+    int is_mpi_finalized = 0;
+    MPI_Finalized(&is_mpi_finalized);
+
+    if (!is_mpi_finalized) {
+        if (global_data.ctx_count == 0) {
+            global_data.bf16_finalize();
+            global_data.fp16_finalize();
+            if (!global_data.is_external_init) {
+                ret = MPI_Finalize();
+            }
+            else {
+                LOG_DEBUG("MPI_Init has been called externally, skip MPI_Finalize");
+            }
+
+            if (global_coord.global_idx == 0) {
+                LOG_INFO("finalized last atl-mpi ctx");
+            }
+        }
+    }
+    else {
+        if ((global_data.ctx_count == 0) && (global_coord.global_idx == 0)) {
+            LOG_WARN("MPI_Finalize has been called before CCL finalization");
+        }
+    }
+
+    return RET2ATL(ret);
 }
 
-atl_status_t atl_mpi::atl_update(std::unique_ptr<ipmi>& pmi) {
+atl_status_t atl_mpi::update(std::shared_ptr<ipmi> pmi) {
     (void)pmi;
     return ATL_STATUS_UNSUPPORTED;
 }
 
-atl_ep_t** atl_mpi::atl_get_eps() {
-    return ctx->eps;
+atl_status_t atl_mpi::mr_reg(const void* buf, size_t len, atl_mr_t** mr) {
+    return ATL_STATUS_UNSUPPORTED;
 }
 
-atl_proc_coord_t* atl_mpi::atl_get_proc_coord() {
-    return &(ctx->coord);
+atl_status_t atl_mpi::mr_dereg(atl_mr_t* mr) {
+    return ATL_STATUS_UNSUPPORTED;
 }
 
-atl_status_t atl_mpi::atl_mr_reg(const void* buf, size_t len, atl_mr_t** mr) {
-    return atl_mpi_mr_reg(ctx, buf, len, mr);
-}
+atl_status_t atl_mpi::send(atl_mpi_ep_t& ep,
+                           const void* buf,
+                           size_t len,
+                           int dst_proc_idx,
+                           uint64_t tag,
+                           atl_req_t* req) {
+    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
 
-atl_status_t atl_mpi::atl_mr_dereg(atl_mr_t* mr) {
-    return atl_mpi_mr_dereg(ctx, mr);
-}
+    init_req(req);
+
+    int ret =
+        MPI_Isend(buf, len, MPI_CHAR, dst_proc_idx, (int)tag, ep.mpi_comm, &mpi_req->native_req);
 
-atl_status_t atl_mpi::atl_ep_send(atl_ep_t* ep,
-                                  const void* buf,
-                                  size_t len,
-                                  int dst_proc_idx,
-                                  uint64_t tag,
-                                  atl_req_t* req) {
-    return atl_mpi_ep_send(ep, buf, len, dst_proc_idx, tag, req);
+    check_ep(ep);
+
+    return RET2ATL(ret);
 }
 
-atl_status_t atl_mpi::atl_ep_recv(atl_ep_t* ep,
-                                  void* buf,
-                                  size_t len,
-                                  int src_proc_idx,
-                                  uint64_t tag,
-                                  atl_req_t* req) {
-    return atl_mpi_ep_recv(ep, buf, len, src_proc_idx, tag, req);
+atl_status_t atl_mpi::recv(atl_mpi_ep_t& ep,
+                           void* buf,
+                           size_t len,
+                           int src_proc_idx,
+                           uint64_t tag,
+                           atl_req_t* req) {
+    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
+
+    init_req(req);
+
+    int ret =
+        MPI_Irecv(buf, len, MPI_CHAR, src_proc_idx, (int)tag, ep.mpi_comm, &mpi_req->native_req);
+
+    check_ep(ep);
+
+    return RET2ATL(ret);
 }
 
-atl_status_t atl_mpi::atl_ep_probe(atl_ep_t* ep,
-                                   int src_proc_idx,
-                                   uint64_t tag,
-                                   int* found,
-                                   size_t* recv_len) {
-    return atl_mpi_ep_probe(ep, src_proc_idx, tag, found, recv_len);
+atl_status_t atl_mpi::probe(atl_mpi_ep_t& ep,
+                            int src_proc_idx,
+                            uint64_t tag,
+                            int* found,
+                            size_t* recv_len) {
+    int flag = 0, len = 0, ret;
+    MPI_Status status;
+
+    ret = MPI_Iprobe(src_proc_idx, tag, ep.mpi_comm, &flag, &status);
+    if (flag) {
+        MPI_Get_count(&status, MPI_BYTE, &len);
+    }
+
+    if (found)
+        *found = flag;
+    if (recv_len)
+        *recv_len = len;
+
+    check_ep(ep);
+
+    return RET2ATL(ret);
 }
 
-atl_status_t atl_mpi::atl_ep_allgatherv(atl_ep_t* ep,
-                                        const void* send_buf,
-                                        size_t send_len,
-                                        void* recv_buf,
-                                        const int* recv_lens,
-                                        const int* offsets,
-                                        atl_req_t* req) {
-    return atl_mpi_ep_allgatherv(ep, send_buf, send_len, recv_buf, recv_lens, offsets, req);
+atl_status_t atl_mpi::allgatherv(atl_mpi_ep_t& ep,
+                                 const void* send_buf,
+                                 size_t send_len,
+                                 void* recv_buf,
+                                 const int* recv_lens,
+                                 const int* offsets,
+                                 atl_req_t* req) {
+    int ret = MPI_SUCCESS;
+
+    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
+
+    init_req(req);
+
+    if (sync_coll) {
+        ret = MPI_Allgatherv((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
+                             send_len,
+                             MPI_CHAR,
+                             recv_buf,
+                             recv_lens,
+                             offsets,
+                             MPI_CHAR,
+                             ep.mpi_comm);
+    }
+    else {
+        ret = MPI_Iallgatherv((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
+                              send_len,
+                              MPI_CHAR,
+                              recv_buf,
+                              recv_lens,
+                              offsets,
+                              MPI_CHAR,
+                              ep.mpi_comm,
+                              &mpi_req->native_req);
+    }
+
+    check_ep(ep);
+
+    return RET2ATL(ret);
 }
 
-atl_status_t atl_mpi::atl_ep_allreduce(atl_ep_t* ep,
-                                       const void* send_buf,
-                                       void* recv_buf,
-                                       size_t len,
-                                       atl_datatype_t dtype,
-                                       atl_reduction_t op,
-                                       atl_req_t* req) {
-    return atl_mpi_ep_allreduce(ep, send_buf, recv_buf, len, dtype, op, req);
+atl_status_t atl_mpi::allreduce(atl_mpi_ep_t& ep,
+                                const void* send_buf,
+                                void* recv_buf,
+                                size_t len,
+                                atl_datatype_t dtype,
+                                atl_reduction_t op,
+                                atl_req_t* req) {
+    int ret = MPI_SUCCESS;
+
+    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
+
+    MPI_Datatype mpi_dtype = atl2mpi_dtype(dtype);
+    MPI_Op mpi_op = atl2mpi_op(op, mpi_dtype);
+
+    init_req(req);
+
+    if (sync_coll) {
+        ret = MPI_Allreduce((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
+                            recv_buf,
+                            len,
+                            mpi_dtype,
+                            mpi_op,
+                            ep.mpi_comm);
+    }
+    else {
+        //printf("atl_mpi: send_buf %p, recv_buf %p\n", send_buf, recv_buf);
+        ret = MPI_Iallreduce((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
+                             recv_buf,
+                             len,
+                             mpi_dtype,
+                             mpi_op,
+                             ep.mpi_comm,
+                             &mpi_req->native_req);
+    }
+
+    check_ep(ep);
+
+    return RET2ATL(ret);
 }
 
-atl_status_t atl_mpi::atl_ep_alltoall(atl_ep_t* ep,
-                                      const void* send_buf,
-                                      void* recv_buf,
-                                      int len,
-                                      atl_req_t* req) {
-    return atl_mpi_ep_alltoall(ep, send_buf, recv_buf, len, req);
+atl_status_t atl_mpi::alltoall(atl_mpi_ep_t& ep,
+                               const void* send_buf,
+                               void* recv_buf,
+                               int len,
+                               atl_req_t* req) {
+    int ret = MPI_SUCCESS;
+
+    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
+
+    init_req(req);
+
+    if (sync_coll) {
+        ret = MPI_Alltoall((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
+                           len,
+                           MPI_CHAR,
+                           recv_buf,
+                           len,
+                           MPI_CHAR,
+                           ep.mpi_comm);
+    }
+    else {
+        ret = MPI_Ialltoall((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
+                            len,
+                            MPI_CHAR,
+                            recv_buf,
+                            len,
+                            MPI_CHAR,
+                            ep.mpi_comm,
+                            &mpi_req->native_req);
+    }
+
+    check_ep(ep);
+
+    return RET2ATL(ret);
 }
 
-atl_status_t atl_mpi::atl_ep_alltoallv(atl_ep_t* ep,
-                                       const void* send_buf,
-                                       const int* send_lens,
-                                       const int* send_offsets,
-                                       void* recv_buf,
-                                       const int* recv_lens,
-                                       const int* recv_offsets,
-                                       atl_req_t* req) {
-    return atl_mpi_ep_alltoallv(
-        ep, send_buf, send_lens, send_offsets, recv_buf, recv_lens, recv_offsets, req);
+atl_status_t atl_mpi::alltoallv(atl_mpi_ep_t& ep,
+                                const void* send_buf,
+                                const int* send_lens,
+                                const int* send_offsets,
+                                void* recv_buf,
+                                const int* recv_lens,
+                                const int* recv_offsets,
+                                atl_req_t* req) {
+    int ret = MPI_SUCCESS;
+
+    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
+
+    init_req(req);
+
+    if (sync_coll) {
+        ret = MPI_Alltoallv((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
+                            send_lens,
+                            send_offsets,
+                            MPI_CHAR,
+                            recv_buf,
+                            recv_lens,
+                            recv_offsets,
+                            MPI_CHAR,
+                            ep.mpi_comm);
+    }
+    else {
+        ret = MPI_Ialltoallv((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
+                             send_lens,
+                             send_offsets,
+                             MPI_CHAR,
+                             recv_buf,
+                             recv_lens,
+                             recv_offsets,
+                             MPI_CHAR,
+                             ep.mpi_comm,
+                             &mpi_req->native_req);
+    }
+
+    check_ep(ep);
+
+    return RET2ATL(ret);
 }
 
-atl_status_t atl_mpi::atl_ep_barrier(atl_ep_t* ep, atl_req_t* req) {
-    return atl_mpi_ep_barrier(ep, req);
+atl_status_t atl_mpi::barrier(atl_mpi_ep_t& ep, atl_req_t* req) {
+    int ret = MPI_SUCCESS;
+
+    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
+
+    init_req(req);
+
+    if (sync_coll) {
+        ret = MPI_Barrier(ep.mpi_comm);
+    }
+    else {
+        ret = MPI_Ibarrier(ep.mpi_comm, &mpi_req->native_req);
+    }
+
+    check_ep(ep);
+
+    return RET2ATL(ret);
 }
 
-atl_status_t atl_mpi::atl_ep_bcast(atl_ep_t* ep, void* buf, size_t len, int root, atl_req_t* req) {
-    return atl_mpi_ep_bcast(ep, buf, len, root, req);
+atl_status_t atl_mpi::bcast(atl_mpi_ep_t& ep, void* buf, size_t len, int root, atl_req_t* req) {
+    int ret = MPI_SUCCESS;
+
+    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
+
+    init_req(req);
+
+    if (sync_coll) {
+        ret = MPI_Bcast(buf, len, MPI_CHAR, root, ep.mpi_comm);
+    }
+    else {
+        ret = MPI_Ibcast(buf, len, MPI_CHAR, root, ep.mpi_comm, &mpi_req->native_req);
+    }
+
+    check_ep(ep);
+
+    return RET2ATL(ret);
 }
 
-atl_status_t atl_mpi::atl_ep_reduce(atl_ep_t* ep,
-                                    const void* send_buf,
-                                    void* recv_buf,
-                                    size_t len,
-                                    int root,
-                                    atl_datatype_t dtype,
-                                    atl_reduction_t op,
-                                    atl_req_t* req) {
-    return atl_mpi_ep_reduce(ep, send_buf, recv_buf, len, root, dtype, op, req);
+atl_status_t atl_mpi::reduce(atl_mpi_ep_t& ep,
+                             const void* send_buf,
+                             void* recv_buf,
+                             size_t len,
+                             int root,
+                             atl_datatype_t dtype,
+                             atl_reduction_t op,
+                             atl_req_t* req) {
+    int ret = MPI_SUCCESS;
+
+    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
+
+    int my_proc_idx = ep.coord->global_idx;
+    MPI_Datatype mpi_dtype = atl2mpi_dtype(dtype);
+    MPI_Op mpi_op = atl2mpi_op(op, mpi_dtype);
+
+    init_req(req);
+
+    if (sync_coll) {
+        ret = MPI_Reduce(
+            (send_buf && (send_buf == recv_buf) && (root == my_proc_idx)) ? MPI_IN_PLACE : send_buf,
+            recv_buf,
+            len,
+            mpi_dtype,
+            mpi_op,
+            root,
+            ep.mpi_comm);
+    }
+    else {
+        ret = MPI_Ireduce(
+            (send_buf && (send_buf == recv_buf) && (root == my_proc_idx)) ? MPI_IN_PLACE : send_buf,
+            recv_buf,
+            len,
+            mpi_dtype,
+            mpi_op,
+            root,
+            ep.mpi_comm,
+            &mpi_req->native_req);
+    }
+
+    check_ep(ep);
+
+    return RET2ATL(ret);
 }
 
-atl_status_t atl_mpi::atl_ep_reduce_scatter(atl_ep_t* ep,
-                                            const void* send_buf,
-                                            void* recv_buf,
-                                            size_t recv_len,
-                                            atl_datatype_t dtype,
-                                            atl_reduction_t op,
-                                            atl_req_t* req) {
-    return atl_mpi_ep_reduce_scatter(ep, send_buf, recv_buf, recv_len, dtype, op, req);
+atl_status_t atl_mpi::reduce_scatter(atl_mpi_ep_t& ep,
+                                     const void* send_buf,
+                                     void* recv_buf,
+                                     size_t recv_len,
+                                     atl_datatype_t dtype,
+                                     atl_reduction_t op,
+                                     atl_req_t* req) {
+    int ret = MPI_SUCCESS;
+
+    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
+
+    MPI_Datatype mpi_dtype = atl2mpi_dtype(dtype);
+    MPI_Op mpi_op = atl2mpi_op(op, mpi_dtype);
+
+    init_req(req);
+
+    if (sync_coll) {
+        ret =
+            MPI_Reduce_scatter_block((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
+                                     recv_buf,
+                                     recv_len,
+                                     mpi_dtype,
+                                     mpi_op,
+                                     ep.mpi_comm);
+    }
+    else {
+        ret = MPI_Ireduce_scatter_block(
+            (send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
+            recv_buf,
+            recv_len,
+            mpi_dtype,
+            mpi_op,
+            ep.mpi_comm,
+            &mpi_req->native_req);
+    }
+
+    check_ep(ep);
+
+    return RET2ATL(ret);
 }
 
-atl_status_t atl_mpi::atl_ep_read(atl_ep_t* ep,
-                                  void* buf,
-                                  size_t len,
-                                  atl_mr_t* mr,
-                                  uint64_t addr,
-                                  uintptr_t remote_key,
-                                  int dst_proc_idx,
-                                  atl_req_t* req) {
-    return atl_mpi_ep_read(ep, buf, len, mr, addr, remote_key, dst_proc_idx, req);
+atl_status_t atl_mpi::read(atl_mpi_ep_t& ep,
+                           void* buf,
+                           size_t len,
+                           atl_mr_t* mr,
+                           uint64_t addr,
+                           uintptr_t remote_key,
+                           int dst_proc_idx,
+                           atl_req_t* req) {
+    return ATL_STATUS_UNSUPPORTED;
 }
 
-atl_status_t atl_mpi::atl_ep_write(atl_ep_t* ep,
-                                   const void* buf,
-                                   size_t len,
-                                   atl_mr_t* mr,
-                                   uint64_t addr,
-                                   uintptr_t remote_key,
-                                   int dst_proc_idx,
-                                   atl_req_t* req) {
-    return atl_mpi_ep_write(ep, buf, len, mr, addr, remote_key, dst_proc_idx, req);
+atl_status_t atl_mpi::write(atl_mpi_ep_t& ep,
+                            const void* buf,
+                            size_t len,
+                            atl_mr_t* mr,
+                            uint64_t addr,
+                            uintptr_t remote_key,
+                            int dst_proc_idx,
+                            atl_req_t* req) {
+    return ATL_STATUS_UNSUPPORTED;
 }
 
-atl_status_t atl_mpi::atl_ep_wait(atl_ep_t* ep, atl_req_t* req) {
-    return atl_mpi_ep_wait(ep, req);
+atl_status_t atl_mpi::wait(atl_mpi_ep_t& ep, atl_req_t* req) {
+    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
+    /* block until the native request finishes; the MPI status itself is not used */
+    int ret = MPI_Wait(&mpi_req->native_req, MPI_STATUS_IGNORE);
+    mpi_req->comp_state = ATL_MPI_COMP_COMPLETED;
+    req->is_completed = 1; /* keep the public flag in sync with comp_state, as check() does */
+    return RET2ATL(ret);
 }
 
-atl_status_t atl_mpi::atl_ep_wait_all(atl_ep_t* ep, atl_req_t* req, size_t count) {
-    return atl_mpi_ep_wait_all(ep, req, count);
+atl_status_t atl_mpi::wait_all(atl_mpi_ep_t& ep, atl_req_t* req, size_t count) {
+    return ATL_STATUS_UNSUPPORTED;
 }
 
-atl_status_t atl_mpi::atl_ep_cancel(atl_ep_t* ep, atl_req_t* req) {
+atl_status_t atl_mpi::cancel(atl_mpi_ep_t& ep, atl_req_t* req) {
     return ATL_STATUS_UNSUPPORTED;
 }
 
-atl_status_t atl_mpi::atl_ep_poll(atl_ep_t* ep) {
-    return atl_mpi_ep_poll(ep);
+atl_status_t atl_mpi::poll(atl_mpi_ep_t& ep) {
+    if (progress_mode == ATL_PROGRESS_POLL) {
+        return ep_progress(ep, &(ep.dummy_req));
+    }
+
+    return ATL_STATUS_SUCCESS;
 }
 
-atl_status_t atl_mpi::atl_ep_check(atl_ep_t* ep, int* is_completed, atl_req_t* req) {
-    return atl_mpi_ep_check(ep, is_completed, req);
+atl_status_t atl_mpi::check(atl_mpi_ep_t& ep, atl_req_t* req) {
+    atl_status_t status;
+
+    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
+
+    CCL_THROW_IF_NOT(!req->is_completed, "request is already completed");
+    CCL_THROW_IF_NOT(mpi_req->comp_state == ATL_MPI_COMP_POSTED, "request is already completed");
+
+    if (mpi_req->native_req == MPI_REQUEST_NULL) {
+        mpi_req->comp_state = ATL_MPI_COMP_COMPLETED;
+    }
+
+    req->is_completed = (mpi_req->comp_state == ATL_MPI_COMP_COMPLETED);
+    if (req->is_completed) {
+        return ATL_STATUS_SUCCESS;
+    }
+
+    status = ep_progress(ep, mpi_req);
+    req->is_completed = (mpi_req->comp_state == ATL_MPI_COMP_COMPLETED);
+
+    return status;
 }
+
 atl_mpi::~atl_mpi() {
     if (!is_finalized)
-        atl_finalize();
+        finalize();
+}
+
+MPI_Datatype atl_mpi::atl2mpi_dtype(atl_datatype_t dtype) { /* maps an ATL datatype to its MPI datatype handle */
+    switch (dtype) {
+        case ATL_DTYPE_INT8: return MPI_INT8_T; /* not MPI_CHAR: character types are not valid in MPI reductions */
+        case ATL_DTYPE_UINT8: return MPI_UNSIGNED_CHAR;
+        case ATL_DTYPE_INT16: return MPI_INT16_T;
+        case ATL_DTYPE_UINT16: return MPI_UINT16_T;
+        case ATL_DTYPE_INT32: return MPI_INT;
+        case ATL_DTYPE_UINT32: return MPI_UINT32_T;
+        case ATL_DTYPE_INT64: return MPI_LONG_LONG;
+        case ATL_DTYPE_UINT64: return MPI_UNSIGNED_LONG_LONG;
+        case ATL_DTYPE_FLOAT16: return MPI_FLOAT16; /* macro: throws if the fp16 MPI datatype is MPI_DATATYPE_NULL */
+        case ATL_DTYPE_FLOAT32: return MPI_FLOAT;
+        case ATL_DTYPE_FLOAT64: return MPI_DOUBLE;
+        case ATL_DTYPE_BFLOAT16: return MPI_BFLOAT16; /* macro: throws if the bf16 MPI datatype is MPI_DATATYPE_NULL */
+        default: printf("unknown datatype: %d\n", dtype); exit(1); /* unknown value terminates the process */
+    }
+}
+
+inline atl_status_t atl_mpi::ep_progress(atl_mpi_ep_t& ep, atl_mpi_req_t* req) {
+    int flag = 0;
+    int ret = MPI_Test(&req->native_req, &flag, MPI_STATUS_IGNORE);
+
+    if (flag) {
+        req->comp_state = ATL_MPI_COMP_COMPLETED;
+    }
+
+    return RET2ATL(ret);
+}
+
+void atl_mpi::init_req(atl_req_t* req) {
+    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
+    mpi_req->native_req = MPI_REQUEST_NULL;
+    mpi_req->comp_state = ATL_MPI_COMP_POSTED;
+    req->is_completed = 0;
+}
+
+MPI_Op atl_mpi::atl2mpi_op(atl_reduction_t rtype, MPI_Datatype dtype) {
+#ifdef ATL_MPI_BF16
+    if (dtype == global_data.bf16.dtype)
+        return global_data.atl2mpi_op_bf16(rtype);
+#endif // ATL_MPI_BF16
+
+#ifdef ATL_MPI_FP16
+    if (dtype == global_data.fp16.dtype)
+        return global_data.atl2mpi_op_fp16(rtype);
+#endif // ATL_MPI_FP16
+
+    (void)dtype;
+    switch (rtype) {
+        case ATL_REDUCTION_SUM: return MPI_SUM;
+        case ATL_REDUCTION_PROD: return MPI_PROD;
+        case ATL_REDUCTION_MIN: return MPI_MIN;
+        case ATL_REDUCTION_MAX: return MPI_MAX;
+        default: printf("unknown reduction type: %d\n", rtype); exit(1);
+    }
+}
+
+size_t atl_mpi::get_ep_idx(size_t ep_idx) {
+    /* Translate an ATL endpoint index into the MPI-level endpoint index by
+       shifting it by global_data.extra_ep; a shift of zero is a no-op, so
+       no explicit zero check is required. */
+    return ep_idx + global_data.extra_ep;
+}
+
+atl_status_t atl_mpi::ep_init(std::vector<atl_mpi_ep_t>& eps) { /* builds ep_count EPs by splitting MPI_COMM_WORLD */
+    atl_mpi_ep_t base_ep{}; /* value-init: dummy_req is copied into base_eps and must not be indeterminate */
+    base_ep.mpi_comm = MPI_COMM_WORLD;
+    base_ep.dummy_comm = MPI_COMM_WORLD;
+    base_ep.idx = 0;
+    base_ep.coord = nullptr;
+    std::vector<atl_mpi_ep_t> base_eps(ep_count, base_ep); /* every base EP refers to MPI_COMM_WORLD */
+    return comm_split(base_eps, eps, 0); /* color 0 for all callers */
+}
+
+#ifdef ENABLE_DEBUG
+void atl_mpi::check_ep(atl_mpi_ep_t& ep) {
+    check_comm_ep_idx(ep.mpi_comm, get_ep_idx(ep.idx));
+}
+#endif // ENABLE_DEBUG
+
+void atl_mpi::check_comm_nic_idx(MPI_Comm comm, size_t expected_idx) {
+    char expected_idx_str[MPI_MAX_INFO_VAL] = { 0 };
+    snprintf(expected_idx_str, MPI_MAX_INFO_VAL, "%zu", expected_idx);
+    check_comm_info(comm, global_data.NIC_IDX_KEY, expected_idx_str);
+}
+
+void atl_mpi::check_comm_ep_idx(MPI_Comm comm, size_t expected_idx) {
+    if (global_data.mpi_lib_attr.type == global_data.ATL_MPI_LIB_NONE)
+        return;
+
+    char expected_idx_str[MPI_MAX_INFO_VAL] = { 0 };
+    snprintf(expected_idx_str, MPI_MAX_INFO_VAL, "%zu", expected_idx);
+    check_comm_info(comm, global_data.EP_IDX_KEY, expected_idx_str);
+}
+
+void atl_mpi::check_comm_info(MPI_Comm comm, const char* key, const char* expected_value) {
+    atl_mpi_comm_info_t info = atl_mpi::get_comm_info(comm, key);
+
+    CCL_THROW_IF_NOT(info.found, "MPI comm key ", key, " was not set");
+    CCL_THROW_IF_NOT(!strcmp(info.value, expected_value),
+                     "MPI comm key ",
+                     key,
+                     ": expected: ",
+                     expected_value,
+                     ", read: ",
+                     info.value);
+}
+
+void atl_mpi::set_env(const atl_attr_t& attr) {
+    global_data.set_env(attr);
+}
+
+atl_status_t atl_mpi::comm_split(const std::vector<atl_mpi_ep_t>& base_eps,
+                                 std::vector<atl_mpi_ep_t>& eps,
+                                 size_t color) {
+    int ret = MPI_SUCCESS; /* stays SUCCESS when ep_count == 0 and the loop never runs */
+    atl_mpi_ep_t ep{}; /* value-init: dummy_* and coord are not always assigned before push_back */
+    for (size_t idx = 0; idx < ep_count; idx++) {
+        size_t mpi_ep_idx = get_ep_idx(idx); /* size_t to match the "%zu" format used below */
+        char mpi_ep_idx_str[MPI_MAX_INFO_VAL] = { 0 };
+        size_t nic_idx = 0;
+        char nic_idx_str[MPI_MAX_INFO_VAL] = { 0 };
+
+        ret = MPI_Comm_split(base_eps[idx].mpi_comm, color, 0, &ep.mpi_comm);
+        if (ret) {
+            LOG_ERROR("MPI_Comm_split error, ep_idx ", idx);
+            break;
+        }
+
+        MPI_Info info;
+        MPI_Info_create(&info);
+        /* set EP index */
+        snprintf(mpi_ep_idx_str, MPI_MAX_INFO_VAL, "%zu", mpi_ep_idx);
+        MPI_Info_set(info, global_data.EP_IDX_KEY, mpi_ep_idx_str);
+
+        if (global_data.mnic_type != ATL_MNIC_NONE) {
+            /* set NIC index */
+            nic_idx = idx;
+            if (global_data.mnic_offset == ATL_MNIC_OFFSET_LOCAL_PROC_IDX) {
+                nic_idx += global_coord.local_idx;
+            }
+            nic_idx %= global_data.mnic_count;
+            snprintf(nic_idx_str, MPI_MAX_INFO_VAL, "%zu", nic_idx);
+            MPI_Info_set(info, global_data.NIC_IDX_KEY, nic_idx_str);
+
+            LOG_INFO("select nic: ep_idx ",
+                     idx,
+                     ", local_proc_idx ",
+                     global_coord.local_idx,
+                     ", nic_idx ",
+                     nic_idx);
+        }
+
+        MPI_Comm_set_info(ep.mpi_comm, info);
+
+        if (progress_mode == ATL_PROGRESS_POLL) {
+            ret = MPI_Comm_split(base_eps[idx].dummy_comm, color, 0, &ep.dummy_comm);
+            if (ret) {
+                LOG_ERROR("MPI_Comm_split error, ep_idx ", idx);
+                MPI_Info_free(&info); /* do not leak the info object on the error path */
+                MPI_Comm_free(&ep.mpi_comm); /* this ep is never pushed, so comms_free() would miss it */
+                break;
+            }
+            MPI_Comm_set_info(ep.dummy_comm, info);
+            MPI_Irecv(NULL, 0, MPI_CHAR, 0, 0, ep.dummy_comm, &(ep.dummy_req.native_req));
+
+            check_comm_ep_idx(ep.dummy_comm, mpi_ep_idx);
+            if (global_data.mnic_type != ATL_MNIC_NONE) {
+                check_comm_nic_idx(ep.dummy_comm, nic_idx);
+            }
+        }
+
+        MPI_Info_free(&info);
+
+        check_comm_ep_idx(ep.mpi_comm, mpi_ep_idx);
+        if (global_data.mnic_type != ATL_MNIC_NONE) {
+            check_comm_nic_idx(ep.mpi_comm, nic_idx);
+        }
+
+        LOG_DEBUG("atl-mpi-ep: ", idx, ", ep_idx ", mpi_ep_idx, ", nic_idx ", nic_idx);
+
+        ep.idx = idx;
+        eps.push_back(ep);
+    }
+
+    if (ret) { /* roll back: free the EPs created so far and drop this context */
+        comms_free(eps);
+        global_data.ctx_count--;
+        if (global_data.ctx_count == 0) {
+            global_data.bf16_finalize();
+            global_data.fp16_finalize();
+            if (!global_data.is_external_init) {
+                MPI_Finalize();
+            }
+        }
+    }
+
+    return RET2ATL(ret);
+}
+
+atl_mpi_env_info_t atl_mpi::get_env_info(const char* key) { /* looks up key in MPI_INFO_ENV; res.found reports whether it was set */
+    atl_mpi_env_info_t res;
+    snprintf(res.key, MPI_MAX_INFO_KEY, "%s", key);
+    MPI_Info_get(MPI_INFO_ENV, key, MPI_MAX_INFO_VAL - 1, res.value, &res.found); /* valuelen excludes the terminating NUL */
+    return res;
+}
+
+atl_mpi_comm_info_t atl_mpi::get_comm_info(MPI_Comm comm, const char* key) { /* reads key from the info object attached to comm */
+    MPI_Info info;
+    atl_mpi_comm_info_t res;
+
+    res.comm = comm;
+    snprintf(res.key, MPI_MAX_INFO_KEY, "%s", key);
+
+    MPI_Comm_get_info(res.comm, &info); /* returns a new info object that must be freed below */
+    MPI_Info_get(info, key, MPI_MAX_INFO_VAL - 1, res.value, &res.found); /* valuelen excludes the terminating NUL */
+    MPI_Info_free(&info);
+
+    return res;
 }
 
 #endif // CCL_ENABLE_MPI
diff --git a/src/atl/mpi/atl_mpi.hpp b/src/atl/mpi/atl_mpi.hpp
index 03760a0ae..e82997e27 100644
--- a/src/atl/mpi/atl_mpi.hpp
+++ b/src/atl/mpi/atl_mpi.hpp
@@ -13,153 +13,212 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
+#pragma once
 #ifdef CCL_ENABLE_MPI
+#include <mpi.h>
 
-#include "atl.h"
+#include "atl_mpi_global_data.hpp"
 
-class atl_mpi final : public iatl {
+typedef enum { ATL_MPI_COMP_POSTED, ATL_MPI_COMP_COMPLETED } atl_mpi_comp_state_t;
+// Request record: the native MPI request handle paired with one of the two states above.
+typedef struct {
+    MPI_Request native_req; // MPI handle, e.g. the one MPI_Irecv fills in for an endpoint's dummy_req
+    atl_mpi_comp_state_t comp_state; // ATL_MPI_COMP_POSTED or ATL_MPI_COMP_COMPLETED
+} atl_mpi_req_t;
+
+typedef struct { // one endpoint; atl_mpi_comm passes eps[ep_idx] of this type to every atl_mpi call
+    MPI_Comm mpi_comm; // endpoint communicator (eps[0].mpi_comm is also used for coord_update and the rank-map allgather)
+    // dummy_comm is split separately and a zero-length MPI_Irecv is posted on it into dummy_req
+    /* dummy recv operation to ensure progress in atl_poll */
+    atl_mpi_req_t dummy_req;
+    MPI_Comm dummy_comm;
+    size_t idx; // index assigned when the endpoint is created (see atl_mpi::comm_split)
+    atl_proc_coord_t* coord; // non-owning; atl_mpi_comm::eps_update() points it at the communicator's coord
+} atl_mpi_ep_t;
+
+typedef struct atl_mpi_env_info { // result of one MPI info lookup (filled by atl_mpi::get_env_info)
+    int found; // flag written by MPI_Info_get; 0 when the key is not present
+    char key[MPI_MAX_INFO_KEY]; // copy of the key that was queried
+    char value[MPI_MAX_INFO_VAL]; // value text; left zero-filled when nothing was found
+    // A fresh object means "not found" with empty strings.
+    atl_mpi_env_info() {
+        found = 0;
+        memset(key, 0, MPI_MAX_INFO_KEY);
+        memset(value, 0, MPI_MAX_INFO_VAL);
+    }
+} atl_mpi_env_info_t;
+
+typedef struct atl_mpi_comm_info : atl_mpi_env_info_t { // info lookup result tagged with its communicator
+    MPI_Comm comm; // communicator whose info object was queried (set by atl_mpi::get_comm_info)
+    // Defaults to MPI_COMM_WORLD until a lookup overwrites it.
+    atl_mpi_comm_info() {
+        comm = MPI_COMM_WORLD;
+    }
+} atl_mpi_comm_info_t;
+
+class atl_mpi {
 public:
     atl_mpi() = default;
-    ~atl_mpi() override;
-
-    static atl_status_t atl_set_env(const atl_attr_t& attr);
-
-    atl_status_t atl_init(int* argc,
-                          char*** argv,
-                          atl_attr_t* attr,
-                          const char* main_addr,
-                          std::unique_ptr<ipmi>& pmi) override;
-
-    atl_status_t atl_update(std::unique_ptr<ipmi>& pmi) override;
-
-    atl_ep_t** atl_get_eps() override;
-
-    atl_proc_coord_t* atl_get_proc_coord() override;
-
-    atl_status_t atl_mr_reg(const void* buf, size_t len, atl_mr_t** mr) override;
-
-    atl_status_t atl_mr_dereg(atl_mr_t* mr) override;
-
-    atl_status_t atl_ep_send(atl_ep_t* ep,
-                             const void* buf,
-                             size_t len,
-                             int dst_proc_idx,
-                             uint64_t tag,
-                             atl_req_t* req) override;
-
-    atl_status_t atl_ep_recv(atl_ep_t* ep,
-                             void* buf,
-                             size_t len,
-                             int src_proc_idx,
-                             uint64_t tag,
-                             atl_req_t* req) override;
-
-    atl_status_t atl_ep_probe(atl_ep_t* ep,
-                              int src_proc_idx,
-                              uint64_t tag,
-                              int* found,
-                              size_t* recv_len) override;
-
-    atl_status_t atl_ep_allgatherv(atl_ep_t* ep,
-                                   const void* send_buf,
-                                   size_t send_len,
-                                   void* recv_buf,
-                                   const int* recv_lens,
-                                   const int* offsets,
-                                   atl_req_t* req) override;
-
-    atl_status_t atl_ep_allreduce(atl_ep_t* ep,
-                                  const void* send_buf,
-                                  void* recv_buf,
-                                  size_t len,
-                                  atl_datatype_t dtype,
-                                  atl_reduction_t op,
-                                  atl_req_t* req) override;
-
-    atl_status_t atl_ep_alltoall(atl_ep_t* ep,
-                                 const void* send_buf,
-                                 void* recv_buf,
-                                 int len,
-                                 atl_req_t* req) override;
-
-    atl_status_t atl_ep_alltoallv(atl_ep_t* ep,
-                                  const void* send_buf,
-                                  const int* send_lens,
-                                  const int* send_offsets,
-                                  void* recv_buf,
-                                  const int* recv_lens,
-                                  const int* recv_offsets,
-                                  atl_req_t* req) override;
-
-    atl_status_t atl_ep_barrier(atl_ep_t* ep, atl_req_t* req) override;
-
-    atl_status_t atl_ep_bcast(atl_ep_t* ep,
-                              void* buf,
-                              size_t len,
-                              int root,
-                              atl_req_t* req) override;
-
-    atl_status_t atl_ep_reduce(atl_ep_t* ep,
-                               const void* send_buf,
-                               void* recv_buf,
-                               size_t len,
-                               int root,
-                               atl_datatype_t dtype,
-                               atl_reduction_t op,
-                               atl_req_t* req) override;
-
-    atl_status_t atl_ep_reduce_scatter(atl_ep_t* ep,
-                                       const void* send_buf,
-                                       void* recv_buf,
-                                       size_t recv_len,
-                                       atl_datatype_t dtype,
-                                       atl_reduction_t op,
-                                       atl_req_t* req) override;
-
-    atl_status_t atl_ep_read(atl_ep_t* ep,
-                             void* buf,
-                             size_t len,
-                             atl_mr_t* mr,
-                             uint64_t addr,
-                             uintptr_t remote_key,
-                             int dst_proc_idx,
-                             atl_req_t* req) override;
-
-    atl_status_t atl_ep_write(atl_ep_t* ep,
-                              const void* buf,
-                              size_t len,
-                              atl_mr_t* mr,
-                              uint64_t addr,
-                              uintptr_t remote_key,
-                              int dst_proc_idx,
-                              atl_req_t* req) override;
-
-    atl_status_t atl_ep_wait(atl_ep_t* ep, atl_req_t* req) override;
-
-    atl_status_t atl_ep_wait_all(atl_ep_t* ep, atl_req_t* req, size_t count) override;
-
-    atl_status_t atl_ep_cancel(atl_ep_t* ep, atl_req_t* req) override;
-
-    atl_status_t atl_ep_poll(atl_ep_t* ep) override;
-
-    atl_status_t atl_ep_check(atl_ep_t* ep, int* is_completed, atl_req_t* req) override;
-
-    atl_status_t atl_finalize() override;
+    ~atl_mpi();
+    // init(): atl_mpi_comm::init_transport() calls this once with null argc/argv/main_addr.
+    atl_status_t init(int* argc,
+                      char*** argv,
+                      atl_attr_t* attr,
+                      const char* main_addr,
+                      std::shared_ptr<ipmi> pmi);
+
+    atl_status_t update(std::shared_ptr<ipmi> pmi);
+
+    atl_status_t mr_reg(const void* buf, size_t len, atl_mr_t** mr);
+
+    atl_status_t mr_dereg(atl_mr_t* mr);
+    // Operations below take the endpoint by reference; atl_mpi_comm forwards eps[ep_idx] to them.
+    atl_status_t send(atl_mpi_ep_t& ep,
+                      const void* buf,
+                      size_t len,
+                      int dst_proc_idx,
+                      uint64_t tag,
+                      atl_req_t* req);
+
+    atl_status_t recv(atl_mpi_ep_t& ep,
+                      void* buf,
+                      size_t len,
+                      int src_proc_idx,
+                      uint64_t tag,
+                      atl_req_t* req);
+
+    atl_status_t probe(atl_mpi_ep_t& ep,
+                       int src_proc_idx,
+                       uint64_t tag,
+                       int* found,
+                       size_t* recv_len);
+
+    atl_status_t allgatherv(atl_mpi_ep_t& ep,
+                            const void* send_buf,
+                            size_t send_len,
+                            void* recv_buf,
+                            const int* recv_lens,
+                            const int* offsets,
+                            atl_req_t* req);
+
+    atl_status_t allreduce(atl_mpi_ep_t& ep,
+                           const void* send_buf,
+                           void* recv_buf,
+                           size_t len,
+                           atl_datatype_t dtype,
+                           atl_reduction_t op,
+                           atl_req_t* req);
+
+    atl_status_t alltoall(atl_mpi_ep_t& ep,
+                          const void* send_buf,
+                          void* recv_buf,
+                          int len,
+                          atl_req_t* req);
+
+    atl_status_t alltoallv(atl_mpi_ep_t& ep,
+                           const void* send_buf,
+                           const int* send_lens,
+                           const int* send_offsets,
+                           void* recv_buf,
+                           const int* recv_lens,
+                           const int* recv_offsets,
+                           atl_req_t* req);
+
+    atl_status_t barrier(atl_mpi_ep_t& ep, atl_req_t* req);
+
+    atl_status_t bcast(atl_mpi_ep_t& ep, void* buf, size_t len, int root, atl_req_t* req);
+
+    atl_status_t reduce(atl_mpi_ep_t& ep,
+                        const void* send_buf,
+                        void* recv_buf,
+                        size_t len,
+                        int root,
+                        atl_datatype_t dtype,
+                        atl_reduction_t op,
+                        atl_req_t* req);
+
+    atl_status_t reduce_scatter(atl_mpi_ep_t& ep,
+                                const void* send_buf,
+                                void* recv_buf,
+                                size_t recv_len,
+                                atl_datatype_t dtype,
+                                atl_reduction_t op,
+                                atl_req_t* req);
+
+    atl_status_t read(atl_mpi_ep_t& ep,
+                      void* buf,
+                      size_t len,
+                      atl_mr_t* mr,
+                      uint64_t addr,
+                      uintptr_t remote_key,
+                      int dst_proc_idx,
+                      atl_req_t* req);
+
+    atl_status_t write(atl_mpi_ep_t& ep,
+                       const void* buf,
+                       size_t len,
+                       atl_mr_t* mr,
+                       uint64_t addr,
+                       uintptr_t remote_key,
+                       int dst_proc_idx,
+                       atl_req_t* req);
+
+    atl_status_t wait(atl_mpi_ep_t& ep, atl_req_t* req);
+
+    atl_status_t wait_all(atl_mpi_ep_t& ep, atl_req_t* req, size_t count);
+
+    atl_status_t cancel(atl_mpi_ep_t& ep, atl_req_t* req);
+
+    atl_status_t poll(atl_mpi_ep_t& ep);
+
+    atl_status_t check(atl_mpi_ep_t& ep, atl_req_t* req);
+    // comms_free(): used by atl_mpi_comm::finalize() and by the error path of comm_split().
+    void comms_free(std::vector<atl_mpi_ep_t>& eps);
+
+    atl_status_t finalize();
 
     int get_rank() {
-        return ctx->coord.global_idx;
+        return global_coord.global_idx;
     }
     int get_size() {
-        return ctx->coord.global_count;
+        return global_coord.global_count;
     }
-    bool is_inited() override {
+    bool is_inited() {
         return inited;
     }
 
+    static void set_env(const atl_attr_t& attr);
+    void coord_update(MPI_Comm base_comm, atl_proc_coord_t& coord); // fills `coord` for `base_comm`; callers read global_idx/global_count from it
+    atl_status_t ep_init(std::vector<atl_mpi_ep_t>& eps); // called by atl_mpi_comm::init_transport() for top-level communicators
+    atl_status_t comm_split(const std::vector<atl_mpi_ep_t>& base_eps,
+                            std::vector<atl_mpi_ep_t>& eps,
+                            size_t color);
+    // Info lookups: the result carries the queried key, the value text and a `found` flag.
+    static atl_mpi_env_info_t get_env_info(const char* key);
+    static atl_mpi_comm_info_t get_comm_info(MPI_Comm comm, const char* key);
+
 private:
-    atl_ctx_t* ctx = nullptr;
+    MPI_Datatype atl2mpi_dtype(atl_datatype_t dtype);
+    void init_req(atl_req_t* req);
+    inline atl_status_t ep_progress(atl_mpi_ep_t& ep, atl_mpi_req_t* req);
+    MPI_Op atl2mpi_op(atl_reduction_t rtype, MPI_Datatype dtype);
+    void check_comm_nic_idx(MPI_Comm comm, size_t expected_idx);
+    void check_comm_ep_idx(MPI_Comm comm, size_t expected_idx);
+    void check_comm_info(MPI_Comm comm, const char* key, const char* expected_value);
+    size_t get_ep_idx(size_t ep_idx);
+    // check_ep is a member function only when ENABLE_DEBUG is defined; otherwise it is an empty macro.
+#ifdef ENABLE_DEBUG
+    void check_ep(atl_mpi_ep_t& ep);
+#else
+#define check_ep(ep)
+#endif
+    // NOTE(review): the macro form is not scoped to this class; it affects every later use of check_ep(...) in including files.
     bool is_finalized{ false };
     bool inited{ false };
+    static atl_mpi_global_data global_data; // one instance shared by all atl_mpi objects
+    atl_progress_mode_t progress_mode; // no in-class initializer (unlike the two flags above)
+    bool sync_coll; // no in-class initializer
+    size_t ep_count; // no in-class initializer
+    atl_proc_coord_t global_coord; // source of get_rank()/get_size()
 };
-
 #endif // CCL_ENABLE_MPI
diff --git a/src/atl/mpi/atl_mpi_comm.cpp b/src/atl/mpi/atl_mpi_comm.cpp
new file mode 100644
index 000000000..a7ed97ad8
--- /dev/null
+++ b/src/atl/mpi/atl_mpi_comm.cpp
@@ -0,0 +1,127 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#ifdef CCL_ENABLE_MPI
+
+#include "atl/mpi/atl_mpi_comm.hpp"
+#include "exec/exec.hpp"
+
+std::atomic<size_t> atl_mpi_comm::comm_count{ 0 }; // incremented in init_transport(), decremented in the destructor
+atl_mpi* atl_mpi_comm::transport{ nullptr }; // created on demand in init_transport(), deleted when comm_count reaches 0
+
+atl_mpi_comm::~atl_mpi_comm() { // releases one communicator; the last one also deletes the shared transport
+    static std::mutex memory_mutex; // NOTE(review): a different object from the static mutex inside init_transport() — confirm teardown cannot race with creation
+    std::lock_guard<std::mutex> lock(memory_mutex);
+    tag.reset();
+    comm_count--;
+    if (comm_count.load() == 0) {
+        delete transport;
+        transport = nullptr; // so a later init_transport(true) allocates a fresh transport
+    }
+}
+
+atl_mpi_comm::atl_mpi_comm() { // top-level communicator: coordinates are taken from MPI_COMM_WORLD
+    init_transport(true); // true = create/initialize the shared transport if needed and build endpoints
+}
+
+atl_mpi_comm::atl_mpi_comm(std::shared_ptr<ikvs_wrapper> k) : atl_mpi_comm() { // same as the default constructor
+    (void)k; // the key-value store argument is not used here
+}
+
+atl_mpi_comm::atl_mpi_comm(int total_rank_count,
+                           const std::vector<int>& ranks,
+                           std::shared_ptr<ikvs_wrapper> k)
+        : atl_mpi_comm() { // delegates to the default constructor
+    (void)total_rank_count; // none of the three arguments is used here
+    (void)ranks;
+    (void)k;
+}
+
+atl_mpi_comm::atl_mpi_comm(std::vector<atl_mpi_ep_t>& parent_eps,
+                           int parent_rank,
+                           int parent_size,
+                           int color) { // builds a sub-communicator by splitting the parent's endpoints by `color`
+    this->parent_rank = parent_rank;
+    this->parent_size = parent_size;
+    atl_status_t split_status = transport->comm_split(parent_eps, eps, color); // a failed split can leave eps empty
+    CCL_THROW_IF_NOT(split_status == ATL_STATUS_SUCCESS, "failed to split ATL MPI communicator");
+    transport->coord_update(eps[0].mpi_comm, coord); // safe now: the split succeeded before eps[0] is used
+    rank = coord.global_idx;
+    size = coord.global_count;
+    init_transport(false); // false = reuse the existing transport, only run the common setup
+    rank2rank_map.resize(size);
+    MPI_Allgather(&parent_rank, 1, MPI_INT, rank2rank_map.data(), 1, MPI_INT, eps[0].mpi_comm); // slot i = parent_rank passed by rank i
+}
+
+void atl_mpi_comm::eps_update() { // make every endpoint refer to this communicator's coord
+    for (size_t ep_pos = 0; ep_pos < eps.size(); ++ep_pos) {
+        eps[ep_pos].coord = &coord;
+    }
+}
+
+std::shared_ptr<atl_base_comm> atl_mpi_comm::comm_split(int color) {
+    // plain `new` rather than make_shared: the split constructor is declared private
+    atl_mpi_comm* child = new atl_mpi_comm(eps, parent_rank, parent_size, color);
+    std::shared_ptr<atl_mpi_comm> owner(child);
+    return owner; // converts to the base-class pointer on return
+}
+
+void atl_mpi_comm::init_transport(bool is_new) { // is_new: true for top-level communicators, false for split ones
+    LOG_DEBUG("init ATL, requested ep_count ", attr.in.ep_count);
+    if (is_new) {
+        static std::mutex memory_mutex; // serializes creation/initialization of the shared transport
+        {
+            std::lock_guard<std::mutex> lock(memory_mutex);
+            if (!transport) {
+                transport = new atl_mpi();
+            }
+            if (!transport->is_inited()) {
+                CCL_THROW_IF_NOT(
+                    transport->init(nullptr, nullptr, &attr, nullptr, pmi) == ATL_STATUS_SUCCESS,
+                    "failed to initialize ATL");
+                // attributes are printed by world rank 0 only, right after the first successful init
+                int mpi_rank;
+                MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+                if (mpi_rank == 0) {
+                    print_atl_attrs();
+                }
+            }
+        }
+        // NOTE(review): the calls below run outside the lock and the ep_init() status is not checked
+        transport->ep_init(eps);
+        transport->coord_update(MPI_COMM_WORLD, coord);
+        parent_rank = rank = coord.global_idx;
+        parent_size = size = coord.global_count;
+        rank2rank_map.resize(size);
+        // a top-level communicator maps every rank to itself
+        for (int i = 0; i < size; i++) {
+            rank2rank_map[i] = i;
+        }
+    }
+    // everything from here on runs for both top-level and split communicators
+    threads_per_process = 1;
+    ranks_per_process = 1;
+
+    eps_update();
+    init_tag();
+
+    comm_count++; // matched by the decrement in the destructor
+
+    executor_update();
+}
+std::vector<int> atl_mpi_comm::get_rank2rank_map() { // returned by value, so the caller gets a copy
+    return rank2rank_map;
+}
+#endif //CCL_ENABLE_MPI
diff --git a/src/atl/mpi/atl_mpi_comm.hpp b/src/atl/mpi/atl_mpi_comm.hpp
new file mode 100644
index 000000000..564f4b0b8
--- /dev/null
+++ b/src/atl/mpi/atl_mpi_comm.hpp
@@ -0,0 +1,251 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#ifdef CCL_ENABLE_MPI
+
+#include <mpi.h>
+
+#include "atl/atl_base_comm.hpp"
+#include "atl/mpi/atl_mpi.hpp"
+
+class atl_mpi_comm : public atl_base_comm { // MPI-backed communicator; operations forward to the shared atl_mpi transport
+public:
+    ~atl_mpi_comm() override;
+    // The public constructors all end up in the default one; the kvs/rank arguments are ignored.
+    atl_mpi_comm();
+    atl_mpi_comm(std::shared_ptr<ikvs_wrapper> k);
+    atl_mpi_comm(int total_rank_count,
+                 const std::vector<int>& ranks,
+                 std::shared_ptr<ikvs_wrapper> k);
+    // Hooks that return ATL_STATUS_UNSUPPORTED have no MPI implementation in this class.
+    atl_status_t main_addr_reserve(char* main_addr) override {
+        return ATL_STATUS_UNSUPPORTED;
+    }
+    // Frees this communicator's endpoints; the shared transport itself is deleted in the destructor.
+    atl_status_t finalize() override {
+        transport->comms_free(eps);
+        return ATL_STATUS_SUCCESS;
+    }
+
+    atl_status_t update() override {
+        return ATL_STATUS_UNSUPPORTED;
+    }
+
+    atl_status_t wait_notification() override {
+        return ATL_STATUS_UNSUPPORTED;
+    }
+
+    atl_status_t set_resize_function(atl_resize_fn_t fn) override {
+        return ATL_STATUS_UNSUPPORTED;
+    }
+
+    atl_status_t mr_reg(const void* buf, size_t len, atl_mr_t** mr) override {
+        return transport->mr_reg(buf, len, mr);
+    }
+
+    atl_status_t mr_dereg(atl_mr_t* mr) override {
+        return transport->mr_dereg(mr);
+    }
+    // Data path: ep_idx selects the endpoint in eps; the index is used without a bounds check.
+    atl_status_t send(size_t ep_idx,
+                      const void* buf,
+                      size_t len,
+                      int dst_proc_idx,
+                      uint64_t tag,
+                      atl_req_t* req) override {
+        return transport->send(eps[ep_idx], buf, len, dst_proc_idx, tag, req);
+    }
+
+    atl_status_t recv(size_t ep_idx,
+                      void* buf,
+                      size_t len,
+                      int src_proc_idx,
+                      uint64_t tag,
+                      atl_req_t* req) override {
+        return transport->recv(eps[ep_idx], buf, len, src_proc_idx, tag, req);
+    }
+
+    atl_status_t probe(size_t ep_idx,
+                       int src_proc_idx,
+                       uint64_t tag,
+                       int* found,
+                       size_t* recv_len) override {
+        return transport->probe(eps[ep_idx], src_proc_idx, tag, found, recv_len);
+    }
+
+    atl_status_t allgatherv(size_t ep_idx,
+                            const void* send_buf,
+                            size_t send_len,
+                            void* recv_buf,
+                            const int* recv_lens,
+                            const int* offsets,
+                            atl_req_t* req) override {
+        return transport->allgatherv(
+            eps[ep_idx], send_buf, send_len, recv_buf, recv_lens, offsets, req);
+    }
+
+    atl_status_t allreduce(size_t ep_idx,
+                           const void* send_buf,
+                           void* recv_buf,
+                           size_t len,
+                           atl_datatype_t dtype,
+                           atl_reduction_t op,
+                           atl_req_t* req) override {
+        return transport->allreduce(eps[ep_idx], send_buf, recv_buf, len, dtype, op, req);
+    }
+
+    atl_status_t alltoall(size_t ep_idx,
+                          const void* send_buf,
+                          void* recv_buf,
+                          int len,
+                          atl_req_t* req) override {
+        return transport->alltoall(eps[ep_idx], send_buf, recv_buf, len, req);
+    }
+
+    atl_status_t alltoallv(size_t ep_idx,
+                           const void* send_buf,
+                           const int* send_lens,
+                           const int* send_offsets,
+                           void* recv_buf,
+                           const int* recv_lens,
+                           const int* recv_offsets,
+                           atl_req_t* req) override {
+        return transport->alltoallv(
+            eps[ep_idx], send_buf, send_lens, send_offsets, recv_buf, recv_lens, recv_offsets, req);
+    }
+
+    atl_status_t barrier(size_t ep_idx, atl_req_t* req) override {
+        return transport->barrier(eps[ep_idx], req);
+    }
+
+    atl_status_t bcast(size_t ep_idx, void* buf, size_t len, int root, atl_req_t* req) override {
+        return transport->bcast(eps[ep_idx], buf, len, root, req);
+    }
+
+    atl_status_t reduce(size_t ep_idx,
+                        const void* send_buf,
+                        void* recv_buf,
+                        size_t len,
+                        int root,
+                        atl_datatype_t dtype,
+                        atl_reduction_t op,
+                        atl_req_t* req) override {
+        return transport->reduce(eps[ep_idx], send_buf, recv_buf, len, root, dtype, op, req);
+    }
+
+    atl_status_t reduce_scatter(size_t ep_idx,
+                                const void* send_buf,
+                                void* recv_buf,
+                                size_t recv_len,
+                                atl_datatype_t dtype,
+                                atl_reduction_t op,
+                                atl_req_t* req) override {
+        return transport->reduce_scatter(eps[ep_idx], send_buf, recv_buf, recv_len, dtype, op, req);
+    }
+
+    atl_status_t read(size_t ep_idx,
+                      void* buf,
+                      size_t len,
+                      atl_mr_t* mr,
+                      uint64_t addr,
+                      uintptr_t remote_key,
+                      int dst_proc_idx,
+                      atl_req_t* req) override {
+        return transport->read(eps[ep_idx], buf, len, mr, addr, remote_key, dst_proc_idx, req);
+    }
+
+    atl_status_t write(size_t ep_idx,
+                       const void* buf,
+                       size_t len,
+                       atl_mr_t* mr,
+                       uint64_t addr,
+                       uintptr_t remote_key,
+                       int dst_proc_idx,
+                       atl_req_t* req) override {
+        return transport->write(eps[ep_idx], buf, len, mr, addr, remote_key, dst_proc_idx, req);
+    }
+
+    atl_status_t wait(size_t ep_idx, atl_req_t* req) override {
+        return transport->wait(eps[ep_idx], req);
+    }
+
+    atl_status_t wait_all(size_t ep_idx, atl_req_t* req, size_t count) override {
+        return transport->wait_all(eps[ep_idx], req, count);
+    }
+
+    atl_status_t cancel(size_t ep_idx, atl_req_t* req) override {
+        return transport->cancel(eps[ep_idx], req);
+    }
+
+    atl_status_t poll(size_t ep_idx) override {
+        return transport->poll(eps[ep_idx]);
+    }
+
+    atl_status_t check(size_t ep_idx, atl_req_t* req) override {
+        return transport->check(eps[ep_idx], req);
+    }
+    // Simple accessors over values that init_transport() and the split constructor set.
+    size_t get_threads_per_process() override {
+        return threads_per_process;
+    }
+
+    size_t get_ranks_per_process() override {
+        return ranks_per_process;
+    }
+
+    int get_rank() override {
+        return rank;
+    }
+
+    int get_size() override {
+        return size;
+    }
+
+    int get_r2r_color() override {
+        return coord.local_idx;
+    }
+
+    int get_host_color() override {
+        return coord.hostname_hash;
+    }
+
+    /*
+     * TODO: temporary stub that always returns 0.
+     * A proper unique communicator id still has to be defined.
+     */
+    size_t get_id() override {
+        return 0;
+    }
+
+    std::shared_ptr<atl_base_comm> comm_split(int color) override;
+
+    std::vector<int> get_rank2rank_map() override;
+
+private:
+    atl_mpi_comm(std::vector<atl_mpi_ep_t>& parent_eps,
+                 int parent_rank,
+                 int parent_size,
+                 int color); // used only by comm_split() to build the child communicator
+    void eps_update(); // points every endpoint's coord at this communicator's coord
+    std::vector<atl_mpi_ep_t> eps; // endpoints owned by this communicator
+    static atl_mpi* transport; // shared by all instances; deleted when comm_count drops to 0
+    static std::atomic<size_t> comm_count; // number of communicators that completed init_transport()
+
+    void init_transport(bool is_new);
+};
+
+#endif //CCL_ENABLE_MPI
diff --git a/src/atl/mpi/atl_mpi_global_data.cpp b/src/atl/mpi/atl_mpi_global_data.cpp
new file mode 100644
index 000000000..d45da0028
--- /dev/null
+++ b/src/atl/mpi/atl_mpi_global_data.cpp
@@ -0,0 +1,700 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#ifdef CCL_ENABLE_MPI
+
+#include "atl/mpi/atl_mpi.hpp"
+#include "atl/mpi/atl_mpi_global_data.hpp"
+#include "common/global/global.hpp"
+#include "common/log/log.hpp"
+
+void check_op_params(void* in_buf, // raises through CCL_THROW_IF_NOT when in_buf, inout_buf or length is null
+                     void* inout_buf,
+                     int* length,
+                     MPI_Datatype* datatype,
+                     const char* caller_func_name) {
+    (void)datatype; // accepted only so callers can pass their full argument list; not checked
+    CCL_THROW_IF_NOT(in_buf && inout_buf && length,
+                     caller_func_name, // the message starts with the calling function's name
+                     " requested, bad arguments: ",
+                     in_buf,
+                     " ",
+                     inout_buf,
+                     " ",
+                     length); // the pointer itself is reported, not the pointed-to count
+}
+
+#ifdef ATL_MPI_FP16
+// Shared worker: views both buffers as unsigned short and passes them, *length and op to ccl_fp16_reduce_impl.
+void FP16_INLINE_TARGET_ATTRIBUTE_ALL fp16_base_op(void* in,
+                                                   void* inout,
+                                                   int* length,
+                                                   ccl::reduction op) {
+    unsigned short* in_buf = (unsigned short*)in;
+    unsigned short* inout_buf = (unsigned short*)inout;
+    // *length is widened from int to size_t; callers have already checked the pointer for null
+    size_t len = *length;
+    ccl_fp16_reduce_impl(in_buf, inout_buf, len, op);
+}
+// The four callbacks below use the (in, inout, length, datatype) shape of an MPI user reduction function — presumably registered via MPI_Op_create elsewhere; each checks its pointers and forwards with a fixed reduction.
+void FP16_TARGET_ATTRIBUTE_ALL fp16_sum_op(void* in,
+                                           void* inout,
+                                           int* length,
+                                           MPI_Datatype* datatype) {
+    check_op_params(in, inout, length, datatype, __FUNCTION__);
+    fp16_base_op(in, inout, length, ccl::reduction::sum);
+}
+// Same as above with ccl::reduction::prod.
+void FP16_TARGET_ATTRIBUTE_ALL fp16_prod_op(void* in,
+                                            void* inout,
+                                            int* length,
+                                            MPI_Datatype* datatype) {
+    check_op_params(in, inout, length, datatype, __FUNCTION__);
+    fp16_base_op(in, inout, length, ccl::reduction::prod);
+}
+// Same as above with ccl::reduction::min.
+void FP16_TARGET_ATTRIBUTE_ALL fp16_min_op(void* in,
+                                           void* inout,
+                                           int* length,
+                                           MPI_Datatype* datatype) {
+    check_op_params(in, inout, length, datatype, __FUNCTION__);
+    fp16_base_op(in, inout, length, ccl::reduction::min);
+}
+// Same as above with ccl::reduction::max.
+void FP16_TARGET_ATTRIBUTE_ALL fp16_max_op(void* in,
+                                           void* inout,
+                                           int* length,
+                                           MPI_Datatype* datatype) {
+    check_op_params(in, inout, length, datatype, __FUNCTION__);
+    fp16_base_op(in, inout, length, ccl::reduction::max);
+}
+#endif // ATL_MPI_FP16
+
+#ifdef ATL_MPI_BF16
+// Shared worker: views both buffers as unsigned short and passes them, *length and op to ccl_bf16_reduce_impl.
+void BF16_INLINE_TARGET_ATTRIBUTE_ALL bf16_base_op(void* in,
+                                                   void* inout,
+                                                   int* length,
+                                                   ccl::reduction op) {
+    unsigned short* in_buf = (unsigned short*)in;
+    unsigned short* inout_buf = (unsigned short*)inout;
+    // *length is widened from int to size_t; callers have already checked the pointer for null
+    size_t len = *length;
+    ccl_bf16_reduce_impl(in_buf, inout_buf, len, op);
+}
+// The four callbacks below use the (in, inout, length, datatype) shape of an MPI user reduction function — presumably registered via MPI_Op_create elsewhere; each checks its pointers and forwards with a fixed reduction.
+void BF16_TARGET_ATTRIBUTE_ALL bf16_sum_op(void* in,
+                                           void* inout,
+                                           int* length,
+                                           MPI_Datatype* datatype) {
+    check_op_params(in, inout, length, datatype, __FUNCTION__);
+    bf16_base_op(in, inout, length, ccl::reduction::sum);
+}
+// Same as above with ccl::reduction::prod.
+void BF16_TARGET_ATTRIBUTE_ALL bf16_prod_op(void* in,
+                                            void* inout,
+                                            int* length,
+                                            MPI_Datatype* datatype) {
+    check_op_params(in, inout, length, datatype, __FUNCTION__);
+    bf16_base_op(in, inout, length, ccl::reduction::prod);
+}
+// Same as above with ccl::reduction::min.
+void BF16_TARGET_ATTRIBUTE_ALL bf16_min_op(void* in,
+                                           void* inout,
+                                           int* length,
+                                           MPI_Datatype* datatype) {
+    check_op_params(in, inout, length, datatype, __FUNCTION__);
+    bf16_base_op(in, inout, length, ccl::reduction::min);
+}
+// Same as above with ccl::reduction::max.
+void BF16_TARGET_ATTRIBUTE_ALL bf16_max_op(void* in,
+                                           void* inout,
+                                           int* length,
+                                           MPI_Datatype* datatype) {
+    check_op_params(in, inout, length, datatype, __FUNCTION__);
+    bf16_base_op(in, inout, length, ccl::reduction::max);
+}
+#endif // ATL_MPI_BF16
+
+void atl_mpi_global_data::print_error(int error) {
+    char str_error[MPI_MAX_ERROR_STRING];
+    int result_len = MPI_MAX_ERROR_STRING;
+
+    MPI_Error_string(error, str_error, &result_len);
+
+    if (result_len > MPI_MAX_ERROR_STRING) {
+        result_len = MPI_MAX_ERROR_STRING;
+    }
+    str_error[result_len - 1] = '\0';
+
+    ccl_logger::format(std::cout, "MPI error: %s (%d)", str_error, error);
+}
+
+atl_status_t atl_mpi_global_data::set_impi_env(const atl_attr_t& attr, // sets I_MPI_* environment defaults from attr/lib_attr
+                                               const atl_mpi_lib_attr_t& lib_attr) {
+    char ep_count_str[MPI_MAX_INFO_VAL] = { 0 };
+    snprintf(ep_count_str, MPI_MAX_INFO_VAL, "%zu", get_ep_count(attr));
+    // Every setenv below passes overwrite=0, so values already present in the environment are kept.
+    if (attr.in.ep_count)
+        setenv("I_MPI_OFI_ISEND_INJECT_THRESHOLD", "0", 0);
+
+#ifdef CCL_ENABLE_SYCL
+    setenv("I_MPI_SHM_CMA", "0", 0);
+    if (attr.in.enable_hmem && lib_attr.hmem) {
+        setenv("I_MPI_OFFLOAD", "2", 0);
+        setenv("I_MPI_OFFLOAD_TOPOLIB", "l0", 0);
+        setenv("I_MPI_OFFLOAD_QUEUE_CACHE", "1", 0);
+        setenv("I_MPI_OFFLOAD_LIST_CACHE", "1", 0);
+        setenv("I_MPI_OFFLOAD_MEMCPY_KIND", "blocked", 0);
+        if (attr.in.ep_count > 1) {
+            /* try to set global lock level before vci level
+               because setenv is invoked with overwrite=0 */
+            setenv("I_MPI_THREAD_LOCK_LEVEL", "global", 0);
+        }
+    }
+#endif // CCL_ENABLE_SYCL
+    // Threading defaults; I_MPI_THREAD_MAX receives the endpoint count formatted above.
+    setenv("I_MPI_THREAD_SPLIT", "1", 0);
+    setenv("I_MPI_THREAD_RUNTIME", "generic", 0);
+    setenv("I_MPI_THREAD_MAX", ep_count_str, 0);
+    setenv("I_MPI_THREAD_ID_KEY", EP_IDX_KEY, 0);
+    setenv("I_MPI_THREAD_LOCK_LEVEL", "vci", 0); // no effect if "global" was set above or by the user
+    // The setenv() return values are not inspected, so this always reports success.
+    return ATL_STATUS_SUCCESS;
+}
+
+size_t atl_mpi_global_data::get_ep_count(const atl_attr_t& attr) {
+    /* endpoint total: ep_count plus enable_extra_ep (adding zero replaces the explicit branch) */
+    size_t total_ep_count = attr.in.ep_count;
+    total_ep_count += attr.in.enable_extra_ep;
+    return total_ep_count;
+}
+
+atl_mpi_global_data::atl_mpi_lib_attr_t atl_mpi_global_data::get_lib_attr() {
+    atl_mpi_lib_attr_t lib_attr = { ATL_MPI_LIB_NONE, 0 };
+    /* detects the loaded MPI library from its version string; { NONE, 0 } means "not recognized" */
+    char mpi_version[MPI_MAX_LIBRARY_VERSION_STRING] = { 0 };
+    int mpi_version_len = -1, i;
+    const atl_mpi_lib_info_t* final_info = NULL;
+
+    /* can be called before MPI_Init */
+    int ret = MPI_Get_library_version(mpi_version, &mpi_version_len);
+
+    if ((ret != MPI_SUCCESS) || (mpi_version_len < 0) ||
+        (mpi_version_len > MPI_MAX_LIBRARY_VERSION_STRING)) {
+        LOG_WARN("can not retrieve MPI version, mpi_version_len ", mpi_version_len, ", ret ", ret);
+        return lib_attr;
+    }
+
+    /* remove trailing spaces for a more compact log; isspace() needs an unsigned char value */
+    while (strlen(mpi_version) && isspace((unsigned char)mpi_version[strlen(mpi_version) - 1]))
+        mpi_version[strlen(mpi_version) - 1] = '\0';
+
+    LOG_DEBUG("MPI version: ", mpi_version);
+
+    /* optional filter: CCL_ATL_MPI restricts detection to the table row with this name */
+    char* lib_type_env = getenv("CCL_ATL_MPI");
+
+    for (i = 0; i < MPI_LIB_INFO_MAX_COUNT; i++) {
+        const atl_mpi_lib_info_t* info = &(mpi_lib_infos[i]);
+
+        if (info->type == ATL_MPI_LIB_NONE)
+            continue;
+
+        if (lib_type_env) {
+            if (strcmp(lib_type_env, info->name)) {
+                LOG_DEBUG("library ", info->name, " is filtered out by user input ", lib_type_env);
+                continue;
+            }
+            else {
+                LOG_DEBUG("use lib_type = ", lib_type_env, " because it is requested explicitly");
+            }
+        }
+
+        CCL_THROW_IF_NOT(info->version_prefix_1, "empty version_prefix_1");
+        CCL_THROW_IF_NOT(info->min_version_value >= 0, "unexpected minimal version");
+
+        const char* version_substr = NULL;
+        if ((version_substr = strstr(mpi_version, info->version_prefix_1))) {
+            version_substr += strlen(info->version_prefix_1);
+            LOG_DEBUG("version_substr: ", version_substr);
+
+            if (info->version_prefix_2) {
+                version_substr = strstr(version_substr, info->version_prefix_2);
+                if (!version_substr) {
+                    LOG_DEBUG("can't find version_prefix_2 ", info->version_prefix_2);
+                    continue;
+                }
+                version_substr += strlen(info->version_prefix_2);
+                LOG_DEBUG("version_substr: ", version_substr);
+            }
+
+            int version_value = (version_substr) ? atoi(version_substr) : -1;
+            LOG_DEBUG("MPI numerical version: ", version_value);
+
+            if (version_value < info->min_version_value) {
+                LOG_WARN("loaded MPI doesn't match with expected version, "
+                         "consider to switch to ",
+                         info->version_prefix_1,
+                         " ",
+                         (info->version_prefix_2 ? info->version_prefix_2 : ""),
+                         info->min_version_value,
+                         " (min) ",
+                         (info->kind_value ? info->kind_value : ""),
+                         "\n");
+                continue;
+            }
+
+            if (info->kind_prefix && info->kind_value) {
+                const char* kind_substr = mpi_version;
+
+                if ((kind_substr = strstr(kind_substr, info->kind_prefix))) {
+                    kind_substr += strlen(info->kind_prefix);
+                    while ((kind_substr < (mpi_version + mpi_version_len)) &&
+                           (isspace((unsigned char)*kind_substr)))
+                        kind_substr++;
+
+                    LOG_DEBUG("kind_substr: ", kind_substr);
+
+                    if (strncmp(kind_substr, info->kind_value, strlen(info->kind_value))) {
+                        LOG_WARN("loaded MPI version (",
+                                 version_value,
+                                 ") ",
+                                 "is higher or equal to minimal expected version (",
+                                 info->min_version_value,
+                                 ") ",
+                                 "but kind (",
+                                 kind_substr,
+                                 ") doesn't match with expected kind (",
+                                 info->kind_value,
+                                 "), "
+                                 "consider to switch to ",
+                                 info->version_prefix_1,
+                                 " ",
+                                 (info->version_prefix_2 ? info->version_prefix_2 : ""),
+                                 info->min_version_value,
+                                 " (min version) ",
+                                 (info->kind_value ? info->kind_value : ""),
+                                 "\n");
+                    }
+                }
+                else {
+                    LOG_DEBUG("MPI version is high enough, but kind_prefix (",
+                              info->kind_prefix,
+                              ") can not be found",
+                              " treat this like expected kind (",
+                              info->kind_value,
+                              ") was found");
+                }
+            }
+
+            final_info = info;
+            LOG_DEBUG("set lib_type = ",
+                      info->name,
+                      " because "
+                      "version (",
+                      version_value,
+                      ") is higher or equal to minimal expected version (",
+                      info->min_version_value,
+                      ")");
+            /* hmem needs version >= min_hmem_version_value; a negative minimum keeps hmem off, as before */
+            lib_attr.type = final_info->type;
+            lib_attr.hmem = ((final_info->min_hmem_version_value >= 0) &&
+                             (version_value >= final_info->min_hmem_version_value)) ? 1 : 0;
+            break;
+        }
+    }
+
+    if (final_info) {
+        LOG_DEBUG("MPI library type: ", final_info->name);
+    }
+    else {
+        LOG_DEBUG("MPI library type: none");
+    }
+
+    return lib_attr;
+}
+
+int atl_mpi_global_data::bf16_init() {
+    if (ccl::global_data::env().bf16_impl_type <= ccl_bf16_no_hardware_support) {
+        return ATL_STATUS_SUCCESS;
+    }
+    // past this point: create bf16.dtype and the four custom reduction ops (only if ATL_MPI_BF16)
+#ifdef ATL_MPI_BF16
+    // on failure the objects created so far are kept; update_global_data() then calls bf16_finalize()
+    int ret = MPI_SUCCESS;
+    // create custom MPI BF16 dtype (two contiguous MPI_BYTE elements)
+    ret = MPI_Type_contiguous(2, MPI_BYTE, &bf16.dtype);
+    if (ret != MPI_SUCCESS) {
+        LOG_ERROR("cannot create MPI BF16 dtype");
+        print_error(ret);
+        return ATL_STATUS_FAILURE;
+    }
+
+    ret = MPI_Type_commit(&bf16.dtype);
+    if (ret != MPI_SUCCESS) {
+        LOG_ERROR("cannot commit MPI BF16 type");
+        print_error(ret);
+        return ATL_STATUS_FAILURE;
+    }
+
+    // create custom MPI BF16 summation op (second argument 1 = commutative)
+    ret = MPI_Op_create(&bf16_sum_op, 1, &bf16.sum_op);
+    if (ret != MPI_SUCCESS) {
+        LOG_ERROR("cannot create MPI BF16 sum op");
+        print_error(ret);
+        return ATL_STATUS_FAILURE;
+    }
+
+    // create custom MPI BF16 product op
+    ret = MPI_Op_create(&bf16_prod_op, 1, &bf16.prod_op);
+    if (ret != MPI_SUCCESS) {
+        LOG_ERROR("cannot create MPI BF16 prod op");
+        print_error(ret);
+        return ATL_STATUS_FAILURE;
+    }
+
+    // create custom MPI BF16 min op
+    ret = MPI_Op_create(&bf16_min_op, 1, &bf16.min_op);
+    if (ret != MPI_SUCCESS) {
+        LOG_ERROR("cannot create MPI BF16 min op");
+        print_error(ret);
+        return ATL_STATUS_FAILURE;
+    }
+
+    // create custom MPI BF16 max op
+    ret = MPI_Op_create(&bf16_max_op, 1, &bf16.max_op);
+    if (ret != MPI_SUCCESS) {
+        LOG_ERROR("cannot create MPI BF16 max op");
+        print_error(ret);
+        return ATL_STATUS_FAILURE;
+    }
+
+#endif // ATL_MPI_BF16
+    // the int return value carries ATL_STATUS_SUCCESS / ATL_STATUS_FAILURE
+    return ATL_STATUS_SUCCESS;
+}
+
+void atl_mpi_global_data::bf16_finalize() { // frees the custom BF16 datatype and reduction ops
+    if (bf16.dtype != MPI_DATATYPE_NULL) {
+        MPI_Type_free(&bf16.dtype);
+    }
+    // handles still equal to their NULL constant (the constructor default) are skipped
+    if (bf16.sum_op != MPI_OP_NULL) {
+        MPI_Op_free(&bf16.sum_op);
+    }
+
+    if (bf16.prod_op != MPI_OP_NULL) {
+        MPI_Op_free(&bf16.prod_op);
+    }
+
+    if (bf16.min_op != MPI_OP_NULL) {
+        MPI_Op_free(&bf16.min_op);
+    }
+
+    if (bf16.max_op != MPI_OP_NULL) {
+        MPI_Op_free(&bf16.max_op);
+    }
+}
+
+int atl_mpi_global_data::fp16_init() {
+    if (ccl::global_data::env().fp16_impl_type <= ccl_fp16_no_hardware_support) {
+        return ATL_STATUS_SUCCESS;
+    }
+    // past this point: create fp16.dtype and the four custom reduction ops (only if ATL_MPI_FP16)
+#ifdef ATL_MPI_FP16
+    // on failure the objects created so far are kept; update_global_data() then calls fp16_finalize()
+    int ret = MPI_SUCCESS;
+
+    // create custom MPI FP16 dtype (two contiguous MPI_BYTE elements)
+    ret = MPI_Type_contiguous(2, MPI_BYTE, &fp16.dtype);
+    if (ret != MPI_SUCCESS) {
+        LOG_ERROR("cannot create MPI FP16 dtype");
+        print_error(ret);
+        return ATL_STATUS_FAILURE;
+    }
+
+    ret = MPI_Type_commit(&fp16.dtype);
+    if (ret != MPI_SUCCESS) {
+        LOG_ERROR("cannot commit MPI FP16 type");
+        print_error(ret);
+        return ATL_STATUS_FAILURE;
+    }
+
+    // create custom MPI FP16 summation op (second argument 1 = commutative)
+    ret = MPI_Op_create(&fp16_sum_op, 1, &fp16.sum_op);
+    if (ret != MPI_SUCCESS) {
+        LOG_ERROR("cannot create MPI FP16 sum op");
+        print_error(ret);
+        return ATL_STATUS_FAILURE;
+    }
+
+    // create custom MPI FP16 product op
+    ret = MPI_Op_create(&fp16_prod_op, 1, &fp16.prod_op);
+    if (ret != MPI_SUCCESS) {
+        LOG_ERROR("cannot create MPI FP16 prod op");
+        print_error(ret);
+        return ATL_STATUS_FAILURE;
+    }
+
+    // create custom MPI FP16 min op
+    ret = MPI_Op_create(&fp16_min_op, 1, &fp16.min_op);
+    if (ret != MPI_SUCCESS) {
+        LOG_ERROR("cannot create MPI FP16 min op");
+        print_error(ret);
+        return ATL_STATUS_FAILURE;
+    }
+
+    // create custom MPI FP16 max op
+    ret = MPI_Op_create(&fp16_max_op, 1, &fp16.max_op);
+    if (ret != MPI_SUCCESS) {
+        LOG_ERROR("cannot create MPI FP16 max op");
+        print_error(ret);
+        return ATL_STATUS_FAILURE;
+    }
+
+#endif // ATL_MPI_FP16
+    // the int return value carries ATL_STATUS_SUCCESS / ATL_STATUS_FAILURE
+    return ATL_STATUS_SUCCESS;
+}
+
+void atl_mpi_global_data::fp16_finalize() { // frees the custom FP16 datatype and reduction ops
+    if (fp16.dtype != MPI_DATATYPE_NULL) {
+        MPI_Type_free(&fp16.dtype);
+    }
+    // handles still equal to their NULL constant (the constructor default) are skipped
+    if (fp16.sum_op != MPI_OP_NULL) {
+        MPI_Op_free(&fp16.sum_op);
+    }
+
+    if (fp16.prod_op != MPI_OP_NULL) {
+        MPI_Op_free(&fp16.prod_op);
+    }
+
+    if (fp16.min_op != MPI_OP_NULL) {
+        MPI_Op_free(&fp16.min_op);
+    }
+
+    if (fp16.max_op != MPI_OP_NULL) {
+        MPI_Op_free(&fp16.max_op);
+    }
+}
+
+atl_status_t atl_mpi_global_data::check_impi_env(const atl_attr_t& attr) {
+    /* I_MPI_THREAD_MAX must be present and equal to the endpoint count */
+    const char* thread_max_env = getenv("I_MPI_THREAD_MAX");
+    const bool ep_count_ok = thread_max_env && (atoi(thread_max_env) == (int)(get_ep_count(attr)));
+    if (!ep_count_ok) {
+        return ATL_STATUS_FAILURE;
+    }
+    if (getenv("I_MPI_ROOT")) {
+        return ATL_STATUS_SUCCESS;
+    }
+    /* I_MPI_ROOT is missing: report which Intel MPI flavour the user should source */
+    const atl_mpi_lib_info_t& impi_info = mpi_lib_infos[ATL_MPI_LIB_IMPI];
+    LOG_ERROR("CCL/MPI uses ",
+              impi_info.version_prefix_1,
+              " but I_MPI_ROOT is not set. ",
+              "Please source ",
+              impi_info.kind_value,
+              " version of ",
+              impi_info.version_prefix_1,
+              " (",
+              impi_info.min_version_value,
+              " or higher version).");
+    return ATL_STATUS_FAILURE;
+}
+
+atl_status_t atl_mpi_global_data::update_global_data(atl_attr_t* attr) {
+    /* refreshes library, endpoint and multi-NIC state from attr, then creates the BF16/FP16 MPI objects */
+    if (mpi_lib_attr.type == ATL_MPI_LIB_NONE)
+        mpi_lib_attr = get_lib_attr();
+    extra_ep = attr->in.enable_extra_ep;
+
+    mnic_type = attr->in.mnic_type;
+    if (mpi_lib_attr.type != ATL_MPI_LIB_MPICH) {
+        /* only MPICH supports multi-NIC */
+        mnic_type = ATL_MNIC_NONE;
+    }
+
+    if (mnic_type == ATL_MNIC_LOCAL) {
+        mnic_count = get_nic_count(LOCAL_NIC_COUNT_KEY);
+    }
+    else if (mnic_type == ATL_MNIC_GLOBAL) {
+        mnic_count = get_nic_count(GLOBAL_NIC_COUNT_KEY);
+    }
+    else if (mnic_type == ATL_MNIC_NONE) {
+        mnic_count = 1;
+    }
+    mnic_count = std::min(mnic_count, attr->in.mnic_count);
+    mnic_count = std::max(mnic_count, (size_t)(1));
+    mnic_offset = attr->in.mnic_offset;
+    /* the custom 16-bit objects are all-or-nothing: a failure releases whatever was already created */
+    if (bf16_init() == ATL_STATUS_FAILURE) {
+        bf16_finalize();
+        return ATL_STATUS_FAILURE;
+    }
+    if (fp16_init() == ATL_STATUS_FAILURE) {
+        fp16_finalize();
+        bf16_finalize();
+        return ATL_STATUS_FAILURE;
+    }
+    return ATL_STATUS_SUCCESS;
+}
+
+atl_status_t atl_mpi_global_data::set_mpich_env(const atl_attr_t& attr) {
+    char ep_count_str[MPI_MAX_INFO_VAL] = { 0 };
+    snprintf(ep_count_str, MPI_MAX_INFO_VAL, "%zu", get_ep_count(attr));
+    /* exports MPICH variables; overwrite=0 keeps any value that is already set */
+    setenv("MPIR_CVAR_CH4_MT_MODEL", "direct", 0);
+    setenv("MPIR_CVAR_CH4_NUM_VCIS", ep_count_str, 0);
+    setenv("MPIR_CVAR_CH4_OFI_MAX_VCIS", ep_count_str, 0);
+    setenv("MPIR_COMM_HINT_VCI", EP_IDX_KEY, 0);
+    /* the MPICH debug variables are exported only at debug log level or above */
+    auto& env = ccl::global_data::env();
+    if (env.log_level >= ccl_log_level::debug) {
+        setenv("MPIR_CVAR_CH4_RUNTIME_CONF_DEBUG", "1", 0);
+        setenv("MPIR_CVAR_CH4_OFI_CAPABILITY_SETS_DEBUG", "1", 0);
+        setenv("MPIR_CVAR_DEBUG_SUMMARY", "1", 0);
+    }
+    /* variables with the FI_PSM2_ / HFI_ prefixes */
+    setenv("FI_PSM2_DELAY", "0", 0);
+    setenv("FI_PSM2_TIMEOUT", "0", 0);
+    setenv("FI_PSM2_NAME_SERVER", "0", 0);
+    setenv("HFI_NO_CPUAFFINITY", "1", 0);
+    /* always reports success: the setenv() results are not checked */
+    return ATL_STATUS_SUCCESS;
+}
+
+/* set these knobs without detection of MPI library type */
+atl_status_t atl_mpi_global_data::set_base_env(const atl_attr_t& attr) {
+    setenv("PSM2_MULTI_EP", "1", 0);
+    setenv("FI_OFI_RXM_USE_HASH", "0", 0);
+    /* attr is not read in this function; overwrite=0 keeps values that are already set */
+#ifdef CCL_ENABLE_SYCL
+    setenv("FI_SHM_DISABLE_CMA", "1", 0);
+#endif // CCL_ENABLE_SYCL
+    /* default thread level requested through the MPIR_CVAR variable */
+    setenv("MPIR_CVAR_DEFAULT_THREAD_LEVEL", "MPI_THREAD_MULTIPLE", 0);
+
+    /* ask Intel MPI to append the library kind to the MPI_Get_library_version output */
+    setenv("I_MPI_INFO_LIBRARY_KIND", "1", 0);
+    /* always reports success: the setenv() results are not checked */
+    return ATL_STATUS_SUCCESS;
+}
+
+atl_status_t atl_mpi_global_data::set_env(const atl_attr_t& attr) {
+    if (mpi_lib_attr.type != ATL_MPI_LIB_NONE) {
+        /* library type was already detected and env was set, make sanity check */
+        if (mpi_lib_attr.type == ATL_MPI_LIB_IMPI) {
+            return check_impi_env(attr);
+        }
+        else if (mpi_lib_attr.type == ATL_MPI_LIB_MPICH) {
+            return check_mpich_env(attr);
+        }
+        return ATL_STATUS_SUCCESS;
+    }
+    /* first call: export the library-independent variables, then detect the library */
+    set_base_env(attr);
+
+    mpi_lib_attr = get_lib_attr();
+    /* unrecognized library: there is nothing library-specific to export */
+    if (mpi_lib_attr.type == ATL_MPI_LIB_NONE) {
+        return ATL_STATUS_SUCCESS;
+    }
+    /* NOTE(review): results of set_*_env/check_*_env are ignored on this path - confirm this is intended */
+    if (mpi_lib_attr.type == ATL_MPI_LIB_IMPI) {
+        set_impi_env(attr, mpi_lib_attr);
+        check_impi_env(attr);
+    }
+    else if (mpi_lib_attr.type == ATL_MPI_LIB_MPICH) {
+        set_mpich_env(attr);
+        check_mpich_env(attr);
+    }
+    /* only reports whether MPI was already initialized; the return value does not depend on it */
+    int is_mpi_inited = 0;
+    MPI_Initialized(&is_mpi_inited);
+    if (is_mpi_inited) {
+        LOG_WARN("MPI was initialized externally, CCL-MPI specific environment is ignored");
+    }
+    else {
+        LOG_DEBUG("set CCL-MPI specific environment");
+    }
+
+    return ATL_STATUS_SUCCESS;
+}
+
+atl_status_t atl_mpi_global_data::check_mpich_env(const atl_attr_t& attr) {
+    /* MPIR_CVAR_CH4_OFI_MAX_VCIS must be present and equal to the endpoint count */
+    const char* max_vcis_env = getenv("MPIR_CVAR_CH4_OFI_MAX_VCIS");
+    if (max_vcis_env && (atoi(max_vcis_env) == (int)(get_ep_count(attr)))) {
+        return ATL_STATUS_SUCCESS;
+    }
+    return ATL_STATUS_FAILURE;
+}
+
+#ifdef ATL_MPI_BF16
+MPI_Op atl_mpi_global_data::atl2mpi_op_bf16(atl_reduction_t rtype) {
+    /* pick the custom BF16 MPI op for the reduction kind; unknown kinds terminate the process */
+    if (rtype == ATL_REDUCTION_SUM) return bf16.sum_op;
+    if (rtype == ATL_REDUCTION_PROD) return bf16.prod_op;
+    if (rtype == ATL_REDUCTION_MIN) return bf16.min_op;
+    if (rtype == ATL_REDUCTION_MAX) return bf16.max_op;
+    printf("unknown reduction type: %d\n", rtype);
+    exit(1);
+}
+#endif // ATL_MPI_BF16
+
+#ifdef ATL_MPI_FP16
+MPI_Op atl_mpi_global_data::atl2mpi_op_fp16(atl_reduction_t rtype) {
+    /* pick the custom FP16 MPI op for the reduction kind; unknown kinds terminate the process */
+    if (rtype == ATL_REDUCTION_SUM) return fp16.sum_op;
+    if (rtype == ATL_REDUCTION_PROD) return fp16.prod_op;
+    if (rtype == ATL_REDUCTION_MIN) return fp16.min_op;
+    if (rtype == ATL_REDUCTION_MAX) return fp16.max_op;
+    printf("unknown reduction type: %d\n", rtype);
+    exit(1);
+}
+#endif // ATL_MPI_FP16
+
+void atl_mpi_global_data::print_log_info() {
+    if (ctx_count != 1) // the summary is logged only while ctx_count equals 1
+        return;
+    LOG_INFO("atl-mpi-global:")
+    LOG_INFO("  is_external_init: ", is_external_init);
+    LOG_INFO("  mpi_lib_attr.type: ", mpi_lib_infos[mpi_lib_attr.type].name);
+    LOG_INFO("  mpi_lib_attr.hmem: ", mpi_lib_attr.hmem);
+    LOG_INFO("  extra_ep: ", extra_ep);
+    LOG_INFO("  mnic_type: ", to_string(mnic_type));
+    if (mnic_type == ATL_MNIC_NONE) // count and offset are logged only when multi-NIC is active
+        return;
+    LOG_INFO("  mnic_count: ", mnic_count);
+    LOG_INFO("  mnic_offset: ", to_string(mnic_offset));
+}
+
+size_t atl_mpi_global_data::get_nic_count(const char* nic_count_key) {
+    /* returns the NIC count stored under nic_count_key, never less than 1; throws if the key is absent */
+    atl_mpi_env_info_t info = atl_mpi::get_env_info(nic_count_key);
+    CCL_THROW_IF_NOT(info.found, "MPI env key ", nic_count_key, " was not set");
+    /* parse as signed: assigning atoi() straight to size_t turns a negative value
+       into a huge count that the "<= 0" guard can no longer catch */
+    const int parsed_count = atoi(info.value);
+    if (parsed_count <= 0) {
+        return 1;
+    }
+    return static_cast<size_t>(parsed_count);
+}
+
+#endif // CCL_ENABLE_MPI
diff --git a/src/atl/mpi/atl_mpi_global_data.hpp b/src/atl/mpi/atl_mpi_global_data.hpp
new file mode 100644
index 000000000..e0affbe5c
--- /dev/null
+++ b/src/atl/mpi/atl_mpi_global_data.hpp
@@ -0,0 +1,161 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#ifdef CCL_ENABLE_MPI
+
+#include <mpi.h>
+
+#include "atl/atl_def.h"
+#include "comp/bf16/bf16_intrisics.hpp"
+#include "comp/fp16/fp16_intrisics.hpp"
+
+#ifdef CCL_BF16_COMPILER
+#define ATL_MPI_BF16
+#endif // CCL_BF16_COMPILER
+
+#ifdef CCL_FP16_COMPILER
+#define ATL_MPI_FP16
+#endif // CCL_FP16_COMPILER
+
+class atl_mpi_global_data { // MPI library attributes, endpoint/multi-NIC settings, custom 16-bit MPI handles
+public:
+    typedef enum { ATL_MPI_LIB_IMPI, ATL_MPI_LIB_MPICH, ATL_MPI_LIB_NONE } atl_mpi_lib_type_t;
+
+private:
+    typedef struct {
+        atl_mpi_lib_type_t type;
+        int hmem; // non-zero lets set_impi_env() export the I_MPI_OFFLOAD variables
+    } atl_mpi_lib_attr_t;
+    typedef struct {
+        // custom MPI operations for BF16
+        MPI_Op sum_op;
+        MPI_Op prod_op;
+        MPI_Op min_op;
+        MPI_Op max_op;
+        // custom MPI dtype for BF16
+        MPI_Datatype dtype;
+    } atl_mpi_bf16_data_t;
+
+    typedef struct {
+        // custom MPI operations for FP16
+        MPI_Op sum_op;
+        MPI_Op prod_op;
+        MPI_Op min_op;
+        MPI_Op max_op;
+        // custom MPI dtype for FP16
+        MPI_Datatype dtype;
+    } atl_mpi_fp16_data_t;
+
+    typedef struct {
+        atl_mpi_lib_type_t type;
+        const char* name;
+
+        /* string prefix before numerical version of library, mandatory */
+        const char* version_prefix_1;
+
+        /* string prefix before numerical version of library, following prefix_1, optional */
+        const char* version_prefix_2;
+
+        /* minimal expected version of library, mandatory */
+        int min_version_value;
+
+        /* minimal expected version of library with hmem support, mandatory */
+        int min_hmem_version_value;
+
+        /* string prefix before library kind, optional */
+        const char* kind_prefix;
+
+        /* library kind, optional */
+        const char* kind_value;
+    } atl_mpi_lib_info_t;
+
+#define MPI_LIB_INFO_MAX_COUNT 3
+    /* known libraries; rows follow the atl_mpi_lib_type_t order because the table is also indexed by type */
+    const atl_mpi_lib_info_t mpi_lib_infos[MPI_LIB_INFO_MAX_COUNT] = {
+        { ATL_MPI_LIB_IMPI,
+          "impi",
+          "Intel(R) MPI Library",
+          NULL,
+          2019,
+          2021,
+          "library kind:",
+          "release" },
+        { ATL_MPI_LIB_MPICH, "mpich", "MPICH Custom Information:", "drop", 34, -1, NULL, NULL },
+        { ATL_MPI_LIB_NONE, "none", "", NULL, 0, -1, NULL, NULL },
+    };
+
+    size_t get_nic_count(const char* nic_count_key); // count stored under the key, at least 1
+
+public:
+    const char* EP_IDX_KEY = "vci"; // exported as I_MPI_THREAD_ID_KEY / MPIR_COMM_HINT_VCI
+    /* multi-NIC keys; the two *_COUNT_KEY values are passed to get_nic_count() */
+    const char* NIC_IDX_KEY = "multi_nic_pref_nic";
+    const char* GLOBAL_NIC_COUNT_KEY = "num_nics";
+    const char* LOCAL_NIC_COUNT_KEY = "num_close_nics";
+
+    int is_external_init;
+    size_t ctx_count;
+    int extra_ep;
+    atl_mnic_t mnic_type;
+    size_t mnic_count;
+    atl_mnic_offset_t mnic_offset;
+    atl_mpi_lib_attr_t mpi_lib_attr;
+    atl_mpi_bf16_data_t bf16;
+    atl_mpi_fp16_data_t fp16;
+    /* starts with no detected library and every custom MPI handle set to its NULL constant */
+    atl_mpi_global_data()
+            : is_external_init(0),
+              ctx_count(0),
+              extra_ep(0),
+              mnic_type(ATL_MNIC_NONE),
+              mnic_count(1),
+              mnic_offset(ATL_MNIC_OFFSET_NONE) {
+        mpi_lib_attr.type = ATL_MPI_LIB_NONE;
+        mpi_lib_attr.hmem = 0;
+
+        bf16.dtype = MPI_DATATYPE_NULL;
+        bf16.sum_op = MPI_OP_NULL;
+        bf16.prod_op = MPI_OP_NULL;
+        bf16.min_op = MPI_OP_NULL;
+        bf16.max_op = MPI_OP_NULL;
+
+        fp16.dtype = MPI_DATATYPE_NULL;
+        fp16.sum_op = MPI_OP_NULL;
+        fp16.prod_op = MPI_OP_NULL;
+        fp16.min_op = MPI_OP_NULL;
+        fp16.max_op = MPI_OP_NULL;
+    }
+    atl_mpi_lib_attr_t get_lib_attr(); // detects the library from the MPI_Get_library_version() text
+    size_t get_ep_count(const atl_attr_t& attr); // ep_count plus enable_extra_ep
+    atl_status_t set_impi_env(const atl_attr_t& attr, const atl_mpi_lib_attr_t& lib_attr);
+    int bf16_init(); // returns ATL_STATUS_SUCCESS / ATL_STATUS_FAILURE despite the int type
+    void bf16_finalize();
+    int fp16_init(); // returns ATL_STATUS_SUCCESS / ATL_STATUS_FAILURE despite the int type
+    void fp16_finalize();
+    void print_error(int error); // logs the MPI_Error_string() text for an MPI error code
+    atl_status_t check_impi_env(const atl_attr_t& attr);
+    atl_status_t update_global_data(atl_attr_t* attr);
+    atl_status_t set_mpich_env(const atl_attr_t& attr);
+    atl_status_t set_base_env(const atl_attr_t& attr);
+    atl_status_t check_mpich_env(const atl_attr_t& attr);
+    atl_status_t set_env(const atl_attr_t& attr);
+    MPI_Op atl2mpi_op_fp16(atl_reduction_t rtype); // defined only when ATL_MPI_FP16 is set
+    MPI_Op atl2mpi_op_bf16(atl_reduction_t rtype); // defined only when ATL_MPI_BF16 is set
+    void print_log_info();
+};
+
+#endif
diff --git a/src/atl/mpi/atl_mpi_impl.cpp b/src/atl/mpi/atl_mpi_impl.cpp
deleted file mode 100644
index 95636ac82..000000000
--- a/src/atl/mpi/atl_mpi_impl.cpp
+++ /dev/null
@@ -1,1711 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#ifdef CCL_ENABLE_MPI
-
-#include <assert.h>
-#include <ctype.h>
-#include <inttypes.h>
-#include <math.h>
-#include <mpi.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/syscall.h>
-#include <unistd.h>
-
-#include "atl.h"
-#include "common/global/global.hpp"
-#include "comp/bf16/bf16_intrisics.hpp"
-#include "comp/bf16/bf16_utils.hpp"
-#include "comp/fp16/fp16_intrisics.hpp"
-#include "comp/fp16/fp16_utils.hpp"
-
-#define ATL_MPI_PM_KEY "atl-mpi"
-
-#define EP_IDX_KEY "ep_idx"
-
-#define GLOBAL_NIC_IDX_KEY   "pref_nic"
-#define GLOBAL_NIC_COUNT_KEY "num_nics"
-#define LOCAL_NIC_IDX_KEY    "pref_close_nic"
-#define LOCAL_NIC_COUNT_KEY  "num_close_nics"
-
-#define RET2ATL(ret) (ret != MPI_SUCCESS) ? ATL_STATUS_FAILURE : ATL_STATUS_SUCCESS
-
-typedef enum { ATL_MPI_LIB_IMPI, ATL_MPI_LIB_MPICH, ATL_MPI_LIB_NONE } atl_mpi_lib_type_t;
-
-typedef struct {
-    atl_mpi_lib_type_t type;
-    int hmem;
-} atl_mpi_lib_attr_t;
-
-typedef struct {
-    atl_mpi_lib_type_t type;
-    const char* name;
-
-    /* string prefix before numerical version of library, mandatory */
-    const char* version_prefix_1;
-
-    /* string prefix before numerical version of library, following prefix_1, optional */
-    const char* version_prefix_2;
-
-    /* minimal expected version of library, mandatory */
-    int min_version_value;
-
-    /* minimal expected version of library with hmem support, mandatory */
-    int min_hmem_version_value;
-
-    /* string prefix before library kind, optional */
-    const char* kind_prefix;
-
-    /* library kind, optional */
-    const char* kind_value;
-} atl_mpi_lib_info_t;
-
-#define MPI_LIB_INFO_MAX_COUNT 3
-
-static atl_mpi_lib_info_t mpi_lib_infos[MPI_LIB_INFO_MAX_COUNT] = {
-    { ATL_MPI_LIB_IMPI,
-      "impi",
-      "Intel(R) MPI Library",
-      NULL,
-      2019,
-      2021,
-      "library kind:",
-      "release_mt" },
-    { ATL_MPI_LIB_MPICH, "mpich", "MPICH Custom Information:", "drop", 34, -1, NULL, NULL },
-    { ATL_MPI_LIB_NONE, "none", "", NULL, 0, -1, NULL, NULL },
-};
-
-#ifdef CCL_BF16_COMPILER
-#define ATL_MPI_BF16
-#endif // CCL_BF16_COMPILER
-
-#ifdef CCL_FP16_COMPILER
-#define ATL_MPI_FP16
-#endif // CCL_FP16_COMPILER
-
-typedef struct {
-    // custom MPI operations for BF16
-    MPI_Op sum_op;
-    MPI_Op prod_op;
-    MPI_Op min_op;
-    MPI_Op max_op;
-    // custom MPI dtype for BF16
-    MPI_Datatype dtype;
-} atl_mpi_bf16_data_t;
-
-typedef struct {
-    // custom MPI operations for FP16
-    MPI_Op sum_op;
-    MPI_Op prod_op;
-    MPI_Op min_op;
-    MPI_Op max_op;
-    // custom MPI dtype for FP16
-    MPI_Datatype dtype;
-} atl_mpi_fp16_data_t;
-
-typedef struct atl_mpi_global_data {
-    int is_external_init;
-    size_t ctx_count;
-    int extra_ep;
-    atl_mnic_t mnic_type;
-    size_t mnic_count;
-    atl_mpi_lib_attr_t mpi_lib_attr;
-    atl_mpi_bf16_data_t bf16;
-    atl_mpi_fp16_data_t fp16;
-
-    atl_mpi_global_data()
-            : is_external_init(0),
-              ctx_count(0),
-              extra_ep(0),
-              mnic_type(ATL_MNIC_NONE),
-              mnic_count(1) {
-        mpi_lib_attr.type = ATL_MPI_LIB_NONE;
-        mpi_lib_attr.hmem = 0;
-
-        bf16.dtype = MPI_DATATYPE_NULL;
-        bf16.sum_op = MPI_OP_NULL;
-        bf16.prod_op = MPI_OP_NULL;
-        bf16.min_op = MPI_OP_NULL;
-        bf16.max_op = MPI_OP_NULL;
-
-        fp16.dtype = MPI_DATATYPE_NULL;
-        fp16.sum_op = MPI_OP_NULL;
-        fp16.prod_op = MPI_OP_NULL;
-        fp16.min_op = MPI_OP_NULL;
-        fp16.max_op = MPI_OP_NULL;
-    }
-
-} atl_mpi_global_data_t;
-
-static atl_mpi_global_data_t global_data;
-
-typedef enum { ATL_MPI_COMP_POSTED, ATL_MPI_COMP_COMPLETED } atl_mpi_comp_state_t;
-
-typedef struct {
-    MPI_Request native_req;
-    atl_mpi_comp_state_t comp_state;
-} atl_mpi_req_t;
-
-typedef struct {
-    atl_ctx_t ctx;
-    int sync_coll;
-    atl_progress_mode_t progress_mode;
-} atl_mpi_ctx_t;
-
-typedef struct {
-    atl_ep_t ep;
-    MPI_Comm mpi_comm;
-
-    /* dummy recv operation to ensure progress in atl_poll */
-    atl_mpi_req_t dummy_req;
-    MPI_Comm dummy_comm;
-} atl_mpi_ep_t;
-
-typedef struct atl_mpi_comm_info {
-    int found;
-    MPI_Comm comm;
-    char key[MPI_MAX_INFO_KEY];
-    char value[MPI_MAX_INFO_VAL];
-
-    atl_mpi_comm_info() {
-        found = 0;
-        comm = MPI_COMM_WORLD;
-        memset(key, 0, MPI_MAX_INFO_KEY);
-        memset(value, 0, MPI_MAX_INFO_VAL);
-    }
-} atl_mpi_comm_info_t;
-
-#define MPI_BFLOAT16 \
-    ({ \
-        CCL_THROW_IF_NOT(global_data.bf16.dtype != MPI_DATATYPE_NULL, \
-                         "unsupported datatype: ATL_DTYPE_BF16"); \
-        global_data.bf16.dtype; \
-    })
-
-#define MPI_FLOAT16 \
-    ({ \
-        CCL_THROW_IF_NOT(global_data.fp16.dtype != MPI_DATATYPE_NULL, \
-                         "unsupported datatype: ATL_DTYPE_FP16"); \
-        global_data.fp16.dtype; \
-    })
-
-// helpers: check contract
-static inline void atl_mpi_check_op_params(void* in_buf,
-                                           void* inout_buf,
-                                           int* length,
-                                           MPI_Datatype* datatype,
-                                           const char* caller_func_name) {
-    (void)datatype;
-    CCL_THROW_IF_NOT(in_buf && inout_buf && length,
-                     caller_func_name,
-                     " requested, bad arguments: ",
-                     in_buf,
-                     " ",
-                     inout_buf,
-                     " ",
-                     length);
-}
-
-static void atl_mpi_print_error(int error) __attribute__((unused));
-static void atl_mpi_print_error(int error) {
-    char str_error[MPI_MAX_ERROR_STRING];
-    int result_len = MPI_MAX_ERROR_STRING;
-
-    MPI_Error_string(error, str_error, &result_len);
-
-    if (result_len > MPI_MAX_ERROR_STRING) {
-        result_len = MPI_MAX_ERROR_STRING;
-    }
-    str_error[result_len - 1] = '\0';
-
-    ccl_logger::format(std::cout, "MPI error: %s (%d)", str_error, error);
-}
-
-#ifdef ATL_MPI_BF16
-
-static void BF16_INLINE_TARGET_ATTRIBUTE_ALL atl_mpi_bf16_base_op(void* in,
-                                                                  void* inout,
-                                                                  int* length,
-                                                                  ccl::reduction op) {
-    unsigned short* in_buf = (unsigned short*)in;
-    unsigned short* inout_buf = (unsigned short*)inout;
-
-    size_t len = *length;
-    ccl_bf16_reduce_impl(in_buf, inout_buf, len, op);
-}
-
-static void BF16_TARGET_ATTRIBUTE_ALL atl_mpi_bf16_sum_op(void* in,
-                                                          void* inout,
-                                                          int* length,
-                                                          MPI_Datatype* datatype) {
-    atl_mpi_check_op_params(in, inout, length, datatype, __FUNCTION__);
-    atl_mpi_bf16_base_op(in, inout, length, ccl::reduction::sum);
-}
-
-static void BF16_TARGET_ATTRIBUTE_ALL atl_mpi_bf16_prod_op(void* in,
-                                                           void* inout,
-                                                           int* length,
-                                                           MPI_Datatype* datatype) {
-    atl_mpi_check_op_params(in, inout, length, datatype, __FUNCTION__);
-    atl_mpi_bf16_base_op(in, inout, length, ccl::reduction::prod);
-}
-
-static void BF16_TARGET_ATTRIBUTE_ALL atl_mpi_bf16_min_op(void* in,
-                                                          void* inout,
-                                                          int* length,
-                                                          MPI_Datatype* datatype) {
-    atl_mpi_check_op_params(in, inout, length, datatype, __FUNCTION__);
-    atl_mpi_bf16_base_op(in, inout, length, ccl::reduction::min);
-}
-
-static void BF16_TARGET_ATTRIBUTE_ALL atl_mpi_bf16_max_op(void* in,
-                                                          void* inout,
-                                                          int* length,
-                                                          MPI_Datatype* datatype) {
-    atl_mpi_check_op_params(in, inout, length, datatype, __FUNCTION__);
-    atl_mpi_bf16_base_op(in, inout, length, ccl::reduction::max);
-}
-#endif // ATL_MPI_BF16
-
-#ifdef ATL_MPI_FP16
-
-static void FP16_INLINE_TARGET_ATTRIBUTE_ALL atl_mpi_fp16_base_op(void* in,
-                                                                  void* inout,
-                                                                  int* length,
-                                                                  ccl::reduction op) {
-    unsigned short* in_buf = (unsigned short*)in;
-    unsigned short* inout_buf = (unsigned short*)inout;
-
-    size_t len = *length;
-    ccl_fp16_reduce_impl(in_buf, inout_buf, len, op);
-}
-
-static void FP16_TARGET_ATTRIBUTE_ALL atl_mpi_fp16_sum_op(void* in,
-                                                          void* inout,
-                                                          int* length,
-                                                          MPI_Datatype* datatype) {
-    atl_mpi_check_op_params(in, inout, length, datatype, __FUNCTION__);
-    atl_mpi_fp16_base_op(in, inout, length, ccl::reduction::sum);
-}
-
-static void FP16_TARGET_ATTRIBUTE_ALL atl_mpi_fp16_prod_op(void* in,
-                                                           void* inout,
-                                                           int* length,
-                                                           MPI_Datatype* datatype) {
-    atl_mpi_check_op_params(in, inout, length, datatype, __FUNCTION__);
-    atl_mpi_fp16_base_op(in, inout, length, ccl::reduction::prod);
-}
-
-static void FP16_TARGET_ATTRIBUTE_ALL atl_mpi_fp16_min_op(void* in,
-                                                          void* inout,
-                                                          int* length,
-                                                          MPI_Datatype* datatype) {
-    atl_mpi_check_op_params(in, inout, length, datatype, __FUNCTION__);
-    atl_mpi_fp16_base_op(in, inout, length, ccl::reduction::min);
-}
-
-static void FP16_TARGET_ATTRIBUTE_ALL atl_mpi_fp16_max_op(void* in,
-                                                          void* inout,
-                                                          int* length,
-                                                          MPI_Datatype* datatype) {
-    atl_mpi_check_op_params(in, inout, length, datatype, __FUNCTION__);
-    atl_mpi_fp16_base_op(in, inout, length, ccl::reduction::max);
-}
-#endif // ATL_MPI_FP16
-
-static int atl_mpi_bf16_init() {
-    int ret = MPI_SUCCESS;
-
-    if (ccl::global_data::env().bf16_impl_type <= ccl_bf16_no_hardware_support) {
-        return RET2ATL(ret);
-    }
-
-#ifdef ATL_MPI_BF16
-
-    // create custom MPI BF16 dtype
-    ret = MPI_Type_contiguous(2, MPI_BYTE, &global_data.bf16.dtype);
-    if (ret != MPI_SUCCESS) {
-        LOG_ERROR("cannot create MPI BF16 dtype");
-        atl_mpi_print_error(ret);
-        return RET2ATL(ret);
-    }
-
-    ret = MPI_Type_commit(&global_data.bf16.dtype);
-    if (ret != MPI_SUCCESS) {
-        LOG_ERROR("cannot commit MPI BF16 type");
-        atl_mpi_print_error(ret);
-        return RET2ATL(ret);
-    }
-
-    // create custom MPI BF16 summation op
-    ret = MPI_Op_create(&atl_mpi_bf16_sum_op, 1, &global_data.bf16.sum_op);
-    if (ret != MPI_SUCCESS) {
-        LOG_ERROR("cannot create MPI BF16 sum op");
-        atl_mpi_print_error(ret);
-        return RET2ATL(ret);
-    }
-
-    // create custom MPI BF16 production op
-    ret = MPI_Op_create(&atl_mpi_bf16_prod_op, 1, &global_data.bf16.prod_op);
-    if (ret != MPI_SUCCESS) {
-        LOG_ERROR("cannot create MPI BF16 prod op");
-        atl_mpi_print_error(ret);
-        return RET2ATL(ret);
-    }
-
-    // create custom MPI BF16 min op
-    ret = MPI_Op_create(&atl_mpi_bf16_min_op, 1, &global_data.bf16.min_op);
-    if (ret != MPI_SUCCESS) {
-        LOG_ERROR("cannot create MPI BF16 min op");
-        atl_mpi_print_error(ret);
-        return RET2ATL(ret);
-    }
-
-    // create custom MPI BF16 max op
-    ret = MPI_Op_create(&atl_mpi_bf16_max_op, 1, &global_data.bf16.max_op);
-    if (ret != MPI_SUCCESS) {
-        LOG_ERROR("cannot create MPI BF16 max op");
-        atl_mpi_print_error(ret);
-        return RET2ATL(ret);
-    }
-
-#endif // ATL_MPI_BF16
-
-    return RET2ATL(ret);
-}
-
-static void atl_mpi_bf16_finalize() {
-    if (global_data.bf16.dtype != MPI_DATATYPE_NULL) {
-        MPI_Type_free(&global_data.bf16.dtype);
-    }
-
-    if (global_data.bf16.sum_op != MPI_OP_NULL) {
-        MPI_Op_free(&global_data.bf16.sum_op);
-    }
-
-    if (global_data.bf16.prod_op != MPI_OP_NULL) {
-        MPI_Op_free(&global_data.bf16.prod_op);
-    }
-
-    if (global_data.bf16.min_op != MPI_OP_NULL) {
-        MPI_Op_free(&global_data.bf16.min_op);
-    }
-
-    if (global_data.bf16.max_op != MPI_OP_NULL) {
-        MPI_Op_free(&global_data.bf16.max_op);
-    }
-}
-
-static int atl_mpi_fp16_init() {
-    int ret = MPI_SUCCESS;
-
-    if (ccl::global_data::env().fp16_impl_type <= ccl_fp16_no_hardware_support) {
-        return RET2ATL(ret);
-    }
-
-#ifdef ATL_MPI_FP16
-
-    // create custom MPI FP16 dtype
-    ret = MPI_Type_contiguous(2, MPI_BYTE, &global_data.fp16.dtype);
-    if (ret != MPI_SUCCESS) {
-        LOG_ERROR("cannot create MPI FP16 dtype");
-        atl_mpi_print_error(ret);
-        return RET2ATL(ret);
-    }
-
-    ret = MPI_Type_commit(&global_data.fp16.dtype);
-    if (ret != MPI_SUCCESS) {
-        LOG_ERROR("cannot commit MPI FP16 type");
-        atl_mpi_print_error(ret);
-        return RET2ATL(ret);
-    }
-
-    // create custom MPI FP16 summation op
-    ret = MPI_Op_create(&atl_mpi_fp16_sum_op, 1, &global_data.fp16.sum_op);
-    if (ret != MPI_SUCCESS) {
-        LOG_ERROR("cannot create MPI FP16 sum op");
-        atl_mpi_print_error(ret);
-        return RET2ATL(ret);
-    }
-
-    // create custom MPI FP16 production op
-    ret = MPI_Op_create(&atl_mpi_fp16_prod_op, 1, &global_data.fp16.prod_op);
-    if (ret != MPI_SUCCESS) {
-        LOG_ERROR("cannot create MPI FP16 prod op");
-        atl_mpi_print_error(ret);
-        return RET2ATL(ret);
-    }
-
-    // create custom MPI FP16 min op
-    ret = MPI_Op_create(&atl_mpi_fp16_min_op, 1, &global_data.fp16.min_op);
-    if (ret != MPI_SUCCESS) {
-        LOG_ERROR("cannot create MPI FP16 min op");
-        atl_mpi_print_error(ret);
-        return RET2ATL(ret);
-    }
-
-    // create custom MPI FP16 max op
-    ret = MPI_Op_create(&atl_mpi_fp16_max_op, 1, &global_data.fp16.max_op);
-    if (ret != MPI_SUCCESS) {
-        LOG_ERROR("cannot create MPI FP16 max op");
-        atl_mpi_print_error(ret);
-        return RET2ATL(ret);
-    }
-
-#endif // ATL_MPI_FP16
-
-    return RET2ATL(ret);
-}
-
-static void atl_mpi_fp16_finalize() {
-    if (global_data.fp16.dtype != MPI_DATATYPE_NULL) {
-        MPI_Type_free(&global_data.fp16.dtype);
-    }
-
-    if (global_data.fp16.sum_op != MPI_OP_NULL) {
-        MPI_Op_free(&global_data.fp16.sum_op);
-    }
-
-    if (global_data.fp16.prod_op != MPI_OP_NULL) {
-        MPI_Op_free(&global_data.fp16.prod_op);
-    }
-
-    if (global_data.fp16.min_op != MPI_OP_NULL) {
-        MPI_Op_free(&global_data.fp16.min_op);
-    }
-
-    if (global_data.fp16.max_op != MPI_OP_NULL) {
-        MPI_Op_free(&global_data.fp16.max_op);
-    }
-}
-
-static MPI_Datatype atl2mpi_dtype(atl_datatype_t dtype) {
-    switch (dtype) {
-        case ATL_DTYPE_INT8: return MPI_CHAR;
-        case ATL_DTYPE_UINT8: return MPI_UNSIGNED_CHAR;
-        case ATL_DTYPE_INT16: return MPI_INT16_T;
-        case ATL_DTYPE_UINT16: return MPI_UINT16_T;
-        case ATL_DTYPE_INT32: return MPI_INT;
-        case ATL_DTYPE_UINT32: return MPI_UINT32_T;
-        case ATL_DTYPE_INT64: return MPI_LONG_LONG;
-        case ATL_DTYPE_UINT64: return MPI_UNSIGNED_LONG_LONG;
-        case ATL_DTYPE_FLOAT16: return MPI_FLOAT16;
-        case ATL_DTYPE_FLOAT32: return MPI_FLOAT;
-        case ATL_DTYPE_FLOAT64: return MPI_DOUBLE;
-        case ATL_DTYPE_BFLOAT16: return MPI_BFLOAT16;
-        default: printf("unknown datatype: %d\n", dtype); exit(1);
-    }
-}
-
-#ifdef ATL_MPI_BF16
-static MPI_Op atl2mpi_op_bf16(atl_reduction_t rtype) {
-    switch (rtype) {
-        case ATL_REDUCTION_SUM: return global_data.bf16.sum_op;
-        case ATL_REDUCTION_PROD: return global_data.bf16.prod_op;
-        case ATL_REDUCTION_MIN: return global_data.bf16.min_op;
-        case ATL_REDUCTION_MAX: return global_data.bf16.max_op;
-        default: printf("unknown reduction type: %d\n", rtype); exit(1);
-    }
-}
-#endif // ATL_MPI_BF16
-
-#ifdef ATL_MPI_FP16
-static MPI_Op atl2mpi_op_fp16(atl_reduction_t rtype) {
-    switch (rtype) {
-        case ATL_REDUCTION_SUM: return global_data.fp16.sum_op;
-        case ATL_REDUCTION_PROD: return global_data.fp16.prod_op;
-        case ATL_REDUCTION_MIN: return global_data.fp16.min_op;
-        case ATL_REDUCTION_MAX: return global_data.fp16.max_op;
-        default: printf("unknown reduction type: %d\n", rtype); exit(1);
-    }
-}
-#endif // ATL_MPI_FP16
-
-static MPI_Op atl2mpi_op(atl_reduction_t rtype, MPI_Datatype dtype) {
-#ifdef ATL_MPI_BF16
-    if (dtype == global_data.bf16.dtype)
-        return atl2mpi_op_bf16(rtype);
-#endif // ATL_MPI_BF16
-
-#ifdef ATL_MPI_FP16
-    if (dtype == global_data.fp16.dtype)
-        return atl2mpi_op_fp16(rtype);
-#endif // ATL_MPI_FP16
-
-    (void)dtype;
-    switch (rtype) {
-        case ATL_REDUCTION_SUM: return MPI_SUM;
-        case ATL_REDUCTION_PROD: return MPI_PROD;
-        case ATL_REDUCTION_MIN: return MPI_MIN;
-        case ATL_REDUCTION_MAX: return MPI_MAX;
-        default: printf("unknown reduction type: %d\n", rtype); exit(1);
-    }
-}
-
-atl_mpi_lib_attr_t atl_mpi_get_lib_attr() {
-    atl_mpi_lib_attr_t lib_attr = { ATL_MPI_LIB_NONE, 0 };
-
-    char mpi_version[MPI_MAX_LIBRARY_VERSION_STRING] = { 0 };
-    int mpi_version_len = -1, i;
-    atl_mpi_lib_info_t* final_info = NULL;
-
-    /* can be called before MPI_Init */
-    int ret = MPI_Get_library_version(mpi_version, &mpi_version_len);
-
-    if ((ret != MPI_SUCCESS) || (mpi_version_len < 0) ||
-        (mpi_version_len > MPI_MAX_LIBRARY_VERSION_STRING)) {
-        LOG_WARN("can not retrieve MPI version, mpi_version_len ", mpi_version_len, ", ret", ret);
-        return lib_attr;
-    }
-
-    /* remove trailing spaces at the end for more compact log */
-    while (strlen(mpi_version) && isspace(mpi_version[strlen(mpi_version) - 1]))
-        mpi_version[strlen(mpi_version) - 1] = '\0';
-
-    LOG_DEBUG("MPI version: ", mpi_version);
-
-    /* for filtering */
-    char* lib_type_env = getenv("CCL_ATL_MPI");
-
-    for (i = 0; i < MPI_LIB_INFO_MAX_COUNT; i++) {
-        atl_mpi_lib_info_t* info = &(mpi_lib_infos[i]);
-
-        if (info->type == ATL_MPI_LIB_NONE)
-            continue;
-
-        if (lib_type_env) {
-            if (strcmp(lib_type_env, info->name)) {
-                LOG_DEBUG("library ", info->name, " is filtered out by user input ", lib_type_env);
-                continue;
-            }
-            else {
-                LOG_DEBUG("use lib_type = ", lib_type_env, " because it is requested explicitly");
-            }
-        }
-
-        CCL_THROW_IF_NOT(info->version_prefix_1, "empty version_prefix_1");
-        CCL_THROW_IF_NOT(info->min_version_value >= 0, "unexpected minimal version");
-
-        const char* version_substr = NULL;
-        if ((version_substr = strstr(mpi_version, info->version_prefix_1))) {
-            version_substr += strlen(info->version_prefix_1);
-            LOG_DEBUG("version_substr: ", version_substr);
-
-            if (info->version_prefix_2) {
-                version_substr = strstr(version_substr, info->version_prefix_2);
-                if (!version_substr) {
-                    LOG_DEBUG("can't find version_prefix_2 ", info->version_prefix_2);
-                    continue;
-                }
-                version_substr += strlen(info->version_prefix_2);
-                LOG_DEBUG("version_substr: ", version_substr);
-            }
-
-            int version_value = (version_substr) ? atoi(version_substr) : -1;
-            LOG_DEBUG("MPI numerical version: ", version_value);
-
-            if (version_value < info->min_version_value) {
-                LOG_WARN("loaded MPI doesn't match with expected version, "
-                         "consider to switch to ",
-                         info->version_prefix_1,
-                         " ",
-                         (info->version_prefix_2 ? info->version_prefix_2 : ""),
-                         info->min_version_value,
-                         " (min) ",
-                         (info->kind_value ? info->kind_value : ""),
-                         "\n");
-                continue;
-            }
-
-            if (info->kind_prefix && info->kind_value) {
-                const char* kind_substr = mpi_version;
-
-                if ((kind_substr = strstr(kind_substr, info->kind_prefix))) {
-                    kind_substr += strlen(info->kind_prefix);
-                    while ((isspace(*kind_substr)) &&
-                           (kind_substr < (mpi_version + mpi_version_len)))
-                        kind_substr++;
-
-                    LOG_DEBUG("kind_substr: ", kind_substr);
-
-                    if (strncmp(kind_substr, info->kind_value, strlen(info->kind_value))) {
-                        LOG_WARN("loaded MPI version (",
-                                 version_value,
-                                 ") ",
-                                 "is higher or equal to minimal expected version (",
-                                 info->min_version_value,
-                                 ") ",
-                                 "but kind (",
-                                 kind_substr,
-                                 ") doesn't match with expected kind (",
-                                 info->kind_value,
-                                 "), "
-                                 "consider to switch to ",
-                                 info->version_prefix_1,
-                                 " ",
-                                 (info->version_prefix_2 ? info->version_prefix_2 : ""),
-                                 info->min_version_value,
-                                 " (min version) ",
-                                 (info->kind_value ? info->kind_value : ""),
-                                 "\n");
-                    }
-                }
-                else {
-                    LOG_DEBUG("MPI version is high enough, but kind_prefix (",
-                              info->kind_prefix,
-                              ") can not be found",
-                              " treat this like expected kind (",
-                              info->kind_value,
-                              ") was found");
-                }
-            }
-
-            final_info = info;
-            LOG_DEBUG("set lib_type = ",
-                      info->name,
-                      " because "
-                      "version (",
-                      version_value,
-                      ") is higher or equal to minimal expected version (",
-                      info->min_version_value,
-                      ")");
-
-            lib_attr.type = final_info->type;
-            lib_attr.hmem = (final_info->min_hmem_version_value >= version_value) ? 1 : 0;
-
-            break;
-        }
-    }
-
-    if (final_info) {
-        LOG_DEBUG("MPI library type: ", final_info->name);
-    }
-    else {
-        LOG_DEBUG("MPI library type: none");
-    }
-
-    return lib_attr;
-}
-
-size_t atl_mpi_get_ep_count(const atl_attr_t& attr) {
-    size_t mpi_ep_count = attr.in.ep_count;
-    if (attr.in.enable_extra_ep)
-        mpi_ep_count += attr.in.enable_extra_ep;
-    return mpi_ep_count;
-}
-
-size_t atl_mpi_get_ep_idx(size_t ep_idx) {
-    size_t mpi_ep_idx = ep_idx;
-    if (global_data.extra_ep)
-        mpi_ep_idx += global_data.extra_ep;
-    return mpi_ep_idx;
-}
-
-/* set these knobs without detection of MPI library type */
-atl_status_t atl_mpi_set_base_env(const atl_attr_t& attr) {
-    setenv("PSM2_MULTI_EP", "1", 0);
-    setenv("FI_OFI_RXM_USE_HASH", "0", 0);
-
-#ifdef CCL_ENABLE_SYCL
-    setenv("FI_SHM_DISABLE_CMA", "1", 0);
-#endif // CCL_ENABLE_SYCL
-
-    setenv("MPIR_CVAR_DEFAULT_THREAD_LEVEL", "MPI_THREAD_MULTIPLE", 0);
-
-    /* request IMPI level append library kind into MPI_Get_library_version output */
-    setenv("I_MPI_INFO_LIBRARY_KIND", "1", 0);
-
-    return ATL_STATUS_SUCCESS;
-}
-
-atl_status_t atl_mpi_set_impi_env(const atl_attr_t& attr, const atl_mpi_lib_attr_t& lib_attr) {
-    char ep_count_str[MPI_MAX_INFO_VAL] = { 0 };
-    snprintf(ep_count_str, MPI_MAX_INFO_VAL, "%zu", atl_mpi_get_ep_count(attr));
-
-    if (attr.in.ep_count)
-        setenv("I_MPI_OFI_ISEND_INJECT_THRESHOLD", "0", 0);
-
-#ifdef CCL_ENABLE_SYCL
-    setenv("I_MPI_SHM_CMA", "0", 0);
-    if (attr.in.enable_hmem && lib_attr.hmem) {
-        setenv("I_MPI_OFFLOAD", "2", 0);
-        setenv("I_MPI_OFFLOAD_TOPOLIB", "l0", 0);
-        setenv("I_MPI_OFFLOAD_QUEUE_CACHE", "1", 0);
-        setenv("I_MPI_OFFLOAD_LIST_CACHE", "1", 0);
-        setenv("I_MPI_OFFLOAD_MEMCPY_KIND", "blocked", 0);
-        if (attr.in.ep_count > 1) {
-            /* try to set global lock level before vci level
-               because setenv is invoked with overwrite=0 */
-            setenv("I_MPI_THREAD_LOCK_LEVEL", "global", 0);
-        }
-    }
-#endif // CCL_ENABLE_SYCL
-
-    setenv("I_MPI_THREAD_SPLIT", "1", 0);
-    setenv("I_MPI_THREAD_RUNTIME", "generic", 0);
-    setenv("I_MPI_THREAD_MAX", ep_count_str, 0);
-    setenv("I_MPI_THREAD_ID_KEY", EP_IDX_KEY, 0);
-    setenv("I_MPI_THREAD_LOCK_LEVEL", "vci", 0);
-
-    return ATL_STATUS_SUCCESS;
-}
-
-atl_status_t atl_mpi_check_impi_env(const atl_attr_t& attr) {
-    char* ep_count_env = getenv("I_MPI_THREAD_MAX");
-    if (!ep_count_env)
-        return ATL_STATUS_FAILURE;
-    if (atoi(ep_count_env) != (int)(atl_mpi_get_ep_count(attr)))
-        return ATL_STATUS_FAILURE;
-
-    if (!getenv("I_MPI_ROOT")) {
-        atl_mpi_lib_type_t type = ATL_MPI_LIB_IMPI;
-        LOG_ERROR("CCL/MPI uses ",
-                  mpi_lib_infos[type].version_prefix_1,
-                  " but I_MPI_ROOT is not set. ",
-                  "Please source ",
-                  mpi_lib_infos[type].kind_value,
-                  " version of ",
-                  mpi_lib_infos[type].version_prefix_1,
-                  " (",
-                  mpi_lib_infos[type].min_version_value,
-                  " or higher version).");
-        return ATL_STATUS_FAILURE;
-    }
-
-    return ATL_STATUS_SUCCESS;
-}
-
-atl_status_t atl_mpi_set_mpich_env(const atl_attr_t& attr) {
-    char ep_count_str[MPI_MAX_INFO_VAL] = { 0 };
-    snprintf(ep_count_str, MPI_MAX_INFO_VAL, "%zu", atl_mpi_get_ep_count(attr));
-
-    setenv("MPIR_CVAR_CH4_MT_MODEL", "direct", 0);
-    setenv("MPIR_CVAR_CH4_NUM_VCIS", ep_count_str, 0);
-    setenv("MPIR_CVAR_CH4_OFI_MAX_VCIS", ep_count_str, 0);
-    setenv("MPIR_CVAR_CH4_ASYNC_PROGRESS_ID_KEY", EP_IDX_KEY, 0);
-    setenv("MPIR_CVAR_CH4_OFI_ENABLE_SCALABLE_ENDPOINTS", "1", 0);
-
-    if (attr.in.mnic_type != ATL_MNIC_NONE) {
-        setenv("MPIR_CVAR_CH4_OFI_ENABLE_NIC_SELECTION", "1", 0);
-        auto& env = ccl::global_data::env();
-        if (env.log_level >= ccl_log_level::info) {
-            setenv("MPIR_CVAR_CH4_OFI_DUMP_NIC_SETTINGS", "1", 0);
-        }
-    }
-
-    setenv("FI_PSM2_DELAY", "0", 0);
-    setenv("FI_PSM2_TIMEOUT", "0", 0);
-    setenv("FI_PSM2_NAME_SERVER", "0", 0);
-    setenv("HFI_NO_CPUAFFINITY", "1", 0);
-
-    return ATL_STATUS_SUCCESS;
-}
-
-atl_status_t atl_mpi_check_mpich_env(const atl_attr_t& attr) {
-    char* ep_count_env = getenv("MPIR_CVAR_CH4_OFI_MAX_VCIS");
-    if (!ep_count_env)
-        return ATL_STATUS_FAILURE;
-    if (atoi(ep_count_env) != (int)(atl_mpi_get_ep_count(attr)))
-        return ATL_STATUS_FAILURE;
-    return ATL_STATUS_SUCCESS;
-}
-
-atl_status_t atl_mpi_set_env(const atl_attr_t& attr) {
-    if (global_data.mpi_lib_attr.type != ATL_MPI_LIB_NONE) {
-        /* library type was already detected and env was set, make sanity check */
-        if (global_data.mpi_lib_attr.type == ATL_MPI_LIB_IMPI) {
-            return atl_mpi_check_impi_env(attr);
-        }
-        else if (global_data.mpi_lib_attr.type == ATL_MPI_LIB_MPICH) {
-            return atl_mpi_check_mpich_env(attr);
-        }
-        return ATL_STATUS_SUCCESS;
-    }
-
-    atl_mpi_set_base_env(attr);
-
-    atl_mpi_lib_attr_t mpi_lib_attr = atl_mpi_get_lib_attr();
-
-    if (mpi_lib_attr.type == ATL_MPI_LIB_NONE) {
-        return ATL_STATUS_SUCCESS;
-    }
-
-    if (mpi_lib_attr.type == ATL_MPI_LIB_IMPI) {
-        atl_mpi_set_impi_env(attr, mpi_lib_attr);
-        atl_mpi_check_impi_env(attr);
-    }
-    else if (mpi_lib_attr.type == ATL_MPI_LIB_MPICH) {
-        atl_mpi_set_mpich_env(attr);
-        atl_mpi_check_mpich_env(attr);
-    }
-
-    int is_mpi_inited = 0;
-    MPI_Initialized(&is_mpi_inited);
-    if (is_mpi_inited) {
-        LOG_WARN("MPI was initialized externally, CCL-MPI specific environment is ignored");
-    }
-    else {
-        LOG_DEBUG("set CCL-MPI specific environment");
-    }
-
-    global_data.mpi_lib_attr = mpi_lib_attr;
-
-    return ATL_STATUS_SUCCESS;
-}
-
-atl_mpi_comm_info_t atl_mpi_get_comm_info(MPI_Comm comm, const char* key) {
-    MPI_Info info;
-    atl_mpi_comm_info_t res;
-    res.comm = comm;
-    snprintf(res.key, MPI_MAX_INFO_KEY, "%s", key);
-
-    MPI_Comm_get_info(res.comm, &info);
-    MPI_Info_get(info, key, MPI_MAX_INFO_VAL, res.value, &res.found);
-    MPI_Info_free(&info);
-
-    return res;
-}
-
-size_t atl_mpi_get_nic_count(const char* nic_count_key) {
-    size_t count = 1;
-    atl_mpi_comm_info_t info = atl_mpi_get_comm_info(MPI_COMM_WORLD, nic_count_key);
-    CCL_THROW_IF_NOT(info.found, "MPI comm key ", nic_count_key, " was not set");
-
-    count = atoi(info.value);
-    if (count <= 0) {
-        count = 1;
-    }
-
-    return count;
-}
-
-void atl_mpi_check_comm_info(MPI_Comm comm, const char* key, const char* expected_value) {
-    atl_mpi_comm_info_t info = atl_mpi_get_comm_info(comm, key);
-
-    CCL_THROW_IF_NOT(info.found, "MPI comm key ", key, " was not set");
-    CCL_THROW_IF_NOT(!strcmp(info.value, expected_value),
-                     "MPI comm key ",
-                     key,
-                     ": expected: ",
-                     expected_value,
-                     ", read: ",
-                     info.value);
-}
-
-void atl_mpi_check_comm_ep_idx(MPI_Comm comm, size_t expected_idx) {
-    if (global_data.mpi_lib_attr.type == ATL_MPI_LIB_NONE)
-        return;
-
-    char expected_idx_str[MPI_MAX_INFO_VAL] = { 0 };
-    snprintf(expected_idx_str, MPI_MAX_INFO_VAL, "%zu", expected_idx);
-    atl_mpi_check_comm_info(comm, EP_IDX_KEY, expected_idx_str);
-}
-
-void atl_mpi_check_comm_nic_idx(MPI_Comm comm, size_t expected_idx, const char* nic_idx_key) {
-    char expected_idx_str[MPI_MAX_INFO_VAL] = { 0 };
-    snprintf(expected_idx_str, MPI_MAX_INFO_VAL, "%zu", expected_idx);
-    atl_mpi_check_comm_info(comm, nic_idx_key, expected_idx_str);
-}
-
-#ifdef ENABLE_DEBUG
-inline void atl_mpi_check_ep(atl_ep_t* ep) {
-    atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
-    atl_mpi_check_comm_ep_idx(mpi_ep->mpi_comm, atl_mpi_get_ep_idx(ep->idx));
-}
-#else
-#define atl_mpi_check_ep(ep)
-#endif
-
-static atl_status_t atl_mpi_finalize(atl_ctx_t* ctx) {
-    int ret = MPI_SUCCESS;
-    atl_mpi_ctx_t* mpi_ctx = container_of(ctx, atl_mpi_ctx_t, ctx);
-    atl_ep_t** eps = ctx->eps;
-
-    global_data.ctx_count--;
-    if (ctx->coord.global_idx == 0) {
-        LOG_INFO("finalize atl-mpi ctx, remaining ctx_count ", global_data.ctx_count);
-    }
-
-    int is_mpi_finalized = 0;
-    MPI_Finalized(&is_mpi_finalized);
-
-    if (!is_mpi_finalized) {
-        for (size_t i = 0; i < ctx->ep_count; i++) {
-            atl_mpi_ep_t* mpi_ep = container_of(eps[i], atl_mpi_ep_t, ep);
-
-            if (mpi_ep) {
-                if (mpi_ctx->progress_mode == ATL_PROGRESS_POLL) {
-                    MPI_Cancel(&(mpi_ep->dummy_req.native_req));
-                    MPI_Comm_free(&mpi_ep->dummy_comm);
-                }
-                MPI_Comm_free(&mpi_ep->mpi_comm);
-                free(mpi_ep);
-            }
-        }
-
-        if (global_data.ctx_count == 0) {
-            atl_mpi_bf16_finalize();
-            atl_mpi_fp16_finalize();
-            if (!global_data.is_external_init) {
-                ret = MPI_Finalize();
-            }
-            else {
-                LOG_DEBUG("MPI_Init has been called externally, skip MPI_Finalize");
-            }
-
-            if (ctx->coord.global_idx == 0) {
-                LOG_INFO("finalized last atl-mpi ctx");
-            }
-        }
-    }
-    else {
-        for (size_t i = 0; i < ctx->ep_count; i++) {
-            atl_mpi_ep_t* mpi_ep = container_of(eps[i], atl_mpi_ep_t, ep);
-            free(mpi_ep);
-        }
-        if ((global_data.ctx_count == 0) && (ctx->coord.global_idx == 0)) {
-            LOG_WARN("MPI_Finalize has been called before CCL finalization");
-        }
-    }
-
-    free(eps);
-    free(mpi_ctx);
-
-    return RET2ATL(ret);
-}
-
-static atl_status_t atl_mpi_mr_reg(atl_ctx_t* ctx, const void* buf, size_t len, atl_mr_t** mr) {
-    return ATL_STATUS_UNSUPPORTED;
-}
-
-static atl_status_t atl_mpi_mr_dereg(atl_ctx_t* ctx, atl_mr_t* mr) {
-    return ATL_STATUS_UNSUPPORTED;
-}
-
-static atl_status_t atl_mpi_ep_send(atl_ep_t* ep,
-                                    const void* buf,
-                                    size_t len,
-                                    int dst_proc_idx,
-                                    uint64_t tag,
-                                    atl_req_t* req) {
-    atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
-    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
-    mpi_req->comp_state = ATL_MPI_COMP_POSTED;
-
-    int ret = MPI_Isend(
-        buf, len, MPI_CHAR, dst_proc_idx, (int)tag, mpi_ep->mpi_comm, &mpi_req->native_req);
-
-    atl_mpi_check_ep(ep);
-
-    return RET2ATL(ret);
-}
-
-static atl_status_t atl_mpi_ep_recv(atl_ep_t* ep,
-                                    void* buf,
-                                    size_t len,
-                                    int src_proc_idx,
-                                    uint64_t tag,
-                                    atl_req_t* req) {
-    atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
-    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
-    mpi_req->comp_state = ATL_MPI_COMP_POSTED;
-
-    int ret = MPI_Irecv(
-        buf, len, MPI_CHAR, src_proc_idx, (int)tag, mpi_ep->mpi_comm, &mpi_req->native_req);
-
-    atl_mpi_check_ep(ep);
-
-    return RET2ATL(ret);
-}
-
-static atl_status_t atl_mpi_ep_probe(atl_ep_t* ep,
-                                     int src_proc_idx,
-                                     uint64_t tag,
-                                     int* found,
-                                     size_t* recv_len) {
-    atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
-
-    int flag = 0, len = 0, ret;
-    MPI_Status status;
-
-    ret = MPI_Iprobe(src_proc_idx, tag, mpi_ep->mpi_comm, &flag, &status);
-    if (flag) {
-        MPI_Get_count(&status, MPI_BYTE, &len);
-    }
-
-    if (found)
-        *found = flag;
-    if (recv_len)
-        *recv_len = len;
-
-    atl_mpi_check_ep(ep);
-
-    return RET2ATL(ret);
-}
-
-static atl_status_t atl_mpi_ep_allgatherv(atl_ep_t* ep,
-                                          const void* send_buf,
-                                          size_t send_len,
-                                          void* recv_buf,
-                                          const int* recv_lens,
-                                          const int* offsets,
-                                          atl_req_t* req) {
-    int ret = MPI_SUCCESS;
-
-    atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
-    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
-    atl_mpi_ctx_t* mpi_ctx = container_of(ep->ctx, atl_mpi_ctx_t, ctx);
-
-    if (mpi_ctx->sync_coll) {
-        ret = MPI_Allgatherv((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
-                             send_len,
-                             MPI_CHAR,
-                             recv_buf,
-                             recv_lens,
-                             offsets,
-                             MPI_CHAR,
-                             mpi_ep->mpi_comm);
-        mpi_req->comp_state = ATL_MPI_COMP_COMPLETED;
-        mpi_req->native_req = MPI_REQUEST_NULL;
-    }
-    else {
-        ret = MPI_Iallgatherv((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
-                              send_len,
-                              MPI_CHAR,
-                              recv_buf,
-                              recv_lens,
-                              offsets,
-                              MPI_CHAR,
-                              mpi_ep->mpi_comm,
-                              &mpi_req->native_req);
-        mpi_req->comp_state = ATL_MPI_COMP_POSTED;
-    }
-
-    atl_mpi_check_ep(ep);
-
-    return RET2ATL(ret);
-}
-
-static atl_status_t atl_mpi_ep_allreduce(atl_ep_t* ep,
-                                         const void* send_buf,
-                                         void* recv_buf,
-                                         size_t count,
-                                         atl_datatype_t dtype,
-                                         atl_reduction_t op,
-                                         atl_req_t* req) {
-    int ret = MPI_SUCCESS;
-
-    atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
-    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
-    atl_mpi_ctx_t* mpi_ctx = container_of(ep->ctx, atl_mpi_ctx_t, ctx);
-
-    MPI_Datatype mpi_dtype = atl2mpi_dtype(dtype);
-    MPI_Op mpi_op = atl2mpi_op(op, mpi_dtype);
-
-    if (mpi_ctx->sync_coll) {
-        ret = MPI_Allreduce((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
-                            recv_buf,
-                            count,
-                            mpi_dtype,
-                            mpi_op,
-                            mpi_ep->mpi_comm);
-        mpi_req->comp_state = ATL_MPI_COMP_COMPLETED;
-        mpi_req->native_req = MPI_REQUEST_NULL;
-    }
-    else {
-        //printf("atl_mpi: send_buf %p, recv_buf %p\n", send_buf, recv_buf);
-        ret = MPI_Iallreduce((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
-                             recv_buf,
-                             count,
-                             mpi_dtype,
-                             mpi_op,
-                             mpi_ep->mpi_comm,
-                             &mpi_req->native_req);
-        mpi_req->comp_state = ATL_MPI_COMP_POSTED;
-    }
-
-    atl_mpi_check_ep(ep);
-
-    return RET2ATL(ret);
-}
-
-static atl_status_t atl_mpi_ep_alltoall(atl_ep_t* ep,
-                                        const void* send_buf,
-                                        void* recv_buf,
-                                        size_t len,
-                                        atl_req_t* req) {
-    int ret = MPI_SUCCESS;
-
-    atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
-    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
-    atl_mpi_ctx_t* mpi_ctx = container_of(ep->ctx, atl_mpi_ctx_t, ctx);
-
-    if (mpi_ctx->sync_coll) {
-        ret = MPI_Alltoall((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
-                           len,
-                           MPI_CHAR,
-                           recv_buf,
-                           len,
-                           MPI_CHAR,
-                           mpi_ep->mpi_comm);
-        mpi_req->comp_state = ATL_MPI_COMP_COMPLETED;
-        mpi_req->native_req = MPI_REQUEST_NULL;
-    }
-    else {
-        ret = MPI_Ialltoall((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
-                            len,
-                            MPI_CHAR,
-                            recv_buf,
-                            len,
-                            MPI_CHAR,
-                            mpi_ep->mpi_comm,
-                            &mpi_req->native_req);
-        mpi_req->comp_state = ATL_MPI_COMP_POSTED;
-    }
-
-    atl_mpi_check_ep(ep);
-
-    return RET2ATL(ret);
-}
-
-static atl_status_t atl_mpi_ep_alltoallv(atl_ep_t* ep,
-                                         const void* send_buf,
-                                         const int* send_lens,
-                                         const int* send_offsets,
-                                         void* recv_buf,
-                                         const int* recv_lens,
-                                         const int* recv_offsets,
-                                         atl_req_t* req) {
-    int ret = MPI_SUCCESS;
-
-    atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
-    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
-    atl_mpi_ctx_t* mpi_ctx = container_of(ep->ctx, atl_mpi_ctx_t, ctx);
-
-    if (mpi_ctx->sync_coll) {
-        ret = MPI_Alltoallv((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
-                            send_lens,
-                            send_offsets,
-                            MPI_CHAR,
-                            recv_buf,
-                            recv_lens,
-                            recv_offsets,
-                            MPI_CHAR,
-                            mpi_ep->mpi_comm);
-        mpi_req->comp_state = ATL_MPI_COMP_COMPLETED;
-        mpi_req->native_req = MPI_REQUEST_NULL;
-    }
-    else {
-        ret = MPI_Ialltoallv((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
-                             send_lens,
-                             send_offsets,
-                             MPI_CHAR,
-                             recv_buf,
-                             recv_lens,
-                             recv_offsets,
-                             MPI_CHAR,
-                             mpi_ep->mpi_comm,
-                             &mpi_req->native_req);
-        mpi_req->comp_state = ATL_MPI_COMP_POSTED;
-    }
-
-    atl_mpi_check_ep(ep);
-
-    return RET2ATL(ret);
-}
-
-static atl_status_t atl_mpi_ep_barrier(atl_ep_t* ep, atl_req_t* req) {
-    int ret = MPI_SUCCESS;
-
-    atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
-    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
-    atl_mpi_ctx_t* mpi_ctx = container_of(ep->ctx, atl_mpi_ctx_t, ctx);
-
-    if (mpi_ctx->sync_coll) {
-        ret = MPI_Barrier(mpi_ep->mpi_comm);
-        mpi_req->comp_state = ATL_MPI_COMP_COMPLETED;
-        mpi_req->native_req = MPI_REQUEST_NULL;
-    }
-    else {
-        ret = MPI_Ibarrier(mpi_ep->mpi_comm, &mpi_req->native_req);
-        mpi_req->comp_state = ATL_MPI_COMP_POSTED;
-    }
-
-    atl_mpi_check_ep(ep);
-
-    return RET2ATL(ret);
-}
-
-static atl_status_t atl_mpi_ep_bcast(atl_ep_t* ep,
-                                     void* buf,
-                                     size_t len,
-                                     int root,
-                                     atl_req_t* req) {
-    int ret = MPI_SUCCESS;
-
-    atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
-    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
-    atl_mpi_ctx_t* mpi_ctx = container_of(ep->ctx, atl_mpi_ctx_t, ctx);
-
-    if (mpi_ctx->sync_coll) {
-        ret = MPI_Bcast(buf, len, MPI_CHAR, root, mpi_ep->mpi_comm);
-        mpi_req->comp_state = ATL_MPI_COMP_COMPLETED;
-        mpi_req->native_req = MPI_REQUEST_NULL;
-    }
-    else {
-        ret = MPI_Ibcast(buf, len, MPI_CHAR, root, mpi_ep->mpi_comm, &mpi_req->native_req);
-        mpi_req->comp_state = ATL_MPI_COMP_POSTED;
-    }
-
-    atl_mpi_check_ep(ep);
-
-    return RET2ATL(ret);
-}
-
-static atl_status_t atl_mpi_ep_reduce(atl_ep_t* ep,
-                                      const void* send_buf,
-                                      void* recv_buf,
-                                      size_t count,
-                                      int root,
-                                      atl_datatype_t dtype,
-                                      atl_reduction_t op,
-                                      atl_req_t* req) {
-    int ret = MPI_SUCCESS;
-
-    atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
-    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
-    atl_mpi_ctx_t* mpi_ctx = container_of(ep->ctx, atl_mpi_ctx_t, ctx);
-
-    int my_proc_idx = ep->ctx->coord.global_idx;
-    MPI_Datatype mpi_dtype = atl2mpi_dtype(dtype);
-    MPI_Op mpi_op = atl2mpi_op(op, mpi_dtype);
-
-    if (mpi_ctx->sync_coll) {
-        ret = MPI_Reduce(
-            (send_buf && (send_buf == recv_buf) && (root == my_proc_idx)) ? MPI_IN_PLACE : send_buf,
-            recv_buf,
-            count,
-            mpi_dtype,
-            mpi_op,
-            root,
-            mpi_ep->mpi_comm);
-        mpi_req->comp_state = ATL_MPI_COMP_COMPLETED;
-        mpi_req->native_req = MPI_REQUEST_NULL;
-    }
-    else {
-        ret = MPI_Ireduce(
-            (send_buf && (send_buf == recv_buf) && (root == my_proc_idx)) ? MPI_IN_PLACE : send_buf,
-            recv_buf,
-            count,
-            mpi_dtype,
-            mpi_op,
-            root,
-            mpi_ep->mpi_comm,
-            &mpi_req->native_req);
-        mpi_req->comp_state = ATL_MPI_COMP_POSTED;
-    }
-
-    atl_mpi_check_ep(ep);
-
-    return RET2ATL(ret);
-}
-
-static atl_status_t atl_mpi_ep_reduce_scatter(atl_ep_t* ep,
-                                              const void* send_buf,
-                                              void* recv_buf,
-                                              size_t recv_count,
-                                              atl_datatype_t dtype,
-                                              atl_reduction_t op,
-                                              atl_req_t* req) {
-    int ret = MPI_SUCCESS;
-
-    atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
-    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
-    atl_mpi_ctx_t* mpi_ctx = container_of(ep->ctx, atl_mpi_ctx_t, ctx);
-
-    MPI_Datatype mpi_dtype = atl2mpi_dtype(dtype);
-    MPI_Op mpi_op = atl2mpi_op(op, mpi_dtype);
-
-    if (mpi_ctx->sync_coll) {
-        ret =
-            MPI_Reduce_scatter_block((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
-                                     recv_buf,
-                                     recv_count,
-                                     mpi_dtype,
-                                     mpi_op,
-                                     mpi_ep->mpi_comm);
-        mpi_req->comp_state = ATL_MPI_COMP_COMPLETED;
-        mpi_req->native_req = MPI_REQUEST_NULL;
-    }
-    else {
-        ret = MPI_Ireduce_scatter_block(
-            (send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
-            recv_buf,
-            recv_count,
-            mpi_dtype,
-            mpi_op,
-            mpi_ep->mpi_comm,
-            &mpi_req->native_req);
-        mpi_req->comp_state = ATL_MPI_COMP_POSTED;
-    }
-
-    atl_mpi_check_ep(ep);
-
-    return RET2ATL(ret);
-}
-
-static atl_status_t atl_mpi_ep_read(atl_ep_t* ep,
-                                    void* buf,
-                                    size_t len,
-                                    atl_mr_t* mr,
-                                    uint64_t addr,
-                                    uintptr_t r_key,
-                                    int dst_proc_idx,
-                                    atl_req_t* req) {
-    return ATL_STATUS_UNSUPPORTED;
-}
-
-static atl_status_t atl_mpi_ep_write(atl_ep_t* ep,
-                                     const void* buf,
-                                     size_t len,
-                                     atl_mr_t* mr,
-                                     uint64_t addr,
-                                     uintptr_t r_key,
-                                     int dst_proc_idx,
-                                     atl_req_t* req) {
-    return ATL_STATUS_UNSUPPORTED;
-}
-
-static atl_status_t atl_mpi_ep_wait(atl_ep_t* ep, atl_req_t* req) {
-    int ret;
-    MPI_Status status;
-    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
-    ret = MPI_Wait(&mpi_req->native_req, &status);
-    mpi_req->comp_state = ATL_MPI_COMP_COMPLETED;
-    return RET2ATL(ret);
-}
-
-static atl_status_t atl_mpi_ep_wait_all(atl_ep_t* ep, atl_req_t* reqs, size_t count) {
-    return ATL_STATUS_UNSUPPORTED;
-}
-
-static inline atl_status_t atl_mpi_ep_progress(atl_ep_t* ep, atl_mpi_req_t* req) {
-    int flag = 0;
-    int ret = MPI_Test(&req->native_req, &flag, MPI_STATUS_IGNORE);
-
-    if (flag) {
-        req->comp_state = ATL_MPI_COMP_COMPLETED;
-    }
-
-    return RET2ATL(ret);
-}
-
-static inline atl_status_t atl_mpi_ep_poll(atl_ep_t* ep) {
-    atl_mpi_ctx_t* mpi_ctx = container_of(ep->ctx, atl_mpi_ctx_t, ctx);
-    if (mpi_ctx->progress_mode == ATL_PROGRESS_POLL) {
-        atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
-        atl_mpi_ep_progress(ep, &(mpi_ep->dummy_req));
-    }
-
-    return ATL_STATUS_SUCCESS;
-}
-
-static atl_status_t atl_mpi_ep_check(atl_ep_t* ep, int* is_completed, atl_req_t* req) {
-    CCL_THROW_IF_NOT(is_completed);
-
-    atl_status_t status = ATL_STATUS_SUCCESS;
-
-    atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
-
-    *is_completed = (mpi_req->comp_state == ATL_MPI_COMP_COMPLETED);
-    if (*is_completed) {
-        return ATL_STATUS_SUCCESS;
-    }
-
-    status = atl_mpi_ep_progress(ep, mpi_req);
-    *is_completed = (mpi_req->comp_state == ATL_MPI_COMP_COMPLETED);
-
-    return status;
-}
-
-static atl_status_t atl_mpi_ep_init(atl_mpi_ctx_t* mpi_ctx, size_t idx, atl_ep_t** ep) {
-    int ret;
-
-    ssize_t mpi_ep_idx = atl_mpi_get_ep_idx(idx);
-    char mpi_ep_idx_str[MPI_MAX_INFO_VAL] = { 0 };
-
-    size_t nic_idx = 0;
-    char nic_idx_str[MPI_MAX_INFO_VAL] = { 0 };
-    const char* nic_idx_key =
-        (global_data.mnic_type == ATL_MNIC_GLOBAL) ? GLOBAL_NIC_IDX_KEY : LOCAL_NIC_IDX_KEY;
-
-    atl_mpi_ep_t* mpi_ep = (atl_mpi_ep_t*)calloc(1, sizeof(atl_mpi_ep_t));
-    if (!mpi_ep)
-        return ATL_STATUS_FAILURE;
-
-    ret = MPI_Comm_dup(MPI_COMM_WORLD, &mpi_ep->mpi_comm);
-    if (ret)
-        goto err_ep_dup;
-
-    MPI_Info info;
-    MPI_Info_create(&info);
-
-    /* set EP index */
-    snprintf(mpi_ep_idx_str, MPI_MAX_INFO_VAL, "%zu", mpi_ep_idx);
-    MPI_Info_set(info, EP_IDX_KEY, mpi_ep_idx_str);
-
-    if (global_data.mnic_type != ATL_MNIC_NONE) {
-        /* set NIC index */
-        nic_idx = (idx % global_data.mnic_count);
-        snprintf(nic_idx_str, MPI_MAX_INFO_VAL, "%zu", nic_idx);
-        MPI_Info_set(info, nic_idx_key, nic_idx_str);
-    }
-
-    MPI_Comm_set_info(mpi_ep->mpi_comm, info);
-
-    if (mpi_ctx->progress_mode == ATL_PROGRESS_POLL) {
-        ret = MPI_Comm_dup(MPI_COMM_WORLD, &mpi_ep->dummy_comm);
-        if (ret)
-            goto err_ep_dup;
-        MPI_Comm_set_info(mpi_ep->dummy_comm, info);
-        MPI_Irecv(NULL, 0, MPI_CHAR, 0, 0, mpi_ep->dummy_comm, &(mpi_ep->dummy_req.native_req));
-
-        atl_mpi_check_comm_ep_idx(mpi_ep->dummy_comm, mpi_ep_idx);
-        if (global_data.mnic_type != ATL_MNIC_NONE) {
-            atl_mpi_check_comm_nic_idx(mpi_ep->dummy_comm, nic_idx, nic_idx_key);
-        }
-    }
-
-    MPI_Info_free(&info);
-
-    atl_mpi_check_comm_ep_idx(mpi_ep->mpi_comm, mpi_ep_idx);
-    if (global_data.mnic_type != ATL_MNIC_NONE) {
-        atl_mpi_check_comm_nic_idx(mpi_ep->mpi_comm, nic_idx, nic_idx_key);
-    }
-
-    LOG_DEBUG("atl-mpi-ep: ", idx, ", ep_idx ", mpi_ep_idx, ", nic_idx ", nic_idx);
-
-    *ep = &mpi_ep->ep;
-    (*ep)->idx = idx;
-    (*ep)->ctx = &mpi_ctx->ctx;
-
-    return ATL_STATUS_SUCCESS;
-
-err_ep_dup:
-    free(mpi_ep);
-    return RET2ATL(ret);
-}
-
-static atl_status_t atl_mpi_init(int* argc,
-                                 char*** argv,
-                                 atl_attr_t* attr,
-                                 atl_ctx_t** out_ctx,
-                                 const char* main_addr,
-                                 ipmi* pmi) {
-    CCL_THROW_IF_NOT((sizeof(atl_mpi_req_t) <= sizeof(atl_req_t) - offsetof(atl_req_t, internal)),
-                     "unexpected offset: atl_mpi_request size ",
-                     sizeof(atl_mpi_req_t),
-                     ", atl_request size ",
-                     sizeof(atl_req_t),
-                     ", expected offset ",
-                     offsetof(atl_req_t, internal));
-
-    int ret = MPI_SUCCESS;
-    size_t i;
-    int is_tag_ub_set = 0;
-    void* tag_ub_ptr = NULL;
-    int required_thread_level = MPI_THREAD_MULTIPLE, provided_thread_level;
-
-    char my_hostname[ATL_MAX_HOSTNAME_LEN] = { 0 };
-
-    atl_mpi_ctx_t* mpi_ctx = (atl_mpi_ctx_t*)calloc(1, sizeof(atl_mpi_ctx_t));
-    if (!mpi_ctx)
-        return ATL_STATUS_FAILURE;
-
-    atl_ctx_t* ctx = &(mpi_ctx->ctx);
-
-    if (global_data.ctx_count == 0) {
-        if (atl_mpi_set_env(*attr)) {
-            goto err_init;
-        }
-
-        MPI_Initialized(&global_data.is_external_init);
-
-        if (!global_data.is_external_init) {
-            ret = MPI_Init_thread(argc, argv, required_thread_level, &provided_thread_level);
-            if (provided_thread_level < required_thread_level) {
-                LOG_ERROR("unexpected MPI thread level: required ",
-                          required_thread_level,
-                          ", provided ",
-                          provided_thread_level);
-                goto err_init;
-            }
-        }
-        else {
-            LOG_DEBUG("MPI was initialized externaly");
-            MPI_Query_thread(&provided_thread_level);
-            if (provided_thread_level < required_thread_level) {
-                LOG_WARN("MPI was initialized externaly but with unexpected thread level: "
-                         "required ",
-                         required_thread_level,
-                         ", provided ",
-                         provided_thread_level);
-            }
-        }
-
-        if (ret)
-            goto err_init;
-
-        if (global_data.mpi_lib_attr.type == ATL_MPI_LIB_NONE)
-            global_data.mpi_lib_attr = atl_mpi_get_lib_attr();
-
-        global_data.extra_ep = attr->in.enable_extra_ep;
-
-        global_data.mnic_type = attr->in.mnic_type;
-        if (global_data.mpi_lib_attr.type != ATL_MPI_LIB_MPICH) {
-            /* only MPICH supports multi-NIC */
-            global_data.mnic_type = ATL_MNIC_NONE;
-        }
-
-        if (global_data.mnic_type == ATL_MNIC_LOCAL) {
-            global_data.mnic_count = atl_mpi_get_nic_count(LOCAL_NIC_COUNT_KEY);
-        }
-        else if (global_data.mnic_type == ATL_MNIC_GLOBAL) {
-            global_data.mnic_count = atl_mpi_get_nic_count(GLOBAL_NIC_IDX_KEY);
-        }
-        else if (global_data.mnic_type == ATL_MNIC_NONE) {
-            global_data.mnic_count = 1;
-        }
-        global_data.mnic_count = std::min(global_data.mnic_count, attr->in.mnic_count);
-        global_data.mnic_count = std::min(global_data.mnic_count, attr->in.ep_count);
-        global_data.mnic_count = std::max(global_data.mnic_count, (size_t)(1));
-
-        if (atl_mpi_bf16_init() == ATL_STATUS_FAILURE) {
-            atl_mpi_bf16_finalize();
-            goto err_init;
-        }
-
-        if (atl_mpi_fp16_init() == ATL_STATUS_FAILURE) {
-            atl_mpi_fp16_finalize();
-            goto err_init;
-        }
-    }
-    global_data.ctx_count++;
-
-    atl_proc_coord_t* coord;
-    coord = &(ctx->coord);
-
-    MPI_Comm_rank(MPI_COMM_WORLD, (int*)&(coord->global_idx));
-    MPI_Comm_size(MPI_COMM_WORLD, (int*)&(coord->global_count));
-
-    MPI_Comm local_comm;
-    MPI_Comm_split_type(
-        MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, coord->global_count, MPI_INFO_NULL, &local_comm);
-    MPI_Comm_rank(local_comm, (int*)&(coord->local_idx));
-    MPI_Comm_size(local_comm, (int*)&(coord->local_count));
-    MPI_Comm_free(&local_comm);
-
-    gethostname(my_hostname, ATL_MAX_HOSTNAME_LEN - 1);
-    coord->hostname_hash = std::hash<std::string>{}(my_hostname);
-
-    ctx->ep_count = attr->in.ep_count;
-    ctx->eps = (atl_ep_t**)calloc(1, sizeof(void*) * attr->in.ep_count);
-    if (!ctx->eps)
-        goto err_after_init;
-
-    char* progress_mode_env;
-    progress_mode_env = getenv(ATL_PROGRESS_MODE_ENV);
-    if (progress_mode_env) {
-        mpi_ctx->progress_mode = (atl_progress_mode_t)atoi(progress_mode_env);
-    }
-    else {
-        mpi_ctx->progress_mode = ATL_PROGRESS_CHECK;
-    }
-    mpi_ctx->sync_coll = attr->in.enable_sync_coll;
-
-    if (coord->global_idx == 0) {
-        if (global_data.ctx_count == 1) {
-            LOG_INFO("atl-mpi-global:")
-            LOG_INFO("  is_external_init: ", global_data.is_external_init);
-            LOG_INFO("  mpi_lib_attr.type: ", mpi_lib_infos[global_data.mpi_lib_attr.type].name);
-            LOG_INFO("  mpi_lib_attr.hmem: ", global_data.mpi_lib_attr.hmem);
-            LOG_INFO("  extra_ep: ", global_data.extra_ep);
-            LOG_INFO("  mnic_type: ", global_data.mnic_type);
-            if (global_data.mnic_type != ATL_MNIC_NONE)
-                LOG_INFO("  mnic_count: ", global_data.mnic_count);
-        }
-        LOG_INFO("atl-mpi-ctx: ", (global_data.ctx_count - 1));
-        LOG_INFO("  progress_mode: ", mpi_ctx->progress_mode);
-        LOG_INFO("  sync_coll: ", mpi_ctx->sync_coll);
-    }
-
-    for (i = 0; i < attr->in.ep_count; i++) {
-        ret = atl_mpi_ep_init(mpi_ctx, i, &(ctx->eps[i]));
-        if (ret)
-            goto err_ep_dup;
-    }
-
-    *out_ctx = &mpi_ctx->ctx;
-
-    MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_TAG_UB, &tag_ub_ptr, &is_tag_ub_set);
-
-    /* report actual attributes back to upper level */
-    attr->out.enable_shm = 0;
-    attr->out.enable_rma = 0;
-    attr->out.enable_hmem = attr->in.enable_hmem & global_data.mpi_lib_attr.hmem;
-    attr->out.mnic_type = global_data.mnic_type;
-    attr->out.mnic_count = global_data.mnic_count;
-    attr->out.tag_bits = 32;
-    attr->out.max_tag = (is_tag_ub_set) ? *((int*)tag_ub_ptr) : 0;
-    attr->out.max_order_waw_size = 0;
-
-    return ATL_STATUS_SUCCESS;
-
-err_ep_dup:
-    for (i = 0; i < attr->in.ep_count; i++) {
-        atl_mpi_ep_t* mpi_ep = container_of(ctx->eps[i], atl_mpi_ep_t, ep);
-
-        if (ctx->eps[i] && mpi_ep) {
-            if (mpi_ctx->progress_mode == ATL_PROGRESS_POLL) {
-                MPI_Cancel(&(mpi_ep->dummy_req.native_req));
-                MPI_Comm_free(&mpi_ep->dummy_comm);
-            }
-            MPI_Comm_free(&mpi_ep->mpi_comm);
-        }
-    }
-    free(ctx->eps);
-
-err_after_init:
-    global_data.ctx_count--;
-    if (global_data.ctx_count == 0) {
-        atl_mpi_bf16_finalize();
-        atl_mpi_fp16_finalize();
-        if (!global_data.is_external_init) {
-            MPI_Finalize();
-        }
-    }
-
-err_init:
-    free(mpi_ctx);
-    return ATL_STATUS_FAILURE;
-}
-
-atl_status_t atl_mpi_main_addr_reserve(char* main_addr) {
-    return ATL_STATUS_UNSUPPORTED;
-}
-
-#endif // CCL_ENABLE_MPI
diff --git a/src/atl/ofi/atl_ofi.cpp b/src/atl/ofi/atl_ofi.cpp
index 21bc2cb6f..bfe0a6fce 100644
--- a/src/atl/ofi/atl_ofi.cpp
+++ b/src/atl/ofi/atl_ofi.cpp
@@ -82,8 +82,11 @@ void atl_ofi::mr_cache::get(fid_domain* domain, void* buf, size_t bytes, fid_mr*
         }
     }
 
-    struct fi_mr_attr mr_attr = {};
-    struct iovec iov = {};
+    struct fi_mr_attr mr_attr;
+    struct iovec iov;
+
+    memset(&mr_attr, 0, sizeof(mr_attr));
+    memset(&iov, 0, sizeof(iov));
 
     iov.iov_base = buf;
     iov.iov_len = bytes;
@@ -114,7 +117,7 @@ void atl_ofi::mr_cache::get(fid_domain* domain, void* buf, size_t bytes, fid_mr*
         ZE_CALL(zeDeviceGetProperties, (alloc_dev, &alloc_dev_props));
 
         int dev_idx = -1;
-        for (int idx = 0; idx < ze_data.device_count; idx++) {
+        for (int idx = 0; idx < static_cast<int>(ze_data.device_count); idx++) {
             ze_device_properties_t dev_props = ccl::ze::default_device_props;
             ZE_CALL(zeDeviceGetProperties, (ze_data.devices[idx], &dev_props));
 
@@ -163,11 +166,11 @@ atl_status_t atl_ofi::atl_set_env(const atl_attr_t& attr) {
     return atl_ofi_set_env(attr);
 }
 
-atl_status_t atl_ofi::atl_init(int* argc,
-                               char*** argv,
-                               atl_attr_t* attr,
-                               const char* main_addr,
-                               std::unique_ptr<ipmi>& pmi) {
+atl_status_t atl_ofi::init(int* argc,
+                           char*** argv,
+                           atl_attr_t* attr,
+                           const char* main_addr,
+                           std::shared_ptr<ipmi> pmi) {
     inited = true;
     struct fi_info *prov_list = nullptr, *base_hints = nullptr, *prov_hints = nullptr;
     int fi_version;
@@ -191,10 +194,7 @@ atl_status_t atl_ofi::atl_init(int* argc,
 
     if (global_data.ctx_count == 0) {
         ret = atl_ofi_set_env(*attr);
-        if (ret != ATL_STATUS_SUCCESS) {
-            LOG_ERROR("atl_ofi_set_env error");
-            return ATL_STATUS_FAILURE;
-        }
+        ATL_CHECK_STATUS(ret, "atl_ofi_set_env error");
 
         fi_version_env = getenv(ATL_OFI_MAJOR_VERSION);
         if (fi_version_env) {
@@ -222,9 +222,7 @@ atl_status_t atl_ofi::atl_init(int* argc,
     ctx = &(ofi_ctx->ctx);
 
     ctx->ep_count = attr->in.ep_count;
-    ctx->eps = (atl_ep**)calloc(1, sizeof(void*) * attr->in.ep_count);
-    if (!ctx->eps)
-        goto err;
+    eps.resize(attr->in.ep_count);
 
     ctx->coord.global_count = pmi->get_size();
     ctx->coord.global_idx = pmi->get_rank();
@@ -325,7 +323,6 @@ atl_status_t atl_ofi::atl_init(int* argc,
     ofi_ctx->mnic_type = attr->in.mnic_type;
     ATL_CALL(atl_ofi_parse_mnic_name(ctx, attr->in.mnic_name), goto err);
     ofi_ctx->mnic_count = std::min(attr->in.mnic_count, (size_t)(ATL_OFI_MAX_NW_PROV_COUNT));
-    ofi_ctx->mnic_count = std::min(ofi_ctx->mnic_count, attr->in.ep_count);
     ofi_ctx->mnic_count = std::max(ofi_ctx->mnic_count, (size_t)(1));
 
     if ((ofi_ctx->mnic_type != ATL_MNIC_NONE) &&
@@ -337,6 +334,8 @@ atl_status_t atl_ofi::atl_init(int* argc,
     if (ofi_ctx->mnic_type == ATL_MNIC_NONE)
         ofi_ctx->mnic_count = 1;
 
+    ofi_ctx->mnic_offset = attr->in.mnic_offset;
+
     attr->out.tag_bits = 64;
     attr->out.max_tag = 0xFFFFFFFFFFFFFFFF;
 
@@ -397,10 +396,10 @@ atl_status_t atl_ofi::atl_init(int* argc,
             LOG_INFO("ep_idx: ", ep_idx, ", active_prov_idxs: ", ss.str());
         }
 
-        ctx->eps[ep_idx] = ep;
+        eps[ep_idx] = ep;
     }
 
-    pmi->pmrt_barrier();
+    ATL_CHECK_STATUS(pmi->pmrt_barrier(), "barrier failed");
 
     max_retry_count_env = getenv(ATL_OFI_MAX_RETRY_COUNT_ENV);
     if (max_retry_count_env) {
@@ -428,10 +427,11 @@ atl_status_t atl_ofi::atl_init(int* argc,
         LOG_INFO("  prov_count: ", ofi_ctx->prov_count);
         LOG_INFO("  nw_prov_count: ", ofi_ctx->nw_prov_count);
         LOG_INFO("  nw_prov_first_idx: ", ofi_ctx->nw_prov_first_idx);
-        LOG_INFO("  mnic_type: ", ofi_ctx->mnic_type);
+        LOG_INFO("  mnic_type: ", to_string(ofi_ctx->mnic_type));
         LOG_INFO("  mnic_include_names: ", vec_to_string(ofi_ctx->mnic_include_names));
         LOG_INFO("  mnic_exclude_names: ", vec_to_string(ofi_ctx->mnic_exclude_names));
         LOG_INFO("  mnic_count: ", ofi_ctx->mnic_count);
+        LOG_INFO("  mnic_offset: ", to_string(ofi_ctx->mnic_offset));
         LOG_INFO("  max_retry_count: ", ofi_ctx->max_retry_count);
         LOG_INFO("  progress_mode: ", ofi_ctx->progress_mode);
 #ifdef CCL_ENABLE_OFI_HMEM
@@ -468,12 +468,12 @@ atl_status_t atl_ofi::atl_init(int* argc,
     }
 
     if (ctx != nullptr)
-        atl_finalize();
+        finalize();
 
     return ATL_STATUS_FAILURE;
 }
 
-atl_status_t atl_ofi::atl_finalize() {
+atl_status_t atl_ofi::finalize() {
     is_finalized = true;
     int ret = 0;
     size_t idx;
@@ -493,13 +493,14 @@ atl_status_t atl_ofi::atl_finalize() {
     }
 
     for (idx = 0; idx < ctx->ep_count; idx++) {
-        atl_ofi_ep_t* ofi_ep = container_of(ctx->eps[idx], atl_ofi_ep_t, ep);
+        atl_ofi_ep_t* ofi_ep = container_of(eps[idx], atl_ofi_ep_t, ep);
         free(ofi_ep);
     }
 
     if (global_data.ctx_count == 0) {
         if (global_data.dlhandle) {
             dlclose(global_data.dlhandle);
+            global_data.dlhandle = nullptr;
         }
 
         if (ctx->coord.global_idx == 0) {
@@ -507,20 +508,19 @@ atl_status_t atl_ofi::atl_finalize() {
         }
     }
 
-    free(ctx->eps);
     free(ofi_ctx);
 
     return RET2ATL(ret);
 }
 
-atl_status_t atl_ofi::atl_update(std::unique_ptr<ipmi>& pmi) {
+atl_status_t atl_ofi::update(std::shared_ptr<ipmi> pmi) {
     int ret;
     size_t prov_idx;
 
     atl_ofi_ctx_t* ofi_ctx;
     ofi_ctx = container_of(ctx, atl_ofi_ctx_t, ctx);
 
-    pmi->pmrt_barrier();
+    ATL_CHECK_STATUS(pmi->pmrt_barrier(), "barrier failed");
 
     atl_ofi_reset(ctx);
     memset(&(ctx->coord), 0, sizeof(atl_proc_coord_t));
@@ -555,21 +555,21 @@ atl_status_t atl_ofi::atl_update(std::unique_ptr<ipmi>& pmi) {
             return RET2ATL(ret);
     }
 
-    pmi->pmrt_barrier();
+    ATL_CHECK_STATUS(pmi->pmrt_barrier(), "barrier failed");
 
     /* normal end of execution */
     return RET2ATL(ret);
 }
 
-atl_ep_t** atl_ofi::atl_get_eps() {
-    return ctx->eps;
+std::vector<atl_ep_t*> atl_ofi::get_eps() {
+    return eps;
 }
 
-atl_proc_coord_t* atl_ofi::atl_get_proc_coord() {
+atl_proc_coord_t* atl_ofi::get_proc_coord() {
     return &(ctx->coord);
 }
 
-atl_status_t atl_ofi::atl_mr_reg(const void* buf, size_t len, atl_mr_t** mr) {
+atl_status_t atl_ofi::mr_reg(const void* buf, size_t len, atl_mr_t** mr) {
     int ret;
     atl_ofi_ctx_t* ofi_ctx;
     ofi_ctx = container_of(ctx, atl_ofi_ctx_t, ctx);
@@ -605,7 +605,7 @@ atl_status_t atl_ofi::atl_mr_reg(const void* buf, size_t len, atl_mr_t** mr) {
     return ATL_STATUS_FAILURE;
 }
 
-atl_status_t atl_ofi::atl_mr_dereg(atl_mr_t* mr) {
+atl_status_t atl_ofi::mr_dereg(atl_mr_t* mr) {
     atl_ofi_mr_t* ofi_mr;
     ofi_mr = container_of(mr, atl_ofi_mr_t, mr);
     int ret = fi_close(&ofi_mr->fi_mr->fid);
@@ -613,12 +613,12 @@ atl_status_t atl_ofi::atl_mr_dereg(atl_mr_t* mr) {
     return RET2ATL(ret);
 }
 
-atl_status_t atl_ofi::atl_ep_send(atl_ep_t* ep,
-                                  const void* buf,
-                                  size_t len,
-                                  int dst_proc_idx,
-                                  uint64_t tag,
-                                  atl_req_t* req) {
+atl_status_t atl_ofi::send(atl_ep_t* ep,
+                           const void* buf,
+                           size_t len,
+                           int dst_proc_idx,
+                           uint64_t tag,
+                           atl_req_t* req) {
     ssize_t ret;
 
     atl_ofi_prov_t* prov;
@@ -627,14 +627,10 @@ atl_status_t atl_ofi::atl_ep_send(atl_ep_t* ep,
 
     prov = atl_ofi_get_prov(ep, dst_proc_idx, len);
     prov_ep = &(prov->eps[ep->idx]);
-    ofi_req = ((atl_ofi_req_t*)req->internal);
 
-    req->tag = tag;
-    req->remote_proc_idx = dst_proc_idx;
-    ofi_req->comp_state = ATL_OFI_COMP_POSTED;
+    atl_ofi_init_req(req, prov_ep, prov_ep->tx);
 
-    ofi_req->prov_ep = prov_ep;
-    ofi_req->fi_ep = prov_ep->tx;
+    ofi_req = ((atl_ofi_req_t*)req->internal);
 
     cache.get(ep->idx, prov->domain, const_cast<void*>(buf), len, &ofi_req->mr);
     void* desc = (ofi_req->mr) ? fi_mr_desc(ofi_req->mr) : nullptr;
@@ -658,12 +654,12 @@ atl_status_t atl_ofi::atl_ep_send(atl_ep_t* ep,
     return RET2ATL(ret);
 }
 
-atl_status_t atl_ofi::atl_ep_recv(atl_ep_t* ep,
-                                  void* buf,
-                                  size_t len,
-                                  int src_proc_idx,
-                                  uint64_t tag,
-                                  atl_req_t* req) {
+atl_status_t atl_ofi::recv(atl_ep_t* ep,
+                           void* buf,
+                           size_t len,
+                           int src_proc_idx,
+                           uint64_t tag,
+                           atl_req_t* req) {
     ssize_t ret;
 
     atl_ofi_prov_t* prov;
@@ -672,14 +668,10 @@ atl_status_t atl_ofi::atl_ep_recv(atl_ep_t* ep,
 
     prov = atl_ofi_get_prov(ep, src_proc_idx, len);
     prov_ep = &(prov->eps[ep->idx]);
-    ofi_req = ((atl_ofi_req_t*)req->internal);
 
-    req->tag = tag;
-    req->remote_proc_idx = src_proc_idx;
-    ofi_req->comp_state = ATL_OFI_COMP_POSTED;
+    atl_ofi_init_req(req, prov_ep, prov_ep->rx);
 
-    ofi_req->prov_ep = prov_ep;
-    ofi_req->fi_ep = prov_ep->rx;
+    ofi_req = ((atl_ofi_req_t*)req->internal);
 
     cache.get(ep->idx, prov->domain, const_cast<void*>(buf), len, &ofi_req->mr);
     void* desc = (ofi_req->mr) ? fi_mr_desc(ofi_req->mr) : nullptr;
@@ -703,11 +695,11 @@ atl_status_t atl_ofi::atl_ep_recv(atl_ep_t* ep,
     return RET2ATL(ret);
 }
 
-atl_status_t atl_ofi::atl_ep_probe(atl_ep_t* ep,
-                                   int src_proc_idx,
-                                   uint64_t tag,
-                                   int* found,
-                                   size_t* recv_len) {
+atl_status_t atl_ofi::probe(atl_ep_t* ep,
+                            int src_proc_idx,
+                            uint64_t tag,
+                            int* found,
+                            size_t* recv_len) {
     CCL_THROW("unexpected path");
 
     atl_status_t ret;
@@ -763,7 +755,7 @@ atl_status_t atl_ofi::atl_ep_probe(atl_ep_t* ep,
     }
 
     do {
-        ret = atl_ep_poll(ep);
+        ret = poll(ep);
         if (ret != ATL_STATUS_SUCCESS)
             return ret;
 
@@ -814,82 +806,14 @@ atl_status_t atl_ofi::atl_ep_probe(atl_ep_t* ep,
     return RET2ATL(ofi_ret);
 }
 
-atl_status_t atl_ofi::atl_ep_allgatherv(atl_ep_t* ep,
-                                        const void* send_buf,
-                                        size_t send_len,
-                                        void* recv_buf,
-                                        const int* recv_lens,
-                                        const int* offsets,
-                                        atl_req_t* req) {
-    return ATL_STATUS_UNSUPPORTED;
-}
-
-atl_status_t atl_ofi::atl_ep_allreduce(atl_ep_t* ep,
-                                       const void* send_buf,
-                                       void* recv_buf,
-                                       size_t len,
-                                       atl_datatype_t dtype,
-                                       atl_reduction_t op,
-                                       atl_req_t* req) {
-    return ATL_STATUS_UNSUPPORTED;
-}
-
-atl_status_t atl_ofi::atl_ep_alltoall(atl_ep_t* ep,
-                                      const void* send_buf,
-                                      void* recv_buf,
-                                      int len,
-                                      atl_req_t* req) {
-    return ATL_STATUS_UNSUPPORTED;
-}
-
-atl_status_t atl_ofi::atl_ep_alltoallv(atl_ep_t* ep,
-                                       const void* send_buf,
-                                       const int* send_lens,
-                                       const int* send_offsets,
-                                       void* recv_buf,
-                                       const int* recv_lens,
-                                       const int* recv_offsets,
-                                       atl_req_t* req) {
-    return ATL_STATUS_UNSUPPORTED;
-}
-
-atl_status_t atl_ofi::atl_ep_barrier(atl_ep_t* ep, atl_req_t* req) {
-    return ATL_STATUS_UNSUPPORTED;
-}
-
-atl_status_t atl_ofi::atl_ep_bcast(atl_ep_t* ep, void* buf, size_t len, int root, atl_req_t* req) {
-    return ATL_STATUS_UNSUPPORTED;
-}
-
-atl_status_t atl_ofi::atl_ep_reduce(atl_ep_t* ep,
-                                    const void* send_buf,
-                                    void* recv_buf,
-                                    size_t len,
-                                    int root,
-                                    atl_datatype_t dtype,
-                                    atl_reduction_t op,
-                                    atl_req_t* req) {
-    return ATL_STATUS_UNSUPPORTED;
-}
-
-atl_status_t atl_ofi::atl_ep_reduce_scatter(atl_ep_t* ep,
-                                            const void* send_buf,
-                                            void* recv_buf,
-                                            size_t recv_len,
-                                            atl_datatype_t dtype,
-                                            atl_reduction_t op,
-                                            atl_req_t* req) {
-    return ATL_STATUS_UNSUPPORTED;
-}
-
-atl_status_t atl_ofi::atl_ep_read(atl_ep_t* ep,
-                                  void* buf,
-                                  size_t len,
-                                  atl_mr_t* mr,
-                                  uint64_t addr,
-                                  uintptr_t remote_key,
-                                  int dst_proc_idx,
-                                  atl_req_t* req) {
+atl_status_t atl_ofi::read(atl_ep_t* ep,
+                           void* buf,
+                           size_t len,
+                           atl_mr_t* mr,
+                           uint64_t addr,
+                           uintptr_t remote_key,
+                           int dst_proc_idx,
+                           atl_req_t* req) {
     ssize_t ret;
 
     atl_ofi_prov_t* prov;
@@ -898,14 +822,10 @@ atl_status_t atl_ofi::atl_ep_read(atl_ep_t* ep,
 
     prov = atl_ofi_get_prov(ep, dst_proc_idx, len);
     prov_ep = &(prov->eps[ep->idx]);
-    ofi_req = ((atl_ofi_req_t*)req->internal);
 
-    req->tag = 0;
-    req->remote_proc_idx = dst_proc_idx;
-    ofi_req->comp_state = ATL_OFI_COMP_POSTED;
+    atl_ofi_init_req(req, prov_ep, prov_ep->tx);
 
-    ofi_req->prov_ep = prov_ep;
-    ofi_req->fi_ep = prov_ep->tx;
+    ofi_req = ((atl_ofi_req_t*)req->internal);
 
     ATL_OFI_RETRY(fi_read(prov_ep->tx,
                           buf,
@@ -920,14 +840,14 @@ atl_status_t atl_ofi::atl_ep_read(atl_ep_t* ep,
     return RET2ATL(ret);
 }
 
-atl_status_t atl_ofi::atl_ep_write(atl_ep_t* ep,
-                                   const void* buf,
-                                   size_t len,
-                                   atl_mr_t* mr,
-                                   uint64_t addr,
-                                   uintptr_t remote_key,
-                                   int dst_proc_idx,
-                                   atl_req_t* req) {
+atl_status_t atl_ofi::write(atl_ep_t* ep,
+                            const void* buf,
+                            size_t len,
+                            atl_mr_t* mr,
+                            uint64_t addr,
+                            uintptr_t remote_key,
+                            int dst_proc_idx,
+                            atl_req_t* req) {
     ssize_t ret;
 
     atl_ofi_prov_t* prov;
@@ -936,14 +856,10 @@ atl_status_t atl_ofi::atl_ep_write(atl_ep_t* ep,
 
     prov = atl_ofi_get_prov(ep, dst_proc_idx, len);
     prov_ep = &(prov->eps[ep->idx]);
-    ofi_req = ((atl_ofi_req_t*)req->internal);
 
-    req->tag = 0;
-    req->remote_proc_idx = dst_proc_idx;
-    ofi_req->comp_state = ATL_OFI_COMP_POSTED;
+    atl_ofi_init_req(req, prov_ep, prov_ep->tx);
 
-    ofi_req->prov_ep = prov_ep;
-    ofi_req->fi_ep = prov_ep->tx;
+    ofi_req = ((atl_ofi_req_t*)req->internal);
 
     ATL_OFI_RETRY(fi_write(prov_ep->tx,
                            buf,
@@ -958,7 +874,7 @@ atl_status_t atl_ofi::atl_ep_write(atl_ep_t* ep,
     return RET2ATL(ret);
 }
 
-atl_status_t atl_ofi::atl_ep_wait(atl_ep_t* ep, atl_req_t* req) {
+atl_status_t atl_ofi::wait(atl_ep_t* ep, atl_req_t* req) {
     atl_status_t ret;
     atl_ofi_req_t* ofi_req;
 
@@ -966,18 +882,18 @@ atl_status_t atl_ofi::atl_ep_wait(atl_ep_t* ep, atl_req_t* req) {
     ofi_req = ((atl_ofi_req_t*)req->internal);
 
     while ((ofi_req->comp_state != ATL_OFI_COMP_COMPLETED) &&
-           ((ret = atl_ep_poll(ep)) == ATL_STATUS_SUCCESS))
+           ((ret = poll(ep)) == ATL_STATUS_SUCCESS))
         ;
 
     return ret;
 }
 
-atl_status_t atl_ofi::atl_ep_wait_all(atl_ep_t* ep, atl_req_t* reqs, size_t count) {
+atl_status_t atl_ofi::wait_all(atl_ep_t* ep, atl_req_t* reqs, size_t count) {
     size_t i;
     atl_status_t ret;
 
     for (i = 0; i < count; i++) {
-        ret = atl_ep_wait(ep, &reqs[i]);
+        ret = wait(ep, &reqs[i]);
         if (ret != ATL_STATUS_SUCCESS)
             return ret;
     }
@@ -985,7 +901,7 @@ atl_status_t atl_ofi::atl_ep_wait_all(atl_ep_t* ep, atl_req_t* reqs, size_t coun
     return ATL_STATUS_SUCCESS;
 }
 
-atl_status_t atl_ofi::atl_ep_cancel(atl_ep_t* ep, atl_req_t* req) {
+atl_status_t atl_ofi::cancel(atl_ep_t* ep, atl_req_t* req) {
     int ret;
     atl_ofi_req_t* ofi_req;
 
@@ -1000,7 +916,7 @@ atl_status_t atl_ofi::atl_ep_cancel(atl_ep_t* ep, atl_req_t* req) {
     return ATL_STATUS_SUCCESS;
 }
 
-atl_status_t atl_ofi::atl_ep_poll(atl_ep_t* ep) {
+atl_status_t atl_ofi::poll(atl_ep_t* ep) {
     atl_ofi_ctx_t* ofi_ctx = container_of(ep->ctx, atl_ofi_ctx_t, ctx);
     if (ofi_ctx->progress_mode == ATL_PROGRESS_POLL) {
         atl_ep_progress(ep);
@@ -1008,9 +924,7 @@ atl_status_t atl_ofi::atl_ep_poll(atl_ep_t* ep) {
     return ATL_STATUS_SUCCESS;
 }
 
-atl_status_t atl_ofi::atl_ep_check(atl_ep_t* ep, int* is_completed, atl_req_t* req) {
-    CCL_THROW_IF_NOT(is_completed);
-
+atl_status_t atl_ofi::check(atl_ep_t* ep, atl_req_t* req) {
     atl_status_t status;
     atl_ofi_req_t* ofi_req;
     atl_ofi_ctx_t* ofi_ctx = container_of(ep->ctx, atl_ofi_ctx_t, ctx);
@@ -1018,14 +932,16 @@ atl_status_t atl_ofi::atl_ep_check(atl_ep_t* ep, int* is_completed, atl_req_t* r
     status = ATL_STATUS_SUCCESS;
     ofi_req = ((atl_ofi_req_t*)req->internal);
 
-    *is_completed = (ofi_req->comp_state == ATL_OFI_COMP_COMPLETED);
-    if (*is_completed) {
+    CCL_THROW_IF_NOT(!req->is_completed, "request is already completed");
+
+    req->is_completed = (ofi_req->comp_state == ATL_OFI_COMP_COMPLETED);
+    if (req->is_completed) {
         return ATL_STATUS_SUCCESS;
     }
 
     if (ofi_ctx->progress_mode == ATL_PROGRESS_CHECK) {
         status = atl_ep_progress(ep);
-        *is_completed = (ofi_req->comp_state == ATL_OFI_COMP_COMPLETED);
+        req->is_completed = (ofi_req->comp_state == ATL_OFI_COMP_COMPLETED);
     }
 
     return status;
@@ -1033,7 +949,7 @@ atl_status_t atl_ofi::atl_ep_check(atl_ep_t* ep, int* is_completed, atl_req_t* r
 
 atl_ofi::~atl_ofi() {
     if (!is_finalized) {
-        atl_finalize();
+        finalize();
     }
 }
 
diff --git a/src/atl/ofi/atl_ofi.hpp b/src/atl/ofi/atl_ofi.hpp
index a06a45648..4d7b44079 100644
--- a/src/atl/ofi/atl_ofi.hpp
+++ b/src/atl/ofi/atl_ofi.hpp
@@ -13,146 +13,86 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
+#pragma once
 #include <iostream>
 #include <memory>
 #include <rdma/fi_domain.h>
 #include <unordered_map>
 
-#include "atl.h"
 #include "atl_ofi_helper.hpp"
 #include "common/utils/hash.hpp"
 
-class atl_ofi final : public iatl {
+class atl_ofi {
 public:
     atl_ofi() = default;
 
-    ~atl_ofi() override;
+    ~atl_ofi();
 
     static atl_status_t atl_set_env(const atl_attr_t& attr);
 
-    atl_status_t atl_init(int* argc,
-                          char*** argv,
-                          atl_attr_t* attr,
-                          const char* main_addr,
-                          std::unique_ptr<ipmi>& pmi) override;
-
-    atl_status_t atl_update(std::unique_ptr<ipmi>& pmi) override;
-
-    atl_ep_t** atl_get_eps() override;
-
-    atl_proc_coord_t* atl_get_proc_coord() override;
-
-    atl_status_t atl_mr_reg(const void* buf, size_t len, atl_mr_t** mr) override;
-
-    atl_status_t atl_mr_dereg(atl_mr_t* mr) override;
-
-    atl_status_t atl_ep_send(atl_ep_t* ep,
-                             const void* buf,
-                             size_t len,
-                             int dst_proc_idx,
-                             uint64_t tag,
-                             atl_req_t* req) override;
-
-    atl_status_t atl_ep_recv(atl_ep_t* ep,
-                             void* buf,
-                             size_t len,
-                             int src_proc_idx,
-                             uint64_t tag,
-                             atl_req_t* req) override;
-
-    atl_status_t atl_ep_probe(atl_ep_t* ep,
-                              int src_proc_idx,
-                              uint64_t tag,
-                              int* found,
-                              size_t* recv_len) override;
-
-    atl_status_t atl_ep_allgatherv(atl_ep_t* ep,
-                                   const void* send_buf,
-                                   size_t send_len,
-                                   void* recv_buf,
-                                   const int* recv_lens,
-                                   const int* offsets,
-                                   atl_req_t* req) override;
-
-    atl_status_t atl_ep_allreduce(atl_ep_t* ep,
-                                  const void* send_buf,
-                                  void* recv_buf,
-                                  size_t len,
-                                  atl_datatype_t dtype,
-                                  atl_reduction_t op,
-                                  atl_req_t* req) override;
-
-    atl_status_t atl_ep_alltoall(atl_ep_t* ep,
-                                 const void* send_buf,
-                                 void* recv_buf,
-                                 int len,
-                                 atl_req_t* req) override;
-
-    atl_status_t atl_ep_alltoallv(atl_ep_t* ep,
-                                  const void* send_buf,
-                                  const int* send_lens,
-                                  const int* send_offsets,
-                                  void* recv_buf,
-                                  const int* recv_lens,
-                                  const int* recv_offsets,
-                                  atl_req_t* req) override;
-
-    atl_status_t atl_ep_barrier(atl_ep_t* ep, atl_req_t* req) override;
-
-    atl_status_t atl_ep_bcast(atl_ep_t* ep,
-                              void* buf,
-                              size_t len,
-                              int root,
-                              atl_req_t* req) override;
-
-    atl_status_t atl_ep_reduce(atl_ep_t* ep,
-                               const void* send_buf,
-                               void* recv_buf,
-                               size_t len,
-                               int root,
-                               atl_datatype_t dtype,
-                               atl_reduction_t op,
-                               atl_req_t* req) override;
-
-    atl_status_t atl_ep_reduce_scatter(atl_ep_t* ep,
-                                       const void* send_buf,
-                                       void* recv_buf,
-                                       size_t recv_len,
-                                       atl_datatype_t dtype,
-                                       atl_reduction_t op,
-                                       atl_req_t* req) override;
-
-    atl_status_t atl_ep_read(atl_ep_t* ep,
-                             void* buf,
-                             size_t len,
-                             atl_mr_t* mr,
-                             uint64_t addr,
-                             uintptr_t remote_key,
-                             int dst_proc_idx,
-                             atl_req_t* req) override;
-
-    atl_status_t atl_ep_write(atl_ep_t* ep,
-                              const void* buf,
-                              size_t len,
-                              atl_mr_t* mr,
-                              uint64_t addr,
-                              uintptr_t remote_key,
-                              int dst_proc_idx,
-                              atl_req_t* req) override;
-
-    atl_status_t atl_ep_wait(atl_ep_t* ep, atl_req_t* req) override;
-
-    atl_status_t atl_ep_wait_all(atl_ep_t* ep, atl_req_t* req, size_t count) override;
-
-    atl_status_t atl_ep_cancel(atl_ep_t* ep, atl_req_t* req) override;
-
-    atl_status_t atl_ep_poll(atl_ep_t* ep) override;
-
-    atl_status_t atl_ep_check(atl_ep_t* ep, int* is_completed, atl_req_t* req) override;
-
-    atl_status_t atl_finalize() override;
-
-    bool is_inited() override {
+    atl_status_t init(int* argc,
+                      char*** argv,
+                      atl_attr_t* attr,
+                      const char* main_addr,
+                      std::shared_ptr<ipmi> pmi);
+
+    atl_status_t update(std::shared_ptr<ipmi> pmi);
+
+    std::vector<atl_ep_t*> get_eps();
+
+    atl_proc_coord_t* get_proc_coord();
+
+    atl_status_t mr_reg(const void* buf, size_t len, atl_mr_t** mr);
+
+    atl_status_t mr_dereg(atl_mr_t* mr);
+
+    atl_status_t send(atl_ep_t* ep,
+                      const void* buf,
+                      size_t len,
+                      int dst_proc_idx,
+                      uint64_t tag,
+                      atl_req_t* req);
+
+    atl_status_t recv(atl_ep_t* ep,
+                      void* buf,
+                      size_t len,
+                      int src_proc_idx,
+                      uint64_t tag,
+                      atl_req_t* req);
+
+    atl_status_t probe(atl_ep_t* ep, int src_proc_idx, uint64_t tag, int* found, size_t* recv_len);
+
+    atl_status_t read(atl_ep_t* ep,
+                      void* buf,
+                      size_t len,
+                      atl_mr_t* mr,
+                      uint64_t addr,
+                      uintptr_t remote_key,
+                      int dst_proc_idx,
+                      atl_req_t* req);
+
+    atl_status_t write(atl_ep_t* ep,
+                       const void* buf,
+                       size_t len,
+                       atl_mr_t* mr,
+                       uint64_t addr,
+                       uintptr_t remote_key,
+                       int dst_proc_idx,
+                       atl_req_t* req);
+
+    atl_status_t wait(atl_ep_t* ep, atl_req_t* req);
+
+    atl_status_t wait_all(atl_ep_t* ep, atl_req_t* req, size_t count);
+
+    atl_status_t cancel(atl_ep_t* ep, atl_req_t* req);
+
+    atl_status_t poll(atl_ep_t* ep);
+
+    atl_status_t check(atl_ep_t* ep, atl_req_t* req);
+
+    atl_status_t finalize();
+
+    bool is_inited() {
         return inited;
     }
 
@@ -162,6 +102,7 @@ class atl_ofi final : public iatl {
     atl_status_t atl_prov_ep_handle_cq_err(atl_ofi_prov_ep_t* ep);
 
     atl_ctx_t* ctx = nullptr;
+    std::vector<atl_ep_t*> eps;
 
     class mr_cache {
     public:
diff --git a/src/atl/ofi/atl_ofi_comm.cpp b/src/atl/ofi/atl_ofi_comm.cpp
new file mode 100644
index 000000000..63a86f063
--- /dev/null
+++ b/src/atl/ofi/atl_ofi_comm.cpp
@@ -0,0 +1,226 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "atl/ofi/atl_ofi_comm.hpp"
+#include "atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.h"
+#include "atl/util/pm/pmi_rt/pmi_simple.h"
+#include "atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.h"
+#include "atl/util/pm/pmi_resizable_rt/pmi_resizable.h"
+#include "atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.h"
+#include "atl/ofi/atl_ofi.hpp"
+#include "exec/exec.hpp"
+
+std::atomic<size_t> atl_ofi_comm::comm_count{ 0 }; // incremented in init_transport(), decremented in ~atl_ofi_comm()
+atl_ofi* atl_ofi_comm::transport{ nullptr }; // one transport for all communicators; created in init_transport()
+
+atl_ofi_comm::~atl_ofi_comm() { // the last communicator to be destroyed also deletes the static transport
+    static std::mutex memory_mutex; // NOTE(review): not the same object as memory_mutex in init_transport() - confirm create/delete cannot race
+    std::lock_guard<std::mutex> lock(memory_mutex);
+    tag.reset();
+    comm_count--;
+    if (comm_count.load() == 0) {
+        delete transport; // ~atl_ofi() calls finalize() itself when the transport is not finalized yet
+        transport = nullptr; // lets init_transport() allocate a fresh transport later
+    }
+}
+
+atl_ofi_comm::atl_ofi_comm() { // picks the PMI backend named by the PM_TYPE env variable, then brings up the transport
+    char* pm_type_str = getenv(PM_TYPE);
+
+    if (pm_type_str) {
+        if (strstr(pm_type_str, PM_RT_VAL_SIMPLE)) {
+            pmi = std::shared_ptr<ipmi>(new pmi_simple());
+        }
+        else if (strstr(pm_type_str, PM_RT_VAL_RESIZABLE)) {
+            std::shared_ptr<ikvs_wrapper> k(new internal_kvs()); // no store supplied by the caller: use a fresh internal one
+            pmi = std::shared_ptr<ipmi>(new pmi_resizable(k));
+        }
+        else { // pmi is still null here and init_transport() dereferences it, so fail instead of only logging
+            CCL_THROW("unknown ", PM_TYPE, ": ", pm_type_str);
+        }
+    }
+    else { // PM_TYPE is not set: fall back to pmi_simple
+        pmi = std::shared_ptr<ipmi>(new pmi_simple());
+    }
+
+    CCL_THROW_IF_NOT(init_transport(true) == ATL_STATUS_SUCCESS, "init transport failed");
+}
+
+atl_ofi_comm::atl_ofi_comm(std::shared_ptr<ikvs_wrapper> k) { // like the default ctor, but pmi_resizable uses the caller's store k
+    char* pm_type_str = getenv(PM_TYPE);
+
+    if (pm_type_str) {
+        if (strstr(pm_type_str, PM_RT_VAL_SIMPLE)) {
+            pmi = std::shared_ptr<ipmi>(new pmi_simple()); // k is not used on this path
+        }
+        else if (strstr(pm_type_str, PM_RT_VAL_RESIZABLE)) {
+            pmi = std::shared_ptr<ipmi>(new pmi_resizable(k));
+        }
+        else { // pmi is still null here and init_transport() dereferences it, so fail instead of only logging
+            CCL_THROW("unknown ", PM_TYPE, ": ", pm_type_str);
+        }
+    }
+    else { // PM_TYPE is not set: fall back to pmi_simple
+        pmi = std::shared_ptr<ipmi>(new pmi_simple());
+    }
+
+    CCL_THROW_IF_NOT(init_transport(true) == ATL_STATUS_SUCCESS, "init transport failed");
+}
+
+atl_ofi_comm::atl_ofi_comm(int total_rank_count,
+                           const std::vector<int>& ranks,
+                           std::shared_ptr<ikvs_wrapper> k) {
+    auto internal_store = std::dynamic_pointer_cast<internal_kvs>(k); // null unless k really is an internal_kvs
+    if (!internal_store) {
+        pmi = std::shared_ptr<ipmi>(new pmi_resizable_simple(total_rank_count, ranks, k));
+    }
+    else {
+        pmi = std::shared_ptr<ipmi>(
+            new pmi_resizable_simple_internal(total_rank_count, ranks, internal_store));
+    }
+
+    CCL_THROW_IF_NOT(init_transport(true) == ATL_STATUS_SUCCESS, "init transport failed");
+}
+atl_status_t atl_ofi_comm::init_transport(bool is_new) { // is_new: true from the public ctors, false from the split ctor
+    LOG_DEBUG("init ATL, requested ep_count ", attr.in.ep_count);
+
+    if (is_new) {
+        ATL_CHECK_STATUS(pmi->pmrt_init(), "pmi init failed");
+        static std::mutex memory_mutex; // serializes lazy creation and init of the static transport
+        {
+            std::lock_guard<std::mutex> lock(memory_mutex);
+            if (!transport) {
+                transport = new atl_ofi();
+            }
+            if (!transport->is_inited()) { // only the first communicator initializes the transport
+                CCL_THROW_IF_NOT(
+                    transport->init(nullptr, nullptr, &attr, nullptr, pmi) == ATL_STATUS_SUCCESS,
+                    "failed to initialize ATL");
+
+                if (pmi->get_rank() == 0) {
+                    print_atl_attrs();
+                }
+            }
+        }
+        eps = transport->get_eps();
+        coord = *transport->get_proc_coord(); // copy, so a split communicator can later change its own coord
+
+        parent_rank = rank = pmi->get_rank();
+        parent_size = size = pmi->get_size();
+
+        rank2rank_map.resize(size); // identity map: on a non-split communicator rank i maps to rank i
+        for (int i = 0; i < size; i++) {
+            rank2rank_map[i] = i;
+        }
+    }
+
+    threads_per_process = pmi->get_threads_per_process();
+    ranks_per_process = pmi->get_ranks_per_process();
+    comm_count++; // paired with the decrement in ~atl_ofi_comm()
+
+    init_tag();
+
+    if (pmi->get_local_thread_idx() == 0) {
+        executor_update();
+    }
+
+    return ATL_STATUS_SUCCESS;
+}
+
+std::shared_ptr<atl_base_comm> atl_ofi_comm::comm_split(int color) { // new communicator built with this object as parent
+    return std::shared_ptr<atl_base_comm>(new atl_ofi_comm(this, color)); // uses the private (parent, color) constructor
+}
+
+atl_ofi_comm::atl_ofi_comm(atl_ofi_comm* parent, int color) { // split ctor: ranks of parent that pass the same color form the new communicator
+    eps = parent->eps;
+    parent_size = parent->parent_size;
+    parent_rank = parent->parent_rank;
+    pmi = parent->pmi;
+    // host identity comes from the transport; the local counters are rebuilt in the loop below
+    coord.hostname_hash = transport->get_proc_coord()->hostname_hash;
+    coord.local_count = 0;
+    coord.local_idx = 0;
+    // one slot per rank of parent: that is exactly the range rank_info_exchange() fills, so no default entries remain
+    std::vector<rank_info_t> ranks_info(parent->get_size());
+    rank_info_t rank_info{ color, parent_rank, coord.hostname_hash };
+    parent->rank_info_exchange(ranks_info, rank_info);
+
+    size = 0;
+    for (auto& it : ranks_info) {
+        int recv_color;
+        int recv_rank;
+        size_t recv_hash;
+        std::tie(recv_color, recv_rank, recv_hash) = it;
+        if (recv_color == color) { // this peer joins the same new communicator
+            rank2rank_map.push_back(recv_rank);
+
+            if (recv_hash == coord.hostname_hash) {
+                coord.local_count++;
+            }
+            if (recv_rank == parent_rank) { // this entry is the calling rank itself
+                coord.global_idx = rank = size;
+                coord.local_idx = coord.local_count - 1; // local_count already includes this rank, so subtract 1 for a zero-based index
+            }
+            size++;
+        }
+    }
+    coord.global_count = size;
+    CCL_THROW_IF_NOT(init_transport(false) == ATL_STATUS_SUCCESS, "init transport failed");
+}
+
+void atl_ofi_comm::rank_info_exchange(std::vector<rank_info_t>& ranks_info, rank_info_t rank_info) { // all-to-all exchange of rank_info; fills ranks_info[0..size)
+    std::vector<atl_req> send_reqs(size - 1); // one request per peer, i.e. every rank except this one
+    std::vector<atl_req> recv_reqs(size - 1);
+    const size_t ep_idx = 0; // the whole exchange runs on endpoint 0
+
+    for (int i = 0, j = 0; i < size; i++) { // i: peer rank, j: slot in the request vectors
+        if (i == rank)
+            continue;
+        atl_status_t ret;
+        do { // repost while the transport answers ATL_STATUS_AGAIN
+            ret =
+                send(ep_idx, &rank_info, sizeof(rank_info_t), i, rank * 1000 + i, &(send_reqs[j])); // tag = sender * 1000 + receiver; NOTE(review): presumably assumes fewer than 1000 ranks - confirm
+            CCL_THROW_IF_NOT(ret != ATL_STATUS_FAILURE, "send failed");
+            ccl_yield(ccl::global_data::env().yield_type);
+        } while (ret == ATL_STATUS_AGAIN);
+
+        do {
+            ret = recv(
+                ep_idx, &ranks_info[i], sizeof(rank_info_t), i, i * 1000 + rank, &(recv_reqs[j])); // same tag formula as peer i uses when sending to this rank
+            CCL_THROW_IF_NOT(ret != ATL_STATUS_FAILURE, "recv failed");
+            ccl_yield(ccl::global_data::env().yield_type);
+        } while (ret == ATL_STATUS_AGAIN);
+        j++;
+    }
+
+    ranks_info[rank] = rank_info; // own entry is filled locally, nothing is sent to self
+    bool is_completed = false;
+    while (!is_completed) { // loop until one full pass finds every request already completed
+        is_completed = true;
+        poll(ep_idx);
+        for (size_t i = 0; i < send_reqs.size(); i++) {
+            if (!send_reqs[i].is_completed) { // check() is only called on requests not yet marked completed
+                CCL_THROW_IF_NOT(check(ep_idx, &(send_reqs[i])) != ATL_STATUS_FAILURE,
+                                 "check send failed");
+                is_completed = false; // forces another pass even if check() just completed the request
+            }
+            if (!recv_reqs[i].is_completed) {
+                CCL_THROW_IF_NOT(check(ep_idx, &(recv_reqs[i])) != ATL_STATUS_FAILURE,
+                                 "check recv failed");
+                is_completed = false;
+            }
+        }
+    }
+}
diff --git a/src/atl/ofi/atl_ofi_comm.hpp b/src/atl/ofi/atl_ofi_comm.hpp
new file mode 100644
index 000000000..068a43584
--- /dev/null
+++ b/src/atl/ofi/atl_ofi_comm.hpp
@@ -0,0 +1,245 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "atl/atl_base_comm.hpp"
+#include "atl/ofi/atl_ofi.hpp"
+
+class atl_ofi_comm : public atl_base_comm { // communicator that forwards its calls to one static atl_ofi transport
+public:
+    ~atl_ofi_comm() override;
+    atl_ofi_comm();
+    atl_ofi_comm(std::shared_ptr<ikvs_wrapper> k);
+    atl_ofi_comm(int total_rank_count,
+                 const std::vector<int>& ranks,
+                 std::shared_ptr<ikvs_wrapper> k);
+    // Forwards to pmi->pmrt_main_addr_reserve().
+    atl_status_t main_addr_reserve(char* main_addr) override {
+        return pmi->pmrt_main_addr_reserve(main_addr);
+    }
+    // Finalizes PMI (checked by ATL_CHECK_STATUS), then the static transport.
+    atl_status_t finalize() override {
+        ATL_CHECK_STATUS(pmi->pmrt_finalize(), "failed to finalize PMI");
+
+        return transport->finalize();
+    }
+    // Forwards to the transport, handing it this communicator's PMI object.
+    atl_status_t update() override {
+        return transport->update(pmi);
+    }
+    // Forwards to pmi->pmrt_wait_notification().
+    atl_status_t wait_notification() override {
+        return pmi->pmrt_wait_notification();
+    }
+    // Forwards the resize callback to PMI.
+    atl_status_t set_resize_function(atl_resize_fn_t fn) override {
+        return pmi->pmrt_set_resize_function(fn);
+    }
+    // Memory registration and deregistration are forwarded to the transport unchanged.
+    atl_status_t mr_reg(const void* buf, size_t len, atl_mr_t** mr) override {
+        return transport->mr_reg(buf, len, mr);
+    }
+
+    atl_status_t mr_dereg(atl_mr_t* mr) override {
+        return transport->mr_dereg(mr);
+    }
+    // Point-to-point calls: ep_idx selects eps[ep_idx]; the peer rank is translated through rank2rank_map.
+    atl_status_t send(size_t ep_idx,
+                      const void* buf,
+                      size_t len,
+                      int dst_proc_idx,
+                      uint64_t tag,
+                      atl_req_t* req) override {
+        return transport->send(eps[ep_idx], buf, len, rank2rank_map[dst_proc_idx], tag, req);
+    }
+
+    atl_status_t recv(size_t ep_idx,
+                      void* buf,
+                      size_t len,
+                      int src_proc_idx,
+                      uint64_t tag,
+                      atl_req_t* req) override {
+        return transport->recv(eps[ep_idx], buf, len, rank2rank_map[src_proc_idx], tag, req);
+    }
+
+    atl_status_t probe(size_t ep_idx,
+                       int src_proc_idx,
+                       uint64_t tag,
+                       int* found,
+                       size_t* recv_len) override {
+        return transport->probe(eps[ep_idx], rank2rank_map[src_proc_idx], tag, found, recv_len);
+    }
+    // Collectives are not provided by this communicator: each one below returns ATL_STATUS_UNSUPPORTED.
+    atl_status_t allgatherv(size_t ep_idx,
+                            const void* send_buf,
+                            size_t send_len,
+                            void* recv_buf,
+                            const int* recv_lens,
+                            const int* offsets,
+                            atl_req_t* req) override {
+        return ATL_STATUS_UNSUPPORTED;
+    }
+
+    atl_status_t allreduce(size_t ep_idx,
+                           const void* send_buf,
+                           void* recv_buf,
+                           size_t len,
+                           atl_datatype_t dtype,
+                           atl_reduction_t op,
+                           atl_req_t* req) override {
+        return ATL_STATUS_UNSUPPORTED;
+    }
+
+    atl_status_t alltoall(size_t ep_idx,
+                          const void* send_buf,
+                          void* recv_buf,
+                          int len,
+                          atl_req_t* req) override {
+        return ATL_STATUS_UNSUPPORTED;
+    }
+
+    atl_status_t alltoallv(size_t ep_idx,
+                           const void* send_buf,
+                           const int* send_lens,
+                           const int* send_offsets,
+                           void* recv_buf,
+                           const int* recv_lens,
+                           const int* recv_offsets,
+                           atl_req_t* req) override {
+        return ATL_STATUS_UNSUPPORTED;
+    }
+
+    atl_status_t barrier(size_t ep_idx, atl_req_t* req) override {
+        return ATL_STATUS_UNSUPPORTED;
+    }
+
+    atl_status_t bcast(size_t ep_idx, void* buf, size_t len, int root, atl_req_t* req) override {
+        return ATL_STATUS_UNSUPPORTED;
+    }
+
+    atl_status_t reduce(size_t ep_idx,
+                        const void* send_buf,
+                        void* recv_buf,
+                        size_t len,
+                        int root,
+                        atl_datatype_t dtype,
+                        atl_reduction_t op,
+                        atl_req_t* req) override {
+        return ATL_STATUS_UNSUPPORTED;
+    }
+
+    atl_status_t reduce_scatter(size_t ep_idx,
+                                const void* send_buf,
+                                void* recv_buf,
+                                size_t recv_len,
+                                atl_datatype_t dtype,
+                                atl_reduction_t op,
+                                atl_req_t* req) override {
+        return ATL_STATUS_UNSUPPORTED;
+    }
+    // One-sided read/write: forwarded like send/recv, with the peer rank translated through rank2rank_map.
+    atl_status_t read(size_t ep_idx,
+                      void* buf,
+                      size_t len,
+                      atl_mr_t* mr,
+                      uint64_t addr,
+                      uintptr_t remote_key,
+                      int dst_proc_idx,
+                      atl_req_t* req) override {
+        return transport->read(
+            eps[ep_idx], buf, len, mr, addr, remote_key, rank2rank_map[dst_proc_idx], req);
+    }
+
+    atl_status_t write(size_t ep_idx,
+                       const void* buf,
+                       size_t len,
+                       atl_mr_t* mr,
+                       uint64_t addr,
+                       uintptr_t remote_key,
+                       int dst_proc_idx,
+                       atl_req_t* req) override {
+        return transport->write(
+            eps[ep_idx], buf, len, mr, addr, remote_key, rank2rank_map[dst_proc_idx], req);
+    }
+    // Completion and progress calls are forwarded to the transport on eps[ep_idx].
+    atl_status_t wait(size_t ep_idx, atl_req_t* req) override {
+        return transport->wait(eps[ep_idx], req);
+    }
+
+    atl_status_t wait_all(size_t ep_idx, atl_req_t* req, size_t count) override {
+        return transport->wait_all(eps[ep_idx], req, count);
+    }
+
+    atl_status_t cancel(size_t ep_idx, atl_req_t* req) override {
+        return transport->cancel(eps[ep_idx], req);
+    }
+
+    atl_status_t poll(size_t ep_idx) override {
+        return transport->poll(eps[ep_idx]);
+    }
+
+    atl_status_t check(size_t ep_idx, atl_req_t* req) override {
+        return transport->check(eps[ep_idx], req);
+    }
+    // Plain accessors for state stored on this communicator.
+    size_t get_threads_per_process() override {
+        return threads_per_process;
+    }
+
+    size_t get_ranks_per_process() override {
+        return ranks_per_process;
+    }
+
+    int get_rank() override {
+        return rank;
+    }
+
+    int get_size() override {
+        return size;
+    }
+
+    int get_r2r_color() override {
+        return coord.local_idx;
+    }
+    // NOTE(review): hostname_hash is handled as size_t in atl_ofi_comm.cpp, so returning it as int may truncate - confirm.
+    int get_host_color() override {
+        return coord.hostname_hash;
+    }
+
+    /*
+     * TODO: Temporary change.
+     * Need to define a correct unique id.
+     */
+    size_t get_id() override {
+        return 0;
+    }
+    // Defined in atl_ofi_comm.cpp: builds the new communicator through the private (parent, color) constructor.
+    std::shared_ptr<atl_base_comm> comm_split(int color) override;
+    // Returns a copy of the rank translation table.
+    std::vector<int> get_rank2rank_map() override {
+        return rank2rank_map;
+    }
+
+private:
+    static atl_ofi* transport; // shared by every atl_ofi_comm object
+    std::vector<atl_ep_t*> eps; // endpoints obtained from transport->get_eps() or copied from the parent
+    static std::atomic<size_t> comm_count; // number of live communicators; guards deletion of transport
+    // Used by comm_split(): builds the communicator of the ranks that passed the same color.
+    atl_ofi_comm(atl_ofi_comm* parent, int color);
+    atl_status_t init_transport(bool is_new);
+    using rank_info_t = std::tuple<int, int, size_t>; // (color, parent_rank, hostname_hash) as built by the split constructor
+    void rank_info_exchange(std::vector<rank_info_t>& ranks_info, rank_info_t rank_info);
+};
diff --git a/src/atl/ofi/atl_ofi_helper.cpp b/src/atl/ofi/atl_ofi_helper.cpp
index 7218da8d1..8caa2fd53 100644
--- a/src/atl/ofi/atl_ofi_helper.cpp
+++ b/src/atl/ofi/atl_ofi_helper.cpp
@@ -84,6 +84,50 @@ std::string atl_ofi_get_nic_name(const struct fi_info* prov) {
     return ss.str();
 }
 
+const char* atl_ofi_link_state_str(enum fi_link_state state) {
+    /* printable name of a libfabric link state; anything else is "unknown" */
+    if (state == FI_LINK_DOWN) {
+        return "down";
+    }
+    return (state == FI_LINK_UP) ? "up" : "unknown";
+}
+
+std::string atl_ofi_get_nic_info(const struct fi_info* prov) {
+    /* one-line "{ name ..., <link attributes> }" description of the provider's nic */
+    std::stringstream info;
+
+    info << "{ " << "name " << atl_ofi_get_nic_name(prov);
+
+    const auto* link = prov->nic ? prov->nic->link_attr : nullptr;
+    if (!link) {
+        info << ", no link attr";
+    }
+    else {
+        info << ", state " << atl_ofi_link_state_str(link->state);
+
+        if (link->mtu) {
+            info << ", mtu " << link->mtu << " bytes";
+        }
+
+        if (link->speed) {
+            const float bits_to_gbytes_coef = 8.0 * 1000 * 1000 * 1000;
+            info << ", speed " << (float)link->speed / bits_to_gbytes_coef << " GB/s";
+        }
+
+        if (link->address) {
+            info << ", address " << link->address;
+        }
+
+        if (link->network_type) {
+            info << ", network_type " << link->network_type;
+        }
+    }
+
+    info << " }";
+
+    return info.str();
+}
+
 atl_ofi_prov_t* atl_ofi_get_prov(atl_ep_t* ep, int peer_proc_idx, size_t msg_size) {
     size_t prov_idx;
     atl_ofi_ctx_t* ofi_ctx = container_of(ep->ctx, atl_ofi_ctx_t, ctx);
@@ -107,9 +151,11 @@ atl_ofi_prov_t* atl_ofi_get_prov(atl_ep_t* ep, int peer_proc_idx, size_t msg_siz
         prov_idx = ofi_ctx->nw_prov_first_idx + nw_prov_offset;
     }
 
-    LOG_DEBUG("get_prov: ep_idx ",
+    LOG_DEBUG("select nic: ep_idx ",
               ep->idx,
-              ", prov_idx ",
+              ", local_proc_idx ",
+              coord->local_idx,
+              ", nic_idx ",
               prov_idx,
               ", my_node_idx ",
               my_node_idx,
@@ -136,7 +182,7 @@ fi_addr_t atl_ofi_get_addr(atl_ctx_t* ctx, atl_ofi_prov_t* prov, int proc_idx, s
     return *(prov->addr_table + ((ctx->ep_count * (proc_idx - prov->first_proc_idx)) + ep_idx));
 }
 
-atl_status_t atl_ofi_get_local_proc_coord(atl_ofi_ctx_t* ofi_ctx, std::unique_ptr<ipmi>& pmi) {
+atl_status_t atl_ofi_get_local_proc_coord(atl_ofi_ctx_t* ofi_ctx, std::shared_ptr<ipmi> pmi) {
     CCL_THROW_IF_NOT(ofi_ctx, "ofi_ctx is null");
 
     atl_proc_coord_t* coord = &(ofi_ctx->ctx.coord);
@@ -178,7 +224,7 @@ atl_status_t atl_ofi_get_local_proc_coord(atl_ofi_ctx_t* ofi_ctx, std::unique_pt
         goto fn_err;
     }
 
-    pmi->pmrt_barrier();
+    ATL_CHECK_STATUS(pmi->pmrt_barrier(), "barrier failed");
 
     all_hostnames = (char*)calloc(1, coord->global_count * ATL_MAX_HOSTNAME_LEN);
     if (!all_hostnames) {
@@ -225,7 +271,7 @@ atl_status_t atl_ofi_get_local_proc_coord(atl_ofi_ctx_t* ofi_ctx, std::unique_pt
 
 atl_status_t atl_ofi_prov_update_addr_table(atl_ofi_ctx_t* ofi_ctx,
                                             size_t prov_idx,
-                                            std::unique_ptr<ipmi>& pmi) {
+                                            std::shared_ptr<ipmi> pmi) {
     CCL_THROW_IF_NOT(ofi_ctx, "ofi_ctx is null");
 
     atl_ctx_t* ctx = &(ofi_ctx->ctx);
@@ -286,7 +332,7 @@ atl_status_t atl_ofi_prov_update_addr_table(atl_ofi_ctx_t* ofi_ctx,
         return ATL_STATUS_FAILURE;
     }
 
-    pmi->pmrt_barrier();
+    ATL_CHECK_STATUS(pmi->pmrt_barrier(), "barrier failed");
 
     /* retrieve all OFI EP names in order */
     for (i = 0; i < ctx->coord.global_count; i++) {
@@ -445,7 +491,7 @@ atl_status_t atl_ofi_prov_ep_get_name(atl_ofi_prov_t* prov, size_t ep_idx) {
 
 atl_status_t atl_ofi_prov_eps_connect(atl_ofi_ctx_t* ofi_ctx,
                                       size_t prov_idx,
-                                      std::unique_ptr<ipmi>& pmi) {
+                                      std::shared_ptr<ipmi> pmi) {
     int ret;
     size_t ep_idx;
 
@@ -811,6 +857,8 @@ atl_status_t atl_ofi_set_env(const atl_attr_t& attr) {
     setenv("FI_SHM_DISABLE_CMA", "1", 0);
 #endif // CCL_ENABLE_SYCL
 
+    setenv("FI_MLX_MULTI_EP", "1", 0);
+
     atl_ofi_adjust_env(attr);
 
     /*
@@ -889,7 +937,7 @@ atl_status_t atl_ofi_prov_init(atl_ctx_t* ctx,
                                struct fi_info* info,
                                atl_ofi_prov_t* prov,
                                atl_attr_t* attr,
-                               std::unique_ptr<ipmi>& pmi) {
+                               std::shared_ptr<ipmi> pmi) {
     struct fi_av_attr av_attr;
     size_t ep_idx = 0;
     ssize_t ret = 0;
@@ -900,7 +948,7 @@ atl_status_t atl_ofi_prov_init(atl_ctx_t* ctx,
 
     if (ctx->coord.global_idx == 0) {
         LOG_INFO("provider: ", info->fabric_attr->prov_name);
-        LOG_INFO("  nic: ", atl_ofi_get_nic_name(info));
+        LOG_INFO("  nic: ", atl_ofi_get_nic_info(info));
         LOG_INFO("  mr_mode: ", info->domain_attr->mr_mode);
         LOG_INFO("  threading: ", info->domain_attr->threading);
         LOG_INFO("  tx_ctx_cnt: ", info->domain_attr->tx_ctx_cnt);
@@ -983,15 +1031,15 @@ atl_status_t atl_ofi_adjust_out_tag(atl_ofi_prov_t* prov, atl_attr_t* attr) {
 
     const char* prov_name = prov->info->fabric_attr->prov_name;
 
-    CCL_THROW_IF_NOT(attr->out.tag_bits > 0,
-                     "unexpected tag_bits ",
-                     attr->out.tag_bits,
-                     " for prov ",
-                     prov_name);
-
-    CCL_THROW_IF_NOT(
-        attr->out.max_tag > 0, "unexpected max_tag ", attr->out.max_tag, " for prov ", prov_name);
+    if (!(attr->out.tag_bits > 0)) {
+        LOG_ERROR("unexpected tag_bits ", attr->out.tag_bits, " for prov ", prov_name);
+        return ATL_STATUS_FAILURE;
+    }
 
+    if (!(attr->out.max_tag > 0)) {
+        LOG_ERROR("unexpected max_tag ", attr->out.max_tag, " for prov ", prov_name);
+        return ATL_STATUS_FAILURE;
+    }
     LOG_INFO(prov_name,
              " tag_bits: ",
              attr->out.tag_bits,
@@ -1003,12 +1051,21 @@ atl_status_t atl_ofi_adjust_out_tag(atl_ofi_prov_t* prov, atl_attr_t* attr) {
     return ATL_STATUS_SUCCESS;
 }
 
+static bool atl_ofi_is_nic_down(struct fi_info* prov) {
+    /* Returns true only when the provider reports a NIC whose link is down.
+     * link_attr may be absent (atl_ofi_get_nic_info checks it as well), so it
+     * is tested before the state is read to avoid a null dereference. */
+    const struct fid_nic* nic = prov->nic;
+    return nic && nic->link_attr && nic->link_attr->state == FI_LINK_DOWN;
+}
+
 /* determine if NIC has already been included in others */
 int atl_ofi_nic_already_used(const struct fi_info* prov,
-                             struct fi_info** others,
-                             size_t nic_count) {
-    for (size_t i = 0; i < nic_count; i++) {
-        if (prov->nic && others[i]->nic && prov->nic->bus_attr->bus_type == FI_BUS_PCI &&
+                             const std::vector<struct fi_info*>& others,
+                             bool check_pci = false) {
+    for (size_t i = 0; i < others.size(); i++) {
+        if (check_pci && prov->nic && others[i]->nic &&
+            prov->nic->bus_attr->bus_type == FI_BUS_PCI &&
             others[i]->nic->bus_attr->bus_type == FI_BUS_PCI) {
             struct fi_pci_attr pci = prov->nic->bus_attr->attr.pci;
             struct fi_pci_attr other_pci = others[i]->nic->bus_attr->attr.pci;
@@ -1164,23 +1221,31 @@ int atl_ofi_is_allowed_nic_name(atl_ofi_ctx_t* ofi_ctx, struct fi_info* info) {
     return (should_include && !should_exclude);
 }
 
+bool atl_ofi_compare_nics(const struct fi_info* nic1, const struct fi_info* nic2) {
+    /* sort predicate: entries with nic info go first, ties are ordered by short name */
+    const bool has_nic1 = (nic1->nic != nullptr);
+    const bool has_nic2 = (nic2->nic != nullptr);
+    if (has_nic1 != has_nic2) {
+        return has_nic1;
+    }
+    return (atl_ofi_get_short_nic_name(nic1) < atl_ofi_get_short_nic_name(nic2));
+}
+
 atl_status_t atl_ofi_open_nw_provs(atl_ctx_t* ctx,
                                    struct fi_info* base_hints,
                                    atl_attr_t* attr,
-                                   std::unique_ptr<ipmi>& pmi) {
+                                   std::shared_ptr<ipmi> pmi) {
     atl_status_t ret = ATL_STATUS_SUCCESS;
     struct fi_info* prov_list = nullptr;
     struct fi_info* prov_iter = nullptr;
     size_t idx = 0, prov_idx = 0;
     char* prov_name = nullptr;
     atl_ofi_prov_t* prov = nullptr;
-    size_t name_prov_count = 0;
-    size_t topo_prov_count = 0;
-    size_t final_prov_count = 0;
-    struct fi_info* name_prov_list[ATL_OFI_MAX_NW_PROV_COUNT] = { 0 };
-    struct fi_info* topo_prov_list[ATL_OFI_MAX_NW_PROV_COUNT] = { 0 };
-    struct fi_info* final_prov_list[ATL_OFI_MAX_NW_PROV_COUNT] = { 0 };
+    std::vector<struct fi_info*> name_provs;
+    std::vector<struct fi_info*> topo_provs;
+    std::vector<struct fi_info*> final_provs;
     std::set<std::string> all_nic_names;
+    int prov_offset = 0;
 
     atl_ofi_ctx_t* ofi_ctx = container_of(ctx, atl_ofi_ctx_t, ctx);
 
@@ -1197,18 +1262,23 @@ atl_status_t atl_ofi_open_nw_provs(atl_ctx_t* ctx,
     prov_iter = prov_list;
     while (prov_iter) {
         LOG_DEBUG("name filter: check nic ", atl_ofi_get_nic_name(prov_iter));
-        if (!atl_ofi_nic_already_used(prov_iter, name_prov_list, name_prov_count)) {
+        if (atl_ofi_is_nic_down(prov_iter)) {
+            LOG_DEBUG("nic ", atl_ofi_get_nic_name(prov_iter), " is in down state, skip");
+        }
+        else if (!atl_ofi_nic_already_used(prov_iter, name_provs)) {
             all_nic_names.insert(atl_ofi_get_short_nic_name(prov_iter));
             if (atl_ofi_is_allowed_nic_name(ofi_ctx, prov_iter)) {
                 LOG_DEBUG("name filter: found suitable nic ", atl_ofi_get_nic_name(prov_iter));
-                name_prov_list[name_prov_count] = fi_dupinfo(prov_iter);
-                name_prov_count++;
+                name_provs.push_back(fi_dupinfo(prov_iter));
             }
         }
         prov_iter = prov_iter->next;
     }
 
-    if (!name_prov_count) {
+    /* sort by names */
+    std::sort(name_provs.begin(), name_provs.end(), atl_ofi_compare_nics);
+
+    if (name_provs.empty()) {
         LOG_ERROR("name filter: can not find network providers",
                   ", include names: ",
                   vec_to_string(ofi_ctx->mnic_include_names),
@@ -1221,13 +1291,12 @@ atl_status_t atl_ofi_open_nw_provs(atl_ctx_t* ctx,
 
     /* 3. filter out by topo */
     if (ofi_ctx->mnic_type == ATL_MNIC_NONE) {
-        topo_prov_list[topo_prov_count] = fi_dupinfo(name_prov_list[0]);
-        topo_prov_count++;
+        topo_provs.push_back(fi_dupinfo(name_provs[0]));
     }
     else {
         struct fid_nic* nic = nullptr;
-        for (idx = 0; idx < name_prov_count; idx++) {
-            prov_iter = name_prov_list[idx];
+        for (idx = 0; idx < name_provs.size(); idx++) {
+            prov_iter = name_provs[idx];
             LOG_DEBUG("topo filter: check nic ", atl_ofi_get_nic_name(prov_iter));
             nic = prov_iter->nic;
 
@@ -1236,15 +1305,14 @@ atl_status_t atl_ofi_open_nw_provs(atl_ctx_t* ctx,
                       ", has nic_attr ",
                       (nic != nullptr));
 
-            if (!atl_ofi_nic_already_used(prov_iter, topo_prov_list, topo_prov_count)) {
+            if (!atl_ofi_nic_already_used(prov_iter, topo_provs)) {
                 int is_local = atl_ofi_is_nic_local(prov_iter);
                 LOG_DEBUG(
                     "topo filter: nic ", atl_ofi_get_nic_name(prov_iter), ", is_local ", is_local);
                 if (ofi_ctx->mnic_type == ATL_MNIC_GLOBAL ||
                     (ofi_ctx->mnic_type == ATL_MNIC_LOCAL && is_local)) {
                     LOG_DEBUG("topo filter: found suitable nic ", atl_ofi_get_nic_name(prov_iter));
-                    topo_prov_list[topo_prov_count] = fi_dupinfo(prov_iter);
-                    topo_prov_count++;
+                    topo_provs.push_back(fi_dupinfo(prov_iter));
                 }
             }
             else {
@@ -1253,58 +1321,64 @@ atl_status_t atl_ofi_open_nw_provs(atl_ctx_t* ctx,
         }
     }
 
-    if (!topo_prov_count) {
+    if (topo_provs.empty()) {
         LOG_ERROR("topo filter: can not find network providers, mnic_type ", ofi_ctx->mnic_type);
         goto err;
     }
 
-    /* 4. filter out by count */
-    for (idx = 0; idx < topo_prov_count; idx++) {
-        prov_iter = topo_prov_list[idx];
+    /* 4. reorder according to desired offset */
+    if (ofi_ctx->mnic_offset == ATL_MNIC_OFFSET_LOCAL_PROC_IDX) {
+        prov_offset = ctx->coord.local_idx % topo_provs.size();
+    }
+    LOG_DEBUG("rotate: prov_offset ", prov_offset, ", vec_size ", topo_provs.size());
+    std::rotate(topo_provs.begin(), topo_provs.begin() + prov_offset, topo_provs.end());
+
+    /* 5. filter out by count */
+    for (idx = 0; idx < topo_provs.size(); idx++) {
+        prov_iter = topo_provs[idx];
         LOG_DEBUG("count filter: check nic ", atl_ofi_get_nic_name(prov_iter));
-        if (final_prov_count < ofi_ctx->mnic_count) {
+        if (final_provs.size() < ofi_ctx->mnic_count) {
             LOG_DEBUG("count filter: found suitable nic ",
                       atl_ofi_get_nic_name(prov_iter),
                       ", nic idx ",
-                      final_prov_count);
-            final_prov_list[final_prov_count] = fi_dupinfo(prov_iter);
-            final_prov_count++;
+                      final_provs.size());
+            final_provs.push_back(fi_dupinfo(prov_iter));
         }
         else {
             break;
         }
     }
 
-    if (!final_prov_count) {
+    if (final_provs.empty()) {
         LOG_ERROR("count filter: can not find network providers, mnic_count ", ofi_ctx->mnic_count);
         goto err;
     }
 
-    /* 5. create network providers */
-    LOG_INFO("found ", final_prov_count, " nic(s) according to all filters");
-    ofi_ctx->nw_prov_count = final_prov_count;
+    /* 6. create network providers */
+    LOG_INFO("found ", final_provs.size(), " nic(s) according to all filters");
+    ofi_ctx->nw_prov_count = final_provs.size();
     for (idx = 0; idx < ofi_ctx->nw_prov_count; idx++) {
         prov_idx = ofi_ctx->nw_prov_first_idx + idx;
         prov = &ofi_ctx->provs[prov_idx];
         prov->idx = prov_idx;
         prov->is_shm = 0;
-        ATL_CALL(atl_ofi_prov_init(ctx, final_prov_list[idx], prov, attr, pmi), goto err);
+        ATL_CALL(atl_ofi_prov_init(ctx, final_provs[idx], prov, attr, pmi), goto err);
     }
 
 exit:
-    for (idx = 0; idx < final_prov_count; idx++) {
-        if (final_prov_list[idx])
-            fi_freeinfo(final_prov_list[idx]);
+    for (idx = 0; idx < final_provs.size(); idx++) {
+        if (final_provs[idx])
+            fi_freeinfo(final_provs[idx]);
     }
 
-    for (idx = 0; idx < topo_prov_count; idx++) {
-        if (topo_prov_list[idx])
-            fi_freeinfo(topo_prov_list[idx]);
+    for (idx = 0; idx < topo_provs.size(); idx++) {
+        if (topo_provs[idx])
+            fi_freeinfo(topo_provs[idx]);
     }
 
-    for (idx = 0; idx < name_prov_count; idx++) {
-        if (name_prov_list[idx])
-            fi_freeinfo(name_prov_list[idx]);
+    for (idx = 0; idx < name_provs.size(); idx++) {
+        if (name_provs[idx])
+            fi_freeinfo(name_provs[idx]);
     }
 
     fi_freeinfo(prov_list);
@@ -1318,3 +1392,11 @@ atl_status_t atl_ofi_open_nw_provs(atl_ctx_t* ctx,
     ret = ATL_STATUS_FAILURE;
     goto exit;
 }
+
+void atl_ofi_init_req(atl_req_t* req, atl_ofi_prov_ep_t* prov_ep, struct fid_ep* fi_ep) {
+    req->is_completed = 0; /* a freshly initialized request is not completed */
+    atl_ofi_req_t* internal_req = (atl_ofi_req_t*)req->internal;
+    internal_req->comp_state = ATL_OFI_COMP_POSTED;
+    internal_req->fi_ep = fi_ep;
+    internal_req->prov_ep = prov_ep;
+}
diff --git a/src/atl/ofi/atl_ofi_helper.hpp b/src/atl/ofi/atl_ofi_helper.hpp
index 59e776e24..c8cc040af 100644
--- a/src/atl/ofi/atl_ofi_helper.hpp
+++ b/src/atl/ofi/atl_ofi_helper.hpp
@@ -34,11 +34,11 @@
 #include <unistd.h>
 #include <errno.h>
 
-#include "atl.h"
+#include "atl/util/pm/pm_rt.h"
 #include "common/global/global.hpp"
 #include "hwloc/hwloc_wrapper.hpp"
 #ifdef CCL_ENABLE_OFI_HMEM
-#include "sched/entry/gpu/ze_primitives.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
 #endif // CCL_ENABLE_OFI_HMEM
 
 #define ATL_OFI_BASE_PM_KEY     "atl-ofi"
@@ -113,7 +113,7 @@
                 CCL_THROW("OFI function error"); \
                 break; \
             } \
-            (void)atl_ep_poll(ep); \
+            (void)poll(ep); \
             retry_count++; \
         } while (((ret_val) == -FI_EAGAIN) && (retry_count < max_retry_count)); \
     } while (0)
@@ -213,6 +213,7 @@ typedef struct {
     std::vector<std::string> mnic_include_names;
     std::vector<std::string> mnic_exclude_names;
     size_t mnic_count;
+    atl_mnic_offset_t mnic_offset;
     int enable_hmem;
 } atl_ofi_ctx_t;
 
@@ -275,14 +276,14 @@ std::string atl_ofi_get_short_nic_name(const struct fi_info* prov);
 std::string atl_ofi_get_nic_name(const struct fi_info* prov);
 atl_ofi_prov_t* atl_ofi_get_prov(atl_ep_t* ep, int peer_proc_idx, size_t msg_size);
 fi_addr_t atl_ofi_get_addr(atl_ctx_t* ctx, atl_ofi_prov_t* prov, int proc_idx, size_t ep_idx);
-atl_status_t atl_ofi_get_local_proc_coord(atl_ofi_ctx_t* ofi_ctx, std::unique_ptr<ipmi>& pmi);
+atl_status_t atl_ofi_get_local_proc_coord(atl_ofi_ctx_t* ofi_ctx, std::shared_ptr<ipmi> pmi);
 atl_status_t atl_ofi_prov_update_addr_table(atl_ofi_ctx_t* ofi_ctx,
                                             size_t prov_idx,
-                                            std::unique_ptr<ipmi>& pmi);
+                                            std::shared_ptr<ipmi> pmi);
 atl_status_t atl_ofi_prov_ep_get_name(atl_ofi_prov_t* prov, size_t ep_idx);
 atl_status_t atl_ofi_prov_eps_connect(atl_ofi_ctx_t* ofi_ctx,
                                       size_t prov_idx,
-                                      std::unique_ptr<ipmi>& pmi);
+                                      std::shared_ptr<ipmi> pmi);
 void atl_ofi_prov_ep_destroy(atl_ofi_prov_t* prov, atl_ofi_prov_ep_t* ep);
 void atl_ofi_prov_destroy(atl_ctx_t* ctx, atl_ofi_prov_t* prov);
 int atl_ofi_wait_cancel_cq(struct fid_cq* cq);
@@ -300,13 +301,12 @@ atl_status_t atl_ofi_prov_init(atl_ctx_t* ctx,
                                struct fi_info* info,
                                atl_ofi_prov_t* prov,
                                atl_attr_t* attr,
-                               std::unique_ptr<ipmi>& pmi);
+                               std::shared_ptr<ipmi> pmi);
 atl_status_t atl_ofi_adjust_out_tag(atl_ofi_prov_t* prov, atl_attr_t* attr);
-int atl_ofi_nic_already_used(const struct fi_info* prov, struct fi_info** others, size_t nic_count);
-int atl_ofi_is_nic_local(struct fi_info* info);
 atl_status_t atl_ofi_parse_mnic_name(atl_ctx_t* ctx, std::string str_to_parse);
 int atl_ofi_is_allowed_nic_name(atl_ofi_ctx_t* ofi_ctx, struct fi_info* info);
 atl_status_t atl_ofi_open_nw_provs(atl_ctx_t* ctx,
                                    struct fi_info* base_hints,
                                    atl_attr_t* attr,
-                                   std::unique_ptr<ipmi>& pmi);
+                                   std::shared_ptr<ipmi> pmi);
+void atl_ofi_init_req(atl_req_t* req, atl_ofi_prov_ep_t* prov_ep, struct fid_ep* fi_ep);
diff --git a/src/atl/util/pm/pm_rt.h b/src/atl/util/pm/pm_rt.h
index ac328fddc..e896b4d52 100644
--- a/src/atl/util/pm/pm_rt.h
+++ b/src/atl/util/pm/pm_rt.h
@@ -148,7 +148,7 @@ static inline atl_status_t pmrt_kvs_get(pm_rt_desc_t *pmrt_desc,
 #ifdef __cplusplus
 class ipmi {
 public:
-    virtual ~ipmi() = default;
+    virtual ~ipmi() noexcept(false){};
 
     virtual int is_pm_resize_enabled() = 0;
 
@@ -160,9 +160,9 @@ class ipmi {
 
     virtual atl_status_t pmrt_wait_notification() = 0;
 
-    virtual void pmrt_finalize() = 0;
+    virtual atl_status_t pmrt_finalize() = 0;
 
-    virtual void pmrt_barrier() = 0;
+    virtual atl_status_t pmrt_barrier() = 0;
 
     virtual atl_status_t pmrt_kvs_put(char *kvs_key,
                                       int proc_idx,
@@ -180,13 +180,15 @@ class ipmi {
 
     virtual size_t get_local_thread_idx() = 0;
 
-    virtual size_t get_local_kvs_id() = 0;
+    virtual atl_status_t get_local_kvs_id(size_t &res) = 0;
 
-    virtual void set_local_kvs_id(size_t local_kvs_id) = 0;
+    virtual atl_status_t set_local_kvs_id(size_t local_kvs_id) = 0;
 
     virtual size_t get_threads_per_process() = 0;
 
     virtual size_t get_ranks_per_process() = 0;
+
+    virtual atl_status_t pmrt_init() = 0;
 };
 #endif
 #endif // PM_RT_H
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.cpp
index ec1f9ba30..b642f3b5d 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.cpp
@@ -26,52 +26,54 @@ int pmi_resizable::is_pm_resize_enabled() {
     return true;
 }
 
-atl_status_t pmi_resizable::pmrt_init(const char *main_addr) {
-    int ret;
+atl_status_t pmi_resizable::pmrt_init() {
+    kvs_status_t ret;
     size_t max_kvsnamelen;
 
-    ret = PMIR_Init(main_addr);
-    if (ret != PMIR_SUCCESS)
-        return ATL_STATUS_FAILURE;
+    KVS_2_ATL_CHECK_STATUS(PMIR_Init(main_addr.data()), "failed to init");
 
-    ret = PMIR_Update();
-    if (ret != PMIR_SUCCESS)
-        return ATL_STATUS_FAILURE;
+    KVS_2_ATL_CHECK_STATUS(PMIR_Update(), "failed to update");
 
     ret = PMIR_Get_size(&size);
-    if (ret != PMIR_SUCCESS)
+    if (ret != KVS_STATUS_SUCCESS)
         goto err_resizable;
     ret = PMIR_Get_rank(&rank);
-    if (ret != PMIR_SUCCESS)
+    if (ret != KVS_STATUS_SUCCESS)
         goto err_resizable;
 
     ret = PMIR_KVS_Get_name_length_max(&max_kvsnamelen);
-    if (ret != PMIR_SUCCESS)
+    if (ret != KVS_STATUS_SUCCESS)
         goto err_resizable;
 
     kvsname = (char *)calloc(1, max_kvsnamelen);
-    if (!kvsname)
+    if (!kvsname) {
+        LOG_ERROR("memory allocation failed");
         goto err_resizable;
+    }
 
     ret = PMIR_KVS_Get_my_name(kvsname, max_kvsnamelen);
-    if (ret != PMIR_SUCCESS)
+    if (ret != KVS_STATUS_SUCCESS)
         goto err_alloc_key;
 
     ret = PMIR_KVS_Get_key_length_max(&max_keylen);
-    if (ret != PMIR_SUCCESS)
+    if (ret != KVS_STATUS_SUCCESS)
         goto err_alloc_key;
 
     key_storage = (char *)calloc(1, max_keylen);
-    if (!key_storage)
+    if (!key_storage) {
+        LOG_ERROR("memory allocation failed");
         goto err_alloc_key;
+    }
 
     ret = PMIR_KVS_Get_value_length_max(&max_vallen);
-    if (ret != PMIR_SUCCESS)
+    if (ret != KVS_STATUS_SUCCESS)
         goto err_alloc_val;
 
     val_storage = (char *)calloc(1, max_vallen);
-    if (!val_storage)
+    if (!val_storage) {
+        LOG_ERROR("memory allocation failed");
         goto err_alloc_val;
+    }
 
     initialized = true;
 
@@ -82,76 +84,80 @@ atl_status_t pmi_resizable::pmrt_init(const char *main_addr) {
     free(kvsname);
 err_resizable:
     PMIR_Finalize();
+    LOG_ERROR("failed");
     return ATL_STATUS_FAILURE;
 }
 
 atl_status_t pmi_resizable::pmrt_main_addr_reserve(char *main_addr) {
-    int ret = PMIR_Main_Addr_Reserve(main_addr);
-
-    if (ret)
+    if (PMIR_Main_Addr_Reserve(main_addr) != KVS_STATUS_SUCCESS)
         return ATL_STATUS_FAILURE;
 
     return ATL_STATUS_SUCCESS;
 }
 
 atl_status_t pmi_resizable::pmrt_set_resize_function(atl_resize_fn_t resize_fn) {
-    int ret = PMIR_set_resize_function((pmir_resize_fn_t)resize_fn);
-
-    if (ret)
+    if (PMIR_set_resize_function((pmir_resize_fn_t)resize_fn) != KVS_STATUS_SUCCESS)
         return ATL_STATUS_FAILURE;
 
     return ATL_STATUS_SUCCESS;
 }
 
 atl_status_t pmi_resizable::pmrt_update() {
-    int ret;
+    kvs_status_t ret;
     ret = PMIR_Update();
-    if (ret != PMIR_SUCCESS)
+    if (ret != KVS_STATUS_SUCCESS)
         goto err_resizable;
 
     ret = PMIR_Get_size(&size);
-    if (ret != PMIR_SUCCESS)
+    if (ret != KVS_STATUS_SUCCESS)
         goto err_resizable;
 
     ret = PMIR_Get_rank(&rank);
-    if (ret != PMIR_SUCCESS)
+    if (ret != KVS_STATUS_SUCCESS)
         goto err_resizable;
 
     return ATL_STATUS_SUCCESS;
 
 err_resizable:
     PMIR_Finalize();
+    LOG_ERROR("failed");
     return ATL_STATUS_FAILURE;
 }
 
 atl_status_t pmi_resizable::pmrt_wait_notification() {
-    int ret;
+    kvs_status_t ret;
 
     ret = PMIR_Wait_notification();
 
-    if (ret != PMIR_SUCCESS)
+    if (ret != KVS_STATUS_SUCCESS)
         return ATL_STATUS_FAILURE;
 
     return ATL_STATUS_SUCCESS;
 }
 
-void pmi_resizable::pmrt_finalize() {
+atl_status_t pmi_resizable::pmrt_finalize() {
     is_finalized = true;
     if (!initialized)
-        return;
+        return ATL_STATUS_SUCCESS;
 
     free(kvsname);
     free(key_storage);
     free(val_storage);
 
-    PMIR_Finalize();
+    if (PMIR_Finalize() != KVS_STATUS_SUCCESS) {
+        return ATL_STATUS_FAILURE;
+    }
+    return ATL_STATUS_SUCCESS;
 }
 
-void pmi_resizable::pmrt_barrier() {
+atl_status_t pmi_resizable::pmrt_barrier() {
     if (!initialized)
-        return;
+        return ATL_STATUS_SUCCESS;
 
-    PMIR_Barrier();
+    if (PMIR_Barrier() != KVS_STATUS_SUCCESS) {
+        return ATL_STATUS_FAILURE;
+    }
+    return ATL_STATUS_SUCCESS;
 }
 
 atl_status_t pmi_resizable::pmrt_kvs_put(char *kvs_key,
@@ -160,28 +166,31 @@ atl_status_t pmi_resizable::pmrt_kvs_put(char *kvs_key,
                                          size_t kvs_val_len) {
     int ret;
 
-    if (!initialized)
+    if (!initialized) {
+        LOG_ERROR("not initialized yet");
         return ATL_STATUS_FAILURE;
+    }
 
-    if (kvs_val_len > max_vallen)
+    if (kvs_val_len > max_vallen) {
+        LOG_ERROR("asked len > max len");
         return ATL_STATUS_FAILURE;
+    }
 
     ret = snprintf(key_storage, max_keylen - 1, RESIZABLE_PMI_RT_KEY_FORMAT, kvs_key, proc_idx);
-    if (ret < 0)
+    if (ret < 0) {
+        LOG_ERROR("snprintf failed");
         return ATL_STATUS_FAILURE;
+    }
 
     ret = encode(kvs_val, kvs_val_len, val_storage, max_vallen);
-    if (ret)
+    if (ret) {
+        LOG_ERROR("encode failed");
         return ATL_STATUS_FAILURE;
+    }
 
-    ret = PMIR_KVS_Put(kvsname, key_storage, val_storage);
-    if (ret != PMIR_SUCCESS)
-        return ATL_STATUS_FAILURE;
-
-    ret = PMIR_KVS_Commit(kvsname);
-    if (ret != PMIR_SUCCESS)
-        return ATL_STATUS_FAILURE;
+    KVS_2_ATL_CHECK_STATUS(PMIR_KVS_Put(kvsname, key_storage, val_storage), "put failed");
 
+    KVS_2_ATL_CHECK_STATUS(PMIR_KVS_Commit(kvsname), "commit failed");
     return ATL_STATUS_SUCCESS;
 }
 
@@ -191,20 +200,25 @@ atl_status_t pmi_resizable::pmrt_kvs_get(char *kvs_key,
                                          size_t kvs_val_len) {
     int ret;
 
-    if (!initialized)
+    if (!initialized) {
+        LOG_ERROR("not initialized yet");
         return ATL_STATUS_FAILURE;
+    }
 
     ret = snprintf(key_storage, max_keylen - 1, RESIZABLE_PMI_RT_KEY_FORMAT, kvs_key, proc_idx);
-    if (ret < 0)
+    if (ret < 0) {
+        LOG_ERROR("snprintf failed");
         return ATL_STATUS_FAILURE;
+    }
 
-    ret = PMIR_KVS_Get(kvsname, key_storage, val_storage, max_vallen);
-    if (ret != PMIR_SUCCESS)
-        return ATL_STATUS_FAILURE;
+    KVS_2_ATL_CHECK_STATUS(PMIR_KVS_Get(kvsname, key_storage, val_storage, max_vallen),
+                           "get failed");
 
     ret = decode(val_storage, kvs_val, kvs_val_len);
-    if (ret)
+    if (ret) {
+        LOG_ERROR("decode failed");
         return ATL_STATUS_FAILURE;
+    }
 
     return ATL_STATUS_SUCCESS;
 }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.h
index 8ec4023d0..d41302548 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.h
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.h
@@ -48,10 +48,9 @@ class helper;
 class pmi_resizable final : public ipmi {
 public:
     pmi_resizable() = delete;
-    explicit pmi_resizable(std::shared_ptr<ikvs_wrapper> k, const char* main_addr = nullptr) {
+    explicit pmi_resizable(std::shared_ptr<ikvs_wrapper> k, const char* main_addr = "")
+            : main_addr(main_addr ? main_addr : "") { // nullptr (the old default) maps to ""
         h = std::shared_ptr<helper>(new helper(k));
-        //TODO: move it in one func
-        pmrt_init(main_addr);
     }
 
     ~pmi_resizable() override;
@@ -66,7 +65,7 @@ class pmi_resizable final : public ipmi {
 
     atl_status_t pmrt_wait_notification() override;
 
-    void pmrt_barrier() override;
+    atl_status_t pmrt_barrier() override;
 
     atl_status_t pmrt_kvs_put(char* kvs_key,
                               int proc_idx,
@@ -78,7 +77,7 @@ class pmi_resizable final : public ipmi {
                               void* kvs_val,
                               size_t kvs_val_len) override;
 
-    void Hard_finilize(int sig);
+    kvs_status_t hard_finalize(int sig);
 
     int get_rank() override;
 
@@ -86,9 +85,9 @@ class pmi_resizable final : public ipmi {
 
     size_t get_local_thread_idx() override;
 
-    size_t get_local_kvs_id() override;
+    atl_status_t get_local_kvs_id(size_t& res) override;
 
-    void set_local_kvs_id(size_t local_kvs_id) override;
+    atl_status_t set_local_kvs_id(size_t local_kvs_id) override;
 
     size_t get_threads_per_process() override {
         return 1;
@@ -97,49 +96,51 @@ class pmi_resizable final : public ipmi {
     size_t get_ranks_per_process() override {
         return 1;
     }
-    void pmrt_finalize() override;
+    atl_status_t pmrt_finalize() override;
+
+    atl_status_t pmrt_init() override;
 
 private:
     bool is_finalized{ false };
-    atl_status_t pmrt_init(const char* main_addr = nullptr);
     /*Was in API ->*/
-    int PMIR_Main_Addr_Reserve(char* main_addr);
+    kvs_status_t PMIR_Main_Addr_Reserve(char* main_addr);
 
-    int PMIR_Init(const char* main_addr);
+    kvs_status_t PMIR_Init(const char* main_addr);
 
-    int PMIR_Finalize(void);
+    kvs_status_t PMIR_Finalize(void);
 
-    int PMIR_Get_size(int* size);
+    kvs_status_t PMIR_Get_size(int* size);
 
-    int PMIR_Get_rank(int* rank);
+    kvs_status_t PMIR_Get_rank(int* rank);
 
-    int PMIR_KVS_Get_my_name(char* kvs_name, size_t length);
+    kvs_status_t PMIR_KVS_Get_my_name(char* kvs_name, size_t length);
 
-    int PMIR_KVS_Get_name_length_max(size_t* length);
+    kvs_status_t PMIR_KVS_Get_name_length_max(size_t* length);
 
-    int PMIR_Barrier(void);
+    kvs_status_t PMIR_Barrier(void);
 
-    int PMIR_Update(void);
+    kvs_status_t PMIR_Update(void);
 
-    int PMIR_KVS_Get_key_length_max(size_t* length);
+    kvs_status_t PMIR_KVS_Get_key_length_max(size_t* length);
 
-    int PMIR_KVS_Get_value_length_max(size_t* length);
+    kvs_status_t PMIR_KVS_Get_value_length_max(size_t* length);
 
-    int PMIR_KVS_Put(const char* kvs_name, const char* key, const char* value);
+    kvs_status_t PMIR_KVS_Put(const char* kvs_name, const char* key, const char* value);
 
-    int PMIR_KVS_Commit(const char* kvs_name);
+    kvs_status_t PMIR_KVS_Commit(const char* kvs_name);
 
-    int PMIR_KVS_Get(const char* kvs_name, const char* key, char* value, size_t length);
+    kvs_status_t PMIR_KVS_Get(const char* kvs_name, const char* key, char* value, size_t length);
 
-    int PMIR_set_resize_function(pmir_resize_fn_t resize_fn);
+    kvs_status_t PMIR_set_resize_function(pmir_resize_fn_t resize_fn);
 
-    int PMIR_Wait_notification(void);
+    kvs_status_t PMIR_Wait_notification(void);
     /* <- Was in API*/
     kvs_resize_action_t default_checker(int comm_size);
     kvs_resize_action_t call_resize_fn(int comm_size);
 
     int rank;
     int size;
+    std::string main_addr;
 
     pmir_resize_fn_t resize_function = nullptr;
     std::shared_ptr<helper> h;
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/def.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/def.h
index 98b06efc6..e6fa97330 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/def.h
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/def.h
@@ -24,13 +24,22 @@
 
 #include "common/log/log.hpp"
 
+typedef enum { KVS_STATUS_SUCCESS, KVS_STATUS_FAILURE, KVS_STATUS_UNSUPPORTED } kvs_status_t;
+
+#define KVS_CHECK_STATUS(expr, str) \
+    do { /* evaluate expr once; on any non-success status log str and fail the caller */ \
+        if ((expr) != KVS_STATUS_SUCCESS) { \
+            LOG_ERROR(str); \
+            return KVS_STATUS_FAILURE; \
+        } \
+    } while (0)
+
 //TODO: change exit to something more useful
 #define SET_STR(dst, size, ...) \
     do { \
         if (snprintf(dst, size, __VA_ARGS__) > size) { \
-            printf("line too long (must be shorter %d)\n", size); \
-            printf(__VA_ARGS__); \
-            exit(1); \
+            LOG_ERROR("line too long, must be shorter ", size); \
+            return KVS_STATUS_FAILURE; \
         } \
     } while (0)
 
@@ -38,8 +47,8 @@
     do { \
         char* res = expr; \
         if (!res || res != str) { \
-            printf("fgets error\n"); \
-            exit(EXIT_FAILURE); \
+            LOG_ERROR("fgets error: ", strerror(errno)); \
+            return KVS_STATUS_FAILURE; \
         } \
     } while (0)
 
@@ -61,15 +70,16 @@
                                buf, \
                                size, \
                                shift); \
-                        perror("read/write error"); \
-                        exit(EXIT_FAILURE); \
+                        LOG_ERROR("read/write error: ", strerror(errno)); \
+                        return KVS_STATUS_FAILURE; \
                     } \
                 } \
                 else if (res == 0) { \
-                    printf("" #msg ": " #op ": can not process all data, size %zu, shift %zu\n", \
-                           size, \
-                           shift); \
-                    exit(EXIT_FAILURE); \
+                    LOG_ERROR("" #msg ": " #op \
+                              ": can not process all data, size %zu, shift %zu\n", \
+                              size, \
+                              shift); \
+                    return KVS_STATUS_FAILURE; \
                 } \
                 else { \
                     shift += res; \
@@ -94,8 +104,8 @@
                            buf, \
                            size, \
                            shift); \
-                    perror("read/write error"); \
-                    exit(EXIT_FAILURE); \
+                    LOG_ERROR("read/write error: ", strerror(errno)); \
+                    return KVS_STATUS_FAILURE; \
                 } \
             } \
             else { \
@@ -159,21 +169,23 @@ void inline kvs_str_copy_known_sizes(char* dst, const char* src, size_t bytes) {
     dst[bytes - 1] = '\0';
 }
 
-long int inline safe_strtol(const char* str, char** endptr, int base) {
+template <typename T>
+kvs_status_t inline safe_strtol(const char* str, T& val) {
     errno = 0;
-    auto val = strtol(str, endptr, base);
+    val = strtol(str, nullptr, 10);
 
     if (errno != 0) {
         if (errno == EINVAL) {
-            CCL_THROW("conversion error occurred from: ", str);
+            LOG_ERROR("conversion error occurred from: ", str);
         }
         else if (errno == ERANGE) {
-            CCL_THROW("the value provided was out of range: ", str);
+            LOG_ERROR("the value provided was out of range: ", str);
         }
         else {
-            CCL_THROW("strtol error: ", strerror(errno), ", str: ", str);
+            LOG_ERROR("strtol error: ", strerror(errno), ", str: ", str);
         }
+        return KVS_STATUS_FAILURE;
     }
 
-    return val;
+    return KVS_STATUS_SUCCESS;
 }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.cpp
index 4aa2e298f..e5516cf06 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.cpp
@@ -24,14 +24,16 @@ size_t barrier_num = 0;
 size_t up_idx;
 size_t applied = 0;
 
-rank_list_t* killed_ranks = NULL;
+std::list<int> killed_ranks;
 int killed_ranks_count = 0;
 
-rank_list_t* new_ranks = NULL;
+std::list<int> new_ranks;
 int new_ranks_count = 0;
 
-size_t helper::replace_str(char* str, int old_rank, int new_rank) {
-    throw std::runtime_error("unexpected path");
+kvs_status_t helper::replace_str(char* str, int old_rank, int new_rank) {
+    //    throw std::runtime_error("unexpected path");
+    LOG_ERROR("unexpected path");
+    return KVS_STATUS_FAILURE;
 
     char old_str[INT_STR_SIZE];
     char new_str[INT_STR_SIZE];
@@ -43,8 +45,10 @@ size_t helper::replace_str(char* str, int old_rank, int new_rank) {
     SET_STR(new_str, INT_STR_SIZE, RANK_TEMPLATE, new_rank);
 
     point_to_replace = strstr(str, old_str);
-    if (point_to_replace == NULL)
-        return 1;
+    if (point_to_replace == NULL) {
+        LOG_ERROR("not found old rank(", old_rank, ") in str(", str, ")");
+        return KVS_STATUS_FAILURE;
+    }
 
     old_str_size = strlen(old_str);
     new_str_size = strlen(new_str);
@@ -54,25 +58,31 @@ size_t helper::replace_str(char* str, int old_rank, int new_rank) {
         memmove(point_to_replace + new_str_size, point_to_replace + old_str_size, rest_len);
     }
     memcpy(point_to_replace, new_str, new_str_size);
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-void helper::update_ranks(int* old_count, rank_list_t** origin_list, const char* kvs_name) {
+kvs_status_t helper::update_ranks(int* old_count,
+                                  std::list<int>& origin_list,
+                                  const char* kvs_name) {
     char** rank_nums = NULL;
-    size_t rank_count = get_keys_values_by_name(kvs_name, NULL, &rank_nums);
+    size_t rank_count;
+    KVS_CHECK_STATUS(get_keys_values_by_name(kvs_name, NULL, &rank_nums, rank_count),
+                     "failed to get values by name");
     size_t i;
     size_t cur_count = 0;
 
     if (rank_count == 0) {
         // *old_count = 0;
-        return;
+        return KVS_STATUS_SUCCESS;
     }
-
+    int rank_num;
     for (i = 0; i < rank_count; i++) {
-        if (rank_list_contains(*origin_list, safe_strtol(rank_nums[i], NULL, 10)))
+        KVS_CHECK_STATUS(safe_strtol(rank_nums[i], rank_num), "failed to convert rank_num");
+
+        if (std::find(origin_list.begin(), origin_list.end(), rank_num) != origin_list.end())
             continue;
 
-        rank_list_add(origin_list, safe_strtol(rank_nums[i], NULL, 10));
+        origin_list.push_back(rank_num);
         cur_count++;
     }
 
@@ -82,91 +92,100 @@ void helper::update_ranks(int* old_count, rank_list_t** origin_list, const char*
     free(rank_nums);
 
     *old_count += cur_count;
+    return KVS_STATUS_SUCCESS;
 }
 
 void helper::keep_first_n_up(int prev_new_ranks_count, int prev_killed_ranks_count) {
-    rank_list_keep_first_n(&killed_ranks, prev_killed_ranks_count);
-    rank_list_keep_first_n(&new_ranks, prev_new_ranks_count);
+    killed_ranks.resize(prev_killed_ranks_count);
+    new_ranks.resize(prev_new_ranks_count);
 }
 
-void helper::get_update_ranks(void) {
-    update_ranks(&killed_ranks_count, &killed_ranks, KVS_APPROVED_DEAD_POD);
-    update_ranks(&new_ranks_count, &new_ranks, KVS_APPROVED_NEW_POD);
+kvs_status_t helper::get_update_ranks(void) {
+    KVS_CHECK_STATUS(update_ranks(&killed_ranks_count, killed_ranks, KVS_APPROVED_DEAD_POD),
+                     "failed to update killed ranks");
+    KVS_CHECK_STATUS(update_ranks(&new_ranks_count, new_ranks, KVS_APPROVED_NEW_POD),
+                     "failed to update new ranks");
+    return KVS_STATUS_SUCCESS;
 }
 
-void helper::get_shift(shift_list_t** list) {
+void helper::get_shift(std::list<shift_rank_t>& list) {
     int shift_pods_count = 0;
     int max_rank_survivor_pod = count_pods;
-    rank_list_t* cur_new = new_ranks;
-    rank_list_t* cur_killed = killed_ranks;
-
-    if (killed_ranks != NULL)
-        rank_list_sort(killed_ranks);
-    if (new_ranks != NULL)
-        rank_list_sort(new_ranks);
-
-    while (cur_killed != NULL) {
-        if (cur_new != NULL) {
-            shift_list_add(list, cur_killed->rank, cur_killed->rank, CH_T_UPDATE);
-            cur_new = cur_new->next;
+    new_ranks.sort();
+    killed_ranks.sort();
+    auto cur_new = new_ranks.begin();
+    auto cur_killed = killed_ranks.begin();
+
+    while (cur_killed != killed_ranks.end()) {
+        if (cur_new != new_ranks.end()) {
+            list.push_back({ *cur_killed, *cur_killed, CH_T_UPDATE });
+            cur_new++;
         }
         else {
-            while (rank_list_contains(cur_killed, max_rank_survivor_pod - shift_pods_count - 1) ==
-                   1)
+            while (std::find(cur_killed,
+                             killed_ranks.end(),
+                             max_rank_survivor_pod - shift_pods_count - 1) != killed_ranks.end()) {
                 max_rank_survivor_pod--;
+            }
 
-            if (cur_killed->rank < max_rank_survivor_pod - shift_pods_count) {
-                shift_list_add(list,
-                               max_rank_survivor_pod - shift_pods_count - 1,
-                               cur_killed->rank,
-                               CH_T_SHIFT);
+            if (*cur_killed < max_rank_survivor_pod - shift_pods_count) {
+                list.push_back(
+                    { max_rank_survivor_pod - shift_pods_count - 1, *cur_killed, CH_T_SHIFT });
                 shift_pods_count++;
             }
             else {
-                while (cur_killed != NULL) {
-                    shift_list_add(list, cur_killed->rank, cur_killed->rank, CH_T_DEAD);
-                    cur_killed = cur_killed->next;
+                while (cur_killed != killed_ranks.end()) {
+                    list.push_back({ *cur_killed, *cur_killed, CH_T_DEAD });
+                    cur_killed++;
                 }
                 break;
             }
         }
-        cur_killed = cur_killed->next;
+        cur_killed++;
     }
-    while (cur_new != NULL) {
-        shift_list_add(list, cur_new->rank, cur_new->rank, CH_T_NEW);
-        cur_new = cur_new->next;
+    while (cur_new != new_ranks.end()) {
+        list.push_back({ *cur_new, *cur_new, CH_T_NEW });
+        cur_new++;
     }
 }
 
-void helper::up_pods_count(void) {
-    count_pods = get_count_names(KVS_POD_NUM);
+kvs_status_t helper::up_pods_count(void) {
+    KVS_CHECK_STATUS(get_count_names(KVS_POD_NUM, count_pods), "failed to get count names");
+    return KVS_STATUS_SUCCESS;
 }
 
-void helper::wait_accept(void) {
+kvs_status_t helper::wait_accept(void) {
     char my_rank_str[MAX_KVS_VAL_LENGTH];
 
     my_rank = 0;
 
     while (1) {
-        if (get_value_by_name_key(KVS_ACCEPT, my_hostname, my_rank_str) != 0) {
-            my_rank = safe_strtol(my_rank_str, NULL, 10);
-            break;
-        }
+        KVS_CHECK_STATUS(get_value_by_name_key(KVS_ACCEPT, my_hostname, my_rank_str),
+                         "failed to get value");
+        if (strlen(my_rank_str) == 0)
+            continue;
+        KVS_CHECK_STATUS(safe_strtol(my_rank_str, my_rank), "failed to convert my_rank");
+        break;
     }
+    return KVS_STATUS_SUCCESS;
 }
 
-void helper::clean_dead_pods_info(rank_list_t* dead_up_idx) {
+kvs_status_t helper::clean_dead_pods_info(std::list<int>& dead_up_idx) {
     size_t i;
     size_t count_death;
     char** kvs_keys = NULL;
+    auto it = dead_up_idx.begin();
 
-    while (dead_up_idx != NULL) {
-        count_death = get_keys_values_by_name(KVS_APPROVED_DEAD_POD, &kvs_keys, NULL);
+    while (it != dead_up_idx.end()) {
+        KVS_CHECK_STATUS(
+            get_keys_values_by_name(KVS_APPROVED_DEAD_POD, &kvs_keys, NULL, count_death),
+            "failed to get keys and values");
 
         for (i = 0; i < count_death; i++) {
-            remove_name_key(KVS_APPROVED_DEAD_POD, kvs_keys[i]);
-            dead_up_idx = dead_up_idx->next;
-            if (dead_up_idx == NULL) {
+            KVS_CHECK_STATUS(remove_name_key(KVS_APPROVED_DEAD_POD, kvs_keys[i]),
+                             "failed to remove name and key");
+            it++;
+            if (it == dead_up_idx.end()) {
                 for (; i < count_death; i++) {
                     free(kvs_keys[i]);
                 }
@@ -177,9 +196,10 @@ void helper::clean_dead_pods_info(rank_list_t* dead_up_idx) {
     }
     if (kvs_keys != NULL)
         free(kvs_keys);
+    return KVS_STATUS_SUCCESS;
 }
 
-void helper::accept_new_ranks(shift_list_t* cur_list) {
+kvs_status_t helper::accept_new_ranks(const std::list<shift_rank_t>& list) {
     char new_rank_str[INT_STR_SIZE];
     char old_rank_str[INT_STR_SIZE];
     char** kvs_values = NULL;
@@ -187,16 +207,19 @@ void helper::accept_new_ranks(shift_list_t* cur_list) {
     size_t count_values;
     size_t i = 0;
 
-    while (cur_list != NULL) {
-        if (cur_list->shift.type == CH_T_UPDATE || cur_list->shift.type == CH_T_NEW) {
-            SET_STR(old_rank_str, INT_STR_SIZE, RANK_TEMPLATE, cur_list->shift.old_rank);
-            SET_STR(new_rank_str, INT_STR_SIZE, RANK_TEMPLATE, cur_list->shift.new_rank);
+    for (const auto& cur_list : list) {
+        if (cur_list.type == CH_T_UPDATE || cur_list.type == CH_T_NEW) {
+            SET_STR(old_rank_str, INT_STR_SIZE, RANK_TEMPLATE, cur_list.old_rank);
+            SET_STR(new_rank_str, INT_STR_SIZE, RANK_TEMPLATE, cur_list.new_rank);
 
-            count_values = get_keys_values_by_name(KVS_APPROVED_NEW_POD, &kvs_keys, &kvs_values);
+            KVS_CHECK_STATUS(
+                get_keys_values_by_name(KVS_APPROVED_NEW_POD, &kvs_keys, &kvs_values, count_values),
+                "failed to get keys and values");
 
             for (i = 0; i < count_values; i++) {
                 if (!strcmp(kvs_values[i], old_rank_str)) {
-                    set_value(KVS_ACCEPT, kvs_keys[i], new_rank_str);
+                    KVS_CHECK_STATUS(set_value(KVS_ACCEPT, kvs_keys[i], new_rank_str),
+                                     "failed to set value");
                     break;
                 }
             }
@@ -205,22 +228,24 @@ void helper::accept_new_ranks(shift_list_t* cur_list) {
                 free(kvs_values[i]);
             }
         }
-        cur_list = cur_list->next;
     }
 
-    while ((count_values = get_keys_values_by_name(KVS_ACCEPT, NULL, &kvs_values)) != 0) {
+    do {
+        KVS_CHECK_STATUS(get_keys_values_by_name(KVS_ACCEPT, NULL, &kvs_values, count_values),
+                         "failed to get keys and values");
         for (i = 0; i < count_values; i++) {
             free(kvs_values[i]);
         }
-    }
+    } while (count_values != 0);
 
     if (kvs_keys != NULL)
         free(kvs_keys);
     if (kvs_values != NULL)
         free(kvs_values);
+    return KVS_STATUS_SUCCESS;
 }
 
-void helper::update_kvs_info(int new_rank) {
+kvs_status_t helper::update_kvs_info(int new_rank) {
     char kvs_name[MAX_KVS_NAME_LENGTH];
     char kvs_key[MAX_KVS_KEY_LENGTH];
     char kvs_val[MAX_KVS_VAL_LENGTH];
@@ -230,61 +255,66 @@ void helper::update_kvs_info(int new_rank) {
     for (k = 0; k < kvs_list_size; k++) {
         cut_head(kvs_name, kvs_key, kvs_val, ST_CLIENT);
 
-        remove_name_key(kvs_name, kvs_key);
+        KVS_CHECK_STATUS(remove_name_key(kvs_name, kvs_key), "failed to remove name and key");
 
-        replace_str(kvs_key, my_rank, new_rank);
+        KVS_CHECK_STATUS(replace_str(kvs_key, my_rank, new_rank), "failed to replace str");
 
-        set_value(kvs_name, kvs_key, kvs_val);
+        KVS_CHECK_STATUS(set_value(kvs_name, kvs_key, kvs_val), "failed to set value");
 
         put_key(kvs_name, kvs_key, kvs_val, ST_CLIENT);
     }
+    return KVS_STATUS_SUCCESS;
 }
 
-void helper::move_to_new_rank(int new_rank) {
+kvs_status_t helper::move_to_new_rank(int new_rank) {
     char rank_str[INT_STR_SIZE];
 
-    update_kvs_info(new_rank);
+    KVS_CHECK_STATUS(update_kvs_info(new_rank), "failed to update kvs info");
     my_rank = new_rank;
 
-    SET_STR(rank_str, INT_STR_SIZE, RANK_TEMPLATE, my_rank);
+    SET_STR(rank_str, INT_STR_SIZE, RANK_TEMPLATE, new_rank);
 
     //    request_set_val(KVS_POD_REQUEST, my_hostname, rank_str);
 
-    set_value(KVS_POD_NUM, rank_str, my_hostname);
+    KVS_CHECK_STATUS(set_value(KVS_POD_NUM, rank_str, my_hostname), "failed to set rank");
+    return KVS_STATUS_SUCCESS;
 }
 
-void helper::update_my_info(shift_list_t* list) {
+kvs_status_t helper::update_my_info(const std::list<shift_rank_t>& list) {
     char rank_str[INT_STR_SIZE];
 
-    while (list != NULL) {
-        if (list->shift.old_rank == my_rank) {
+    for (const auto& it : list) {
+        if (it.old_rank == static_cast<int>(my_rank)) {
             int old_rank = my_rank;
-            move_to_new_rank(list->shift.new_rank);
+            KVS_CHECK_STATUS(move_to_new_rank(it.new_rank), "failed to move to new rank");
 
             SET_STR(rank_str, INT_STR_SIZE, RANK_TEMPLATE, old_rank);
 
-            remove_name_key(KVS_POD_NUM, rank_str);
+            KVS_CHECK_STATUS(remove_name_key(KVS_POD_NUM, rank_str),
+                             "failed to remove name and key");
 
             break;
         }
-        list = list->next;
     }
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t helper::get_barrier_idx(void) {
+kvs_status_t helper::get_barrier_idx(size_t& barrier_num_out) {
     char** kvs_values = NULL;
     size_t count_kvs_values = 0;
     size_t tmp_barrier_num;
     size_t min_barrier_num;
     size_t i = 0;
 
-    count_kvs_values = get_keys_values_by_name(KVS_BARRIER, NULL, &kvs_values);
+    KVS_CHECK_STATUS(get_keys_values_by_name(KVS_BARRIER, NULL, &kvs_values, count_kvs_values),
+                     "failed to get keys and values");
     if (count_kvs_values == 0)
-        return 0;
+        return KVS_STATUS_SUCCESS;
 
-    min_barrier_num = safe_strtol(kvs_values[0], NULL, 10);
+    KVS_CHECK_STATUS(safe_strtol(kvs_values[0], min_barrier_num), "failed to convert barrier num");
     for (i = 1; i < count_kvs_values; i++) {
-        tmp_barrier_num = safe_strtol(kvs_values[i], NULL, 10);
+        KVS_CHECK_STATUS(safe_strtol(kvs_values[i], tmp_barrier_num),
+                         "failed to convert tmp barrier num");
         if (min_barrier_num > tmp_barrier_num)
             min_barrier_num = tmp_barrier_num;
     }
@@ -292,10 +322,12 @@ size_t helper::get_barrier_idx(void) {
         free(kvs_values[i]);
     }
     free(kvs_values);
-    return min_barrier_num;
+
+    barrier_num_out = min_barrier_num;
+    return KVS_STATUS_SUCCESS;
 }
 
-void helper::post_my_info(void) {
+kvs_status_t helper::post_my_info(void) {
     char barrier_num_str[INT_STR_SIZE];
     char my_rank_str[INT_STR_SIZE];
 
@@ -303,106 +335,120 @@ void helper::post_my_info(void) {
 
     SET_STR(my_rank_str, INT_STR_SIZE, RANK_TEMPLATE, my_rank);
 
-    set_value(KVS_POD_NUM, my_rank_str, my_hostname);
+    KVS_CHECK_STATUS(set_value(KVS_POD_NUM, my_rank_str, my_hostname), "failed to set rank");
 
-    barrier_num = get_barrier_idx();
+    KVS_CHECK_STATUS(get_barrier_idx(barrier_num), "failed to get barrier idx");
 
     SET_STR(barrier_num_str, INT_STR_SIZE, SIZE_T_TEMPLATE, barrier_num);
 
-    set_value(KVS_BARRIER, my_hostname, barrier_num_str);
+    KVS_CHECK_STATUS(set_value(KVS_BARRIER, my_hostname, barrier_num_str),
+                     "failed to set barrier idx");
 
-    remove_name_key(KVS_ACCEPT, my_hostname);
+    KVS_CHECK_STATUS(remove_name_key(KVS_ACCEPT, my_hostname),
+                     "failed to remove accepted hostname");
 
-    remove_name_key(KVS_APPROVED_NEW_POD, my_hostname);
+    KVS_CHECK_STATUS(remove_name_key(KVS_APPROVED_NEW_POD, my_hostname),
+                     "failed to remove approved hostname");
 
     barrier_num++;
     if (barrier_num > BARRIER_NUM_MAX)
         barrier_num = 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t helper::update(shift_list_t** list, rank_list_t** dead_up_idx, int root_rank) {
+kvs_status_t helper::update(const std::list<shift_rank_t>& list,
+                            std::list<int>& dead_up_idx,
+                            int root_rank) {
     if (applied == 1) {
-        if ((*list) != NULL) {
-            if (my_rank == root_rank) {
-                if ((*dead_up_idx) != NULL)
-                    clean_dead_pods_info(*dead_up_idx);
-
-                accept_new_ranks(*list);
+        if (!list.empty()) {
+            if (static_cast<int>(my_rank) == root_rank) {
+                if (!dead_up_idx.empty()) {
+                    KVS_CHECK_STATUS(clean_dead_pods_info(dead_up_idx), "failed to clean dead pod");
+                }
+                KVS_CHECK_STATUS(accept_new_ranks(list), "failed to accept new ranks");
             }
-            update_my_info(*list);
+            KVS_CHECK_STATUS(update_my_info(list), "failed to update info");
         }
     }
-    else
-        post_my_info();
-
-    return 0;
+    else {
+        KVS_CHECK_STATUS(post_my_info(), "failed to post info");
+    }
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t helper::get_val_count(const char* name, const char* val) {
-    size_t res = 0;
+kvs_status_t helper::get_val_count(const char* name, const char* val, size_t& res) {
+    res = 0;
     char** kvs_values = NULL;
     size_t count_values;
     size_t i;
 
-    count_values = get_keys_values_by_name(name, NULL, &kvs_values);
-
-    if (count_values == 0)
-        return res;
+    KVS_CHECK_STATUS(get_keys_values_by_name(name, NULL, &kvs_values, count_values),
+                     "failed to get keys and values");
 
-    for (i = 0; i < count_values; i++) {
-        if (!strcmp(val, kvs_values[i])) {
-            res++;
+    if (count_values != 0) {
+        for (i = 0; i < count_values; i++) {
+            if (!strcmp(val, kvs_values[i])) {
+                res++;
+            }
+            free(kvs_values[i]);
         }
-        free(kvs_values[i]);
+        free(kvs_values);
     }
-    free(kvs_values);
 
-    return res;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t helper::get_occupied_ranks_count(char* rank) {
+kvs_status_t helper::get_occupied_ranks_count(char* rank, size_t& res) {
     char occupied_rank_val_str[MAX_KVS_VAL_LENGTH];
     size_t is_occupied_rank;
     size_t count_new_pod = 0;
     size_t count_seen_new_pod = 0;
 
-    is_occupied_rank =
-        (get_value_by_name_key(KVS_POD_NUM, rank, occupied_rank_val_str) == 0) ? 0 : 1;
+    KVS_CHECK_STATUS(get_value_by_name_key(KVS_POD_NUM, rank, occupied_rank_val_str),
+                     "failed to get occupied rank");
+
+    is_occupied_rank = (strlen(occupied_rank_val_str) == 0) ? 0 : 1;
 
-    count_new_pod = get_val_count(KVS_NEW_POD, rank);
+    KVS_CHECK_STATUS(get_val_count(KVS_NEW_POD, rank, count_new_pod), "failed to get new rank");
 
-    count_seen_new_pod = get_val_count(KVS_APPROVED_NEW_POD, rank);
+    KVS_CHECK_STATUS(get_val_count(KVS_APPROVED_NEW_POD, rank, count_seen_new_pod),
+                     "failed to get new approved rank");
 
-    return is_occupied_rank + count_new_pod + count_seen_new_pod;
+    res = is_occupied_rank + count_new_pod + count_seen_new_pod;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t helper::get_count_requested_ranks(char* rank) {
-    size_t count_pods_with_my_rank = 0;
+kvs_status_t helper::get_count_requested_ranks(char* rank, size_t& count_pods_with_my_rank) {
+    count_pods_with_my_rank = 0;
 
-    count_pods_with_my_rank = get_val_count(KVS_POD_REQUEST, rank);
+    KVS_CHECK_STATUS(get_val_count(KVS_POD_REQUEST, rank, count_pods_with_my_rank),
+                     "failed to get requested ranks");
 
-    return count_pods_with_my_rank;
+    return KVS_STATUS_SUCCESS;
 }
 
-void helper::occupied_rank(char* rank) {
+kvs_status_t helper::occupied_rank(char* rank) {
     char idx_val[MAX_KVS_VAL_LENGTH];
-    size_t is_inited;
 
-    is_inited = get_value_by_name_key(KVS_UP, KVS_IDX, idx_val);
+    KVS_CHECK_STATUS(get_value_by_name_key(KVS_UP, KVS_IDX, idx_val), "failed to get ID");
 
-    if ((is_inited == 0) && (my_rank == 0)) {
-        set_value(KVS_UP, KVS_IDX, INITIAL_UPDATE_IDX);
+    if ((strlen(idx_val) == 0) && (my_rank == 0)) {
+        KVS_CHECK_STATUS(set_value(KVS_UP, KVS_IDX, INITIAL_UPDATE_IDX),
+                         "failed to set initial ID");
 
         count_pods = 1;
 
-        update(NULL, NULL, 0);
+        std::list<int> clear_list{};
+        std::list<shift_rank_t> clear_shift_list{};
+        KVS_CHECK_STATUS(update(clear_shift_list, clear_list, 0), "failed to initial update");
     }
     else {
-        set_value(KVS_NEW_POD, my_hostname, rank);
+        KVS_CHECK_STATUS(set_value(KVS_NEW_POD, my_hostname, rank), "failed to set rank");
     }
+    return KVS_STATUS_SUCCESS;
 }
 
-void helper::reg_rank(void) {
+kvs_status_t helper::reg_rank(void) {
     char rank_str[INT_STR_SIZE];
     size_t wait_shift = 0;
     char** kvs_values = NULL;
@@ -412,7 +458,8 @@ void helper::reg_rank(void) {
     size_t i;
 
     my_rank = 0;
-    set_value(KVS_POD_REQUEST, my_hostname, INITIAL_RANK_NUM);
+    KVS_CHECK_STATUS(set_value(KVS_POD_REQUEST, my_hostname, INITIAL_RANK_NUM),
+                     "failed to set initial rank");
 
     SET_STR(rank_str, INT_STR_SIZE, RANK_TEMPLATE, my_rank);
 
@@ -420,7 +467,9 @@ void helper::reg_rank(void) {
         wait_shift = 0;
         my_num_in_pod_request_line = 0;
 
-        count_values = get_keys_values_by_name(KVS_POD_REQUEST, &kvs_keys, &kvs_values);
+        KVS_CHECK_STATUS(
+            get_keys_values_by_name(KVS_POD_REQUEST, &kvs_keys, &kvs_values, count_values),
+            "failed to get requested pods");
 
         for (i = 0; i < count_values; i++) {
             if (!strcmp(kvs_values[i], rank_str)) {
@@ -435,13 +484,18 @@ void helper::reg_rank(void) {
         }
 
         if (my_num_in_pod_request_line == 1) {
-            if (get_occupied_ranks_count(rank_str) != 0) {
+            size_t rank_count;
+            KVS_CHECK_STATUS(get_occupied_ranks_count(rank_str, rank_count),
+                             "failed to get occupied ranks count");
+            if (rank_count != 0) {
                 wait_shift = 0;
             }
             else {
                 wait_shift = 1;
-                if (get_count_requested_ranks(rank_str) == 1) {
-                    occupied_rank(rank_str);
+                KVS_CHECK_STATUS(get_count_requested_ranks(rank_str, rank_count),
+                                 "failed to get requested ranks count");
+                if (rank_count == 1) {
+                    KVS_CHECK_STATUS(occupied_rank(rank_str), "failed to get occupied rank");
                     break;
                 }
             }
@@ -450,33 +504,38 @@ void helper::reg_rank(void) {
         if (!wait_shift) {
             my_rank++;
             SET_STR(rank_str, INT_STR_SIZE, RANK_TEMPLATE, my_rank);
-            set_value(KVS_POD_REQUEST, my_hostname, rank_str);
+            KVS_CHECK_STATUS(set_value(KVS_POD_REQUEST, my_hostname, rank_str),
+                             "failed to set rank");
         }
     }
 
-    remove_name_key(KVS_POD_REQUEST, my_hostname);
+    KVS_CHECK_STATUS(remove_name_key(KVS_POD_REQUEST, my_hostname), "failed to remove host info");
 
     if (kvs_keys != NULL)
         free(kvs_keys);
     if (kvs_values != NULL)
         free(kvs_values);
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t helper::get_replica_size(void) {
-    return k->kvs_get_replica_size();
+kvs_status_t helper::get_replica_size(size_t& replica_size) {
+    return k->kvs_get_replica_size(replica_size);
 }
 
-void helper::up_kvs(const char* new_kvs_name, const char* old_kvs_name) {
+kvs_status_t helper::up_kvs(const char* new_kvs_name, const char* old_kvs_name) {
     char** kvs_values = NULL;
     char** kvs_keys = NULL;
     size_t i = 0;
     size_t count_values;
 
-    count_values = get_keys_values_by_name(old_kvs_name, &kvs_keys, &kvs_values);
+    KVS_CHECK_STATUS(get_keys_values_by_name(old_kvs_name, &kvs_keys, &kvs_values, count_values),
+                     "failed to get keys and values");
     for (i = 0; i < count_values; i++) {
-        remove_name_key(old_kvs_name, kvs_keys[i]);
+        KVS_CHECK_STATUS(remove_name_key(old_kvs_name, kvs_keys[i]),
+                         "failed to remove old kvs info");
 
-        set_value(new_kvs_name, kvs_keys[i], kvs_values[i]);
+        KVS_CHECK_STATUS(set_value(new_kvs_name, kvs_keys[i], kvs_values[i]),
+                         "failed to set new kvs info");
 
         free(kvs_keys[i]);
         free(kvs_values[i]);
@@ -485,48 +544,61 @@ void helper::up_kvs(const char* new_kvs_name, const char* old_kvs_name) {
         free(kvs_keys);
     if (kvs_values != NULL)
         free(kvs_values);
+    return KVS_STATUS_SUCCESS;
 }
 
-void helper::up_kvs_new_and_dead(void) {
-    up_kvs(KVS_APPROVED_NEW_POD, KVS_NEW_POD);
-    up_kvs(KVS_APPROVED_DEAD_POD, KVS_DEAD_POD);
+kvs_status_t helper::up_kvs_new_and_dead(void) {
+    KVS_CHECK_STATUS(up_kvs(KVS_APPROVED_NEW_POD, KVS_NEW_POD), "failed to update new");
+    KVS_CHECK_STATUS(up_kvs(KVS_APPROVED_DEAD_POD, KVS_DEAD_POD), "failed to update dead");
+    return KVS_STATUS_SUCCESS;
 }
 
-void helper::get_new_root(int* old_root) {
+kvs_status_t helper::get_new_root(int* old_root) {
     size_t i;
     char** rank_nums = NULL;
-    size_t rank_count = get_keys_values_by_name(KVS_DEAD_POD, NULL, &rank_nums);
+    size_t rank_count;
+    int rank_num;
+    KVS_CHECK_STATUS(get_keys_values_by_name(KVS_DEAD_POD, NULL, &rank_nums, rank_count),
+                     "failed to get dead pods");
 
     for (i = 0; i < rank_count; i++) {
-        if (*old_root == (int)safe_strtol(rank_nums[i], NULL, 10))
+        KVS_CHECK_STATUS(safe_strtol(rank_nums[i], rank_num), "failed to convert dead rank");
+        if (*old_root == rank_num) {
             (*old_root)++;
+        }
         free(rank_nums[i]);
     }
     if (rank_nums != NULL)
         free(rank_nums);
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t helper::get_keys_values_by_name(const char* kvs_name, char*** kvs_keys, char*** kvs_values) {
-    return k->kvs_get_keys_values_by_name(kvs_name, kvs_keys, kvs_values);
+kvs_status_t helper::get_keys_values_by_name(const char* kvs_name,
+                                             char*** kvs_keys,
+                                             char*** kvs_values,
+                                             size_t& count) {
+    return k->kvs_get_keys_values_by_name(kvs_name, kvs_keys, kvs_values, count);
 }
-size_t helper::set_value(const char* kvs_name, const char* kvs_key, const char* kvs_val) {
+kvs_status_t helper::set_value(const char* kvs_name, const char* kvs_key, const char* kvs_val) {
     return k->kvs_set_value(kvs_name, kvs_key, kvs_val);
 }
-size_t helper::remove_name_key(const char* kvs_name, const char* kvs_key) {
+kvs_status_t helper::remove_name_key(const char* kvs_name, const char* kvs_key) {
     return k->kvs_remove_name_key(kvs_name, kvs_key);
 }
-size_t helper::get_value_by_name_key(const char* kvs_name, const char* kvs_key, char* kvs_val) {
+kvs_status_t helper::get_value_by_name_key(const char* kvs_name,
+                                           const char* kvs_key,
+                                           char* kvs_val) {
     return k->kvs_get_value_by_name_key(kvs_name, kvs_key, kvs_val);
 }
 size_t helper::init(const char* main_addr) {
     return k->kvs_init(main_addr);
 }
-size_t helper::main_server_address_reserve(char* main_addr) {
+kvs_status_t helper::main_server_address_reserve(char* main_addr) {
     return k->kvs_main_server_address_reserve(main_addr);
 }
-size_t helper::get_count_names(const char* kvs_name) {
-    return k->kvs_get_count_names(kvs_name);
+kvs_status_t helper::get_count_names(const char* kvs_name, int& count_names) {
+    return k->kvs_get_count_names(kvs_name, count_names);
 }
-size_t helper::finalize(void) {
+kvs_status_t helper::finalize(void) {
     return k->kvs_finalize();
 }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.hpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.hpp
index 8d840a17d..4758cdb66 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.hpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.hpp
@@ -27,7 +27,6 @@
 #include <utility>
 
 #include "def.h"
-#include "rank_list.hpp"
 #include "shift_list.hpp"
 #include "kvs_keeper.hpp"
 #include "kvs/ikvs_wrapper.h"
@@ -37,10 +36,10 @@ extern size_t barrier_num;
 extern size_t up_idx;
 extern size_t applied;
 
-extern rank_list_t* killed_ranks;
+extern std::list<int> killed_ranks;
 extern int killed_ranks_count;
 
-extern rank_list_t* new_ranks;
+extern std::list<int> new_ranks;
 extern int new_ranks_count;
 
 class helper {
@@ -49,73 +48,78 @@ class helper {
     explicit helper(std::shared_ptr<ikvs_wrapper> k) : k(std::move(k)){};
     ~helper() = default;
 
-    void get_update_ranks(void);
+    kvs_status_t get_update_ranks(void);
 
-    size_t get_replica_size(void);
+    kvs_status_t get_replica_size(size_t& replica_size);
 
-    void wait_accept(void);
+    kvs_status_t wait_accept(void);
 
-    size_t update(shift_list_t** list, rank_list_t** dead_up_idx, int root_rank);
+    kvs_status_t update(const std::list<shift_rank_t>& list,
+                        std::list<int>& dead_up_idx,
+                        int root_rank);
 
-    void up_pods_count(void);
+    kvs_status_t up_pods_count(void);
 
-    void get_shift(shift_list_t** list);
+    void get_shift(std::list<shift_rank_t>& list);
 
-    void reg_rank(void);
+    kvs_status_t reg_rank(void);
 
-    size_t get_barrier_idx(void);
+    kvs_status_t get_barrier_idx(size_t& barrier_num_out);
 
-    void up_kvs_new_and_dead(void);
+    kvs_status_t up_kvs_new_and_dead(void);
 
     void keep_first_n_up(int prev_new_ranks_count, int prev_killed_ranks_count);
 
-    void get_new_root(int* old_root);
+    kvs_status_t get_new_root(int* old_root);
 
     /*Work with KVS, new*/
-    size_t set_value(const char* kvs_name, const char* kvs_key, const char* kvs_val);
+    kvs_status_t set_value(const char* kvs_name, const char* kvs_key, const char* kvs_val);
 
-    size_t remove_name_key(const char* kvs_name, const char* kvs_key);
+    kvs_status_t remove_name_key(const char* kvs_name, const char* kvs_key);
 
-    size_t get_value_by_name_key(const char* kvs_name, const char* kvs_key, char* kvs_val);
+    kvs_status_t get_value_by_name_key(const char* kvs_name, const char* kvs_key, char* kvs_val);
 
     size_t init(const char* main_addr);
 
-    size_t main_server_address_reserve(char* main_addr);
+    kvs_status_t main_server_address_reserve(char* main_addr);
 
-    size_t get_count_names(const char* kvs_name);
+    kvs_status_t get_count_names(const char* kvs_name, int& count_names);
 
-    size_t finalize(void);
+    kvs_status_t finalize(void);
 
-    size_t get_keys_values_by_name(const char* kvs_name, char*** kvs_keys, char*** kvs_values);
+    kvs_status_t get_keys_values_by_name(const char* kvs_name,
+                                         char*** kvs_keys,
+                                         char*** kvs_values,
+                                         size_t& count);
 
     /*Work with KVS, new*/
 
 private:
-    size_t replace_str(char* str, int old_rank, int new_rank);
+    kvs_status_t replace_str(char* str, int old_rank, int new_rank);
 
-    void update_ranks(int* old_count, rank_list_t** origin_list, const char* kvs_name);
+    kvs_status_t update_ranks(int* old_count, std::list<int>& origin_list, const char* kvs_name);
 
-    void clean_dead_pods_info(rank_list_t* dead_up_idx);
+    kvs_status_t clean_dead_pods_info(std::list<int>& dead_up_idx);
 
-    void accept_new_ranks(shift_list_t* cur_list);
+    kvs_status_t accept_new_ranks(const std::list<shift_rank_t>& cur_list);
 
-    void update_kvs_info(int new_rank);
+    kvs_status_t update_kvs_info(int new_rank);
 
-    void move_to_new_rank(int new_rank);
+    kvs_status_t move_to_new_rank(int new_rank);
 
-    void update_my_info(shift_list_t* list);
+    kvs_status_t update_my_info(const std::list<shift_rank_t>& list);
 
-    void post_my_info(void);
+    kvs_status_t post_my_info(void);
 
-    size_t get_val_count(const char* name, const char* val);
+    kvs_status_t get_val_count(const char* name, const char* val, size_t& res);
 
-    size_t get_occupied_ranks_count(char* rank);
+    kvs_status_t get_occupied_ranks_count(char* rank, size_t& res);
 
-    size_t get_count_requested_ranks(char* rank);
+    kvs_status_t get_count_requested_ranks(char* rank, size_t& count_pods_with_my_rank);
 
-    void occupied_rank(char* rank);
+    kvs_status_t occupied_rank(char* rank);
 
-    void up_kvs(const char* new_kvs_name, const char* old_kvs_name);
+    kvs_status_t up_kvs(const char* new_kvs_name, const char* old_kvs_name);
     std::shared_ptr<ikvs_wrapper> k;
 };
 #endif
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/ikvs_wrapper.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/ikvs_wrapper.h
index 6f68a78a6..95b3807d7 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/ikvs_wrapper.h
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/ikvs_wrapper.h
@@ -16,32 +16,34 @@
 #pragma once
 
 #include <unistd.h>
+#include "util/pm/pmi_resizable_rt/pmi_resizable/def.h"
 
 class ikvs_wrapper {
 public:
-    virtual ~ikvs_wrapper() = default;
+    virtual ~ikvs_wrapper() noexcept(false){};
 
-    virtual size_t kvs_set_value(const char* kvs_name,
-                                 const char* kvs_key,
-                                 const char* kvs_val) = 0;
+    virtual kvs_status_t kvs_set_value(const char* kvs_name,
+                                       const char* kvs_key,
+                                       const char* kvs_val) = 0;
 
-    virtual size_t kvs_remove_name_key(const char* kvs_name, const char* kvs_key) = 0;
+    virtual kvs_status_t kvs_remove_name_key(const char* kvs_name, const char* kvs_key) = 0;
 
-    virtual size_t kvs_get_value_by_name_key(const char* kvs_name,
-                                             const char* kvs_key,
-                                             char* kvs_val) = 0;
+    virtual kvs_status_t kvs_get_value_by_name_key(const char* kvs_name,
+                                                   const char* kvs_key,
+                                                   char* kvs_val) = 0;
 
-    virtual size_t kvs_init(const char* main_addr) = 0;
+    virtual kvs_status_t kvs_init(const char* main_addr) = 0;
 
-    virtual size_t kvs_main_server_address_reserve(char* main_addr) = 0;
+    virtual kvs_status_t kvs_main_server_address_reserve(char* main_addr) = 0;
 
-    virtual size_t kvs_get_count_names(const char* kvs_name) = 0;
+    virtual kvs_status_t kvs_get_count_names(const char* kvs_name, int& count_names) = 0;
 
-    virtual size_t kvs_finalize(void) = 0;
+    virtual kvs_status_t kvs_finalize() = 0;
 
-    virtual size_t kvs_get_keys_values_by_name(const char* kvs_name,
-                                               char*** kvs_keys,
-                                               char*** kvs_values) = 0;
+    virtual kvs_status_t kvs_get_keys_values_by_name(const char* kvs_name,
+                                                     char*** kvs_keys,
+                                                     char*** kvs_values,
+                                                     size_t& count) = 0;
 
-    virtual size_t kvs_get_replica_size(void) = 0;
+    virtual kvs_status_t kvs_get_replica_size(size_t& replica_size) = 0;
 };
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.cpp
index 3a69b0434..2876bebd7 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.cpp
@@ -28,15 +28,15 @@
 #include <time.h>
 #include <unistd.h>
 
-#include "util/pm/pmi_resizable_rt/pmi_resizable/def.h"
 #include "internal_kvs.h"
 #include "internal_kvs_server.hpp"
 #include "common/log/log.hpp"
 #include "util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.hpp"
 
-size_t internal_kvs::kvs_set_value(const char* kvs_name, const char* kvs_key, const char* kvs_val) {
+kvs_status_t internal_kvs::kvs_set_value(const char* kvs_name,
+                                         const char* kvs_key,
+                                         const char* kvs_val) {
     kvs_request_t request;
-    memset(&request, 0, sizeof(kvs_request_t));
     request.mode = AM_PUT;
     kvs_str_copy(request.name, kvs_name, MAX_KVS_NAME_LENGTH);
     kvs_str_copy(request.key, kvs_key, MAX_KVS_KEY_LENGTH);
@@ -49,12 +49,13 @@ size_t internal_kvs::kvs_set_value(const char* kvs_name, const char* kvs_key, co
              client_memory_mutex,
              "client: put_key_value");
 
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t internal_kvs::kvs_set_size(const char* kvs_name, const char* kvs_key, const char* kvs_val) {
+kvs_status_t internal_kvs::kvs_set_size(const char* kvs_name,
+                                        const char* kvs_key,
+                                        const char* kvs_val) {
     kvs_request_t request;
-    memset(&request, 0, sizeof(kvs_request_t));
     request.mode = AM_SET_SIZE;
     kvs_str_copy(request.name, kvs_name, MAX_KVS_NAME_LENGTH);
     kvs_str_copy(request.key, kvs_key, MAX_KVS_KEY_LENGTH);
@@ -67,14 +68,13 @@ size_t internal_kvs::kvs_set_size(const char* kvs_name, const char* kvs_key, con
              client_memory_mutex,
              "client: set_size");
 
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t internal_kvs::kvs_barrier_register(const char* kvs_name,
-                                          const char* kvs_key,
-                                          const char* kvs_val) {
+kvs_status_t internal_kvs::kvs_barrier_register(const char* kvs_name,
+                                                const char* kvs_key,
+                                                const char* kvs_val) {
     kvs_request_t request;
-    memset(&request, 0, sizeof(kvs_request_t));
     request.mode = AM_BARRIER_REGISTER;
     kvs_str_copy(request.name, kvs_name, MAX_KVS_NAME_LENGTH);
     kvs_str_copy(request.key, kvs_key, MAX_KVS_KEY_LENGTH);
@@ -87,13 +87,15 @@ size_t internal_kvs::kvs_barrier_register(const char* kvs_name,
              client_memory_mutex,
              "client: barrier_register");
 
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-void internal_kvs::kvs_barrier(const char* kvs_name, const char* kvs_key, const char* kvs_val) {
+kvs_status_t internal_kvs::kvs_barrier(const char* kvs_name,
+                                       const char* kvs_key,
+                                       const char* kvs_val) {
     kvs_request_t request;
     int is_done;
-    memset(&request, 0, sizeof(kvs_request_t));
+
     request.mode = AM_BARRIER;
     kvs_str_copy(request.name, kvs_name, MAX_KVS_NAME_LENGTH);
     kvs_str_copy(request.key, kvs_key, MAX_KVS_KEY_LENGTH);
@@ -108,11 +110,11 @@ void internal_kvs::kvs_barrier(const char* kvs_name, const char* kvs_key, const
              sizeof(is_done),
              client_memory_mutex,
              "client: barrier read data");
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t internal_kvs::kvs_remove_name_key(const char* kvs_name, const char* kvs_key) {
+kvs_status_t internal_kvs::kvs_remove_name_key(const char* kvs_name, const char* kvs_key) {
     kvs_request_t request;
-    memset(&request, 0, sizeof(kvs_request_t));
     request.mode = AM_REMOVE;
     kvs_str_copy(request.name, kvs_name, MAX_KVS_NAME_LENGTH);
     kvs_str_copy(request.key, kvs_key, MAX_KVS_KEY_LENGTH);
@@ -124,12 +126,11 @@ size_t internal_kvs::kvs_remove_name_key(const char* kvs_name, const char* kvs_k
              client_memory_mutex,
              "client: remove_key");
 
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t internal_kvs::kvs_register(const char* kvs_name, const char* kvs_key, char* kvs_val) {
+kvs_status_t internal_kvs::kvs_register(const char* kvs_name, const char* kvs_key, char* kvs_val) {
     kvs_request_t request;
-    memset(&request, 0, sizeof(kvs_request_t));
     request.mode = AM_INTERNAL_REGISTER;
     kvs_str_copy(request.name, kvs_name, MAX_KVS_NAME_LENGTH);
     kvs_str_copy(request.key, kvs_key, MAX_KVS_KEY_LENGTH);
@@ -147,14 +148,13 @@ size_t internal_kvs::kvs_register(const char* kvs_name, const char* kvs_key, cha
              "client: register read data");
     kvs_str_copy(kvs_val, request.val, MAX_KVS_VAL_LENGTH);
 
-    return strlen(kvs_val);
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t internal_kvs::kvs_get_value_by_name_key(const char* kvs_name,
-                                               const char* kvs_key,
-                                               char* kvs_val) {
+kvs_status_t internal_kvs::kvs_get_value_by_name_key(const char* kvs_name,
+                                                     const char* kvs_key,
+                                                     char* kvs_val) {
     kvs_request_t request;
-    memset(&request, 0, sizeof(kvs_request_t));
     request.mode = AM_GET_VAL;
     size_t is_exist = 0;
     kvs_str_copy(request.name, kvs_name, MAX_KVS_NAME_LENGTH);
@@ -181,42 +181,37 @@ size_t internal_kvs::kvs_get_value_by_name_key(const char* kvs_name,
         kvs_str_copy(kvs_val, request.val, MAX_KVS_VAL_LENGTH);
     }
 
-    return strlen(kvs_val);
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t internal_kvs::kvs_get_count_names(const char* kvs_name) {
-    size_t count_names = 0;
+kvs_status_t internal_kvs::kvs_get_count_names(const char* kvs_name, int& count_names) {
+    count_names = 0;
     kvs_request_t request;
-    memset(&request, 0, sizeof(kvs_request_t));
     request.mode = AM_GET_COUNT;
     kvs_str_copy(request.name, kvs_name, MAX_KVS_NAME_LENGTH);
 
-    DO_RW_OP(write,
-             client_op_sock,
-             &request,
-             sizeof(kvs_request_t),
-             client_memory_mutex,
-             "client: get_count");
+    DO_RW_OP(
+        write, client_op_sock, &request, sizeof(request), client_memory_mutex, "client: get_count");
 
     DO_RW_OP(read,
              client_op_sock,
              &count_names,
-             sizeof(size_t),
+             sizeof(count_names),
              client_memory_mutex,
              "client: get_count read data");
 
-    return count_names;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t internal_kvs::kvs_get_keys_values_by_name(const char* kvs_name,
-                                                 char*** kvs_keys,
-                                                 char*** kvs_values) {
-    size_t count = 0;
+kvs_status_t internal_kvs::kvs_get_keys_values_by_name(const char* kvs_name,
+                                                       char*** kvs_keys,
+                                                       char*** kvs_values,
+                                                       size_t& count) {
+    count = 0;
     size_t i;
     kvs_request_t request;
-    kvs_request_t* answers;
+    std::vector<kvs_request_t> answers;
 
-    memset(&request, 0, sizeof(kvs_request_t));
     request.mode = AM_GET_KEYS_VALUES;
     kvs_str_copy(request.name, kvs_name, MAX_KVS_NAME_LENGTH);
 
@@ -235,12 +230,12 @@ size_t internal_kvs::kvs_get_keys_values_by_name(const char* kvs_name,
              "client: get_keys_values read size");
 
     if (count == 0)
-        return count;
+        return KVS_STATUS_SUCCESS;
 
-    answers = (kvs_request_t*)calloc(count, sizeof(kvs_request_t));
+    answers.resize(count);
     DO_RW_OP(read,
              client_op_sock,
-             answers,
+             answers.data(),
              sizeof(kvs_request_t) * count,
              client_memory_mutex,
              "client: get_keys_values read data");
@@ -251,10 +246,14 @@ size_t internal_kvs::kvs_get_keys_values_by_name(const char* kvs_name,
         *kvs_keys = (char**)calloc(count, sizeof(char*));
         if ((*kvs_keys) == nullptr) {
             LOG_ERROR("Memory allocation failed");
-            exit(1);
+            return KVS_STATUS_FAILURE;
         }
         for (i = 0; i < count; i++) {
             (*kvs_keys)[i] = (char*)calloc(MAX_KVS_KEY_LENGTH, sizeof(char));
+            if ((*kvs_keys)[i] == nullptr) {
+                LOG_ERROR("Memory allocation failed");
+                return KVS_STATUS_FAILURE;
+            }
             kvs_str_copy((*kvs_keys)[i], answers[i].key, MAX_KVS_KEY_LENGTH);
         }
     }
@@ -265,27 +264,28 @@ size_t internal_kvs::kvs_get_keys_values_by_name(const char* kvs_name,
         *kvs_values = (char**)calloc(count, sizeof(char*));
         if ((*kvs_values) == nullptr) {
             LOG_ERROR("Memory allocation failed");
-            exit(1);
+            return KVS_STATUS_FAILURE;
         }
         for (i = 0; i < count; i++) {
             (*kvs_values)[i] = (char*)calloc(MAX_KVS_VAL_LENGTH, sizeof(char));
+            if ((*kvs_values)[i] == nullptr) {
+                LOG_ERROR("Memory allocation failed");
+                return KVS_STATUS_FAILURE;
+            }
             kvs_str_copy((*kvs_values)[i], answers[i].val, MAX_KVS_VAL_LENGTH);
         }
     }
 
-    free(answers);
-
-    return count;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t internal_kvs::kvs_get_replica_size(void) {
-    size_t replica_size = 0;
+kvs_status_t internal_kvs::kvs_get_replica_size(size_t& replica_size) {
+    replica_size = 0;
     if (ip_getting_mode == IGT_K8S) {
-        replica_size = request_k8s_get_replica_size();
+        return request_k8s_get_replica_size(replica_size);
     }
     else {
         kvs_request_t request;
-        memset(&request, 0, sizeof(kvs_request_t));
         request.mode = AM_GET_REPLICA;
 
         DO_RW_OP(write,
@@ -302,24 +302,25 @@ size_t internal_kvs::kvs_get_replica_size(void) {
                  client_memory_mutex,
                  "client: get_replica read size");
     }
-    return replica_size;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t internal_kvs::init_main_server_by_k8s() {
+kvs_status_t internal_kvs::init_main_server_by_k8s() {
     char port_str[MAX_KVS_VAL_LENGTH];
-    request_k8s_kvs_init();
+    KVS_CHECK_STATUS(request_k8s_kvs_init(), "failed to init k8s kvs");
 
     SET_STR(port_str, INT_STR_SIZE, "%d", local_server_address->get_sin_port());
 
-    request_k8s_kvs_get_master(local_host_ip, main_host_ip, port_str);
+    KVS_CHECK_STATUS(request_k8s_kvs_get_master(local_host_ip, main_host_ip, port_str),
+                     "failed to get port");
 
-    main_port = safe_strtol(port_str, nullptr, 10);
+    KVS_CHECK_STATUS(safe_strtol(port_str, main_port), "failed to convert main_port");
     main_server_address->set_sin_port(main_port);
-    main_server_address->set_sin_addr(main_host_ip);
-    return 0;
+    KVS_CHECK_STATUS(main_server_address->set_sin_addr(main_host_ip), "failed to set main_ip");
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t internal_kvs::init_main_server_by_env() {
+kvs_status_t internal_kvs::init_main_server_by_env() {
     char* port = nullptr;
 
     const char* tmp_host_ip = (!server_address.empty()) ? server_address.c_str()
@@ -327,7 +328,7 @@ size_t internal_kvs::init_main_server_by_env() {
 
     if (tmp_host_ip == nullptr) {
         LOG_ERROR("specify ", CCL_KVS_IP_PORT_ENV);
-        return 1;
+        return KVS_STATUS_FAILURE;
     }
 
     memset(main_host_ip, 0, CCL_IP_LEN);
@@ -335,25 +336,25 @@ size_t internal_kvs::init_main_server_by_env() {
     if ((port = strstr(main_host_ip, "_")) == nullptr) {
         if ((port = strstr(main_host_ip, ":")) == nullptr) {
             LOG_ERROR("set ", CCL_KVS_IP_PORT_ENV, " in format <ip>_<port>\n");
-            return 1;
+            return KVS_STATUS_FAILURE;
         }
     }
     port[0] = '\0';
     port++;
 
-    main_port = safe_strtol(port, nullptr, 10);
+    KVS_CHECK_STATUS(safe_strtol(port, main_port), "failed to convert main_port");
     main_server_address->set_sin_port(main_port);
-    main_server_address->set_sin_addr(main_host_ip);
-    return 0;
+    KVS_CHECK_STATUS(main_server_address->set_sin_addr(main_host_ip), "failed to set main_ip");
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t internal_kvs::init_main_server_by_string(const char* main_addr) {
+kvs_status_t internal_kvs::init_main_server_by_string(const char* main_addr) {
     char* port = nullptr;
-    local_server_address->set_sin_addr(local_host_ip);
+    KVS_CHECK_STATUS(local_server_address->set_sin_addr(local_host_ip), "failed to set local_ip");
 
     if ((server_listen_sock = socket(address_family, SOCK_STREAM, 0)) < 0) {
-        LOG_ERROR("init_main_server_by_string: server_listen_sock init");
-        exit(EXIT_FAILURE);
+        LOG_ERROR("server_listen_sock init");
+        return KVS_STATUS_FAILURE;
     }
 
     size_t sin_port = local_server_address->get_sin_port();
@@ -369,28 +370,28 @@ size_t internal_kvs::init_main_server_by_string(const char* main_addr) {
 
     if ((port = strstr(main_host_ip, "_")) == nullptr) {
         if ((port = strstr(main_host_ip, ":")) == nullptr) {
-            LOG_ERROR(
-                "init_main_server_by_string: set ", CCL_KVS_IP_PORT_ENV, " in format <ip>_<port>");
-            return 1;
+            LOG_ERROR("set ", CCL_KVS_IP_PORT_ENV, " in format <ip>_<port>");
+            return KVS_STATUS_FAILURE;
         }
     }
     port[0] = '\0';
     port++;
 
-    main_port = safe_strtol(port, nullptr, 10);
+    KVS_CHECK_STATUS(safe_strtol(port, main_port), "failed to convert main_port");
     main_server_address->set_sin_port(main_port);
-    main_server_address->set_sin_addr(main_host_ip);
-    return 0;
+    KVS_CHECK_STATUS(main_server_address->set_sin_addr(main_host_ip), "failed to set main_ip");
+
+    return KVS_STATUS_SUCCESS;
 }
 
-int internal_kvs::fill_local_host_ip() {
+kvs_status_t internal_kvs::fill_local_host_ip() {
     struct ifaddrs *ifaddr, *ifa;
     int family = AF_UNSPEC;
     char local_ip[CCL_IP_LEN];
     bool is_supported_iface = false;
     if (getifaddrs(&ifaddr) < 0) {
-        LOG_ERROR("fill_local_host_ip: can not get host IP");
-        return -1;
+        LOG_ERROR("can not get host IP");
+        return KVS_STATUS_FAILURE;
     }
 
     const char iface_name[] = "lo";
@@ -421,10 +422,10 @@ int internal_kvs::fill_local_host_ip() {
                     0,
                     NI_NUMERICHOST);
                 if (res != 0) {
-                    std::string s("fill_local_host_ip: getnameinfo error > ");
+                    std::string s("getnameinfo error > ");
                     s.append(gai_strerror(res));
                     LOG_ERROR(s.c_str());
-                    return -1;
+                    return KVS_STATUS_FAILURE;
                 }
 
                 local_host_ips.push_back(local_ip);
@@ -443,16 +444,18 @@ int internal_kvs::fill_local_host_ip() {
         }
     }
     if (local_host_ips.empty()) {
-        LOG_ERROR("fill_local_host_ip: can't find interface ",
-                  iface_name_env ? iface_name_env : "",
-                  " to get host IP");
-        return -1;
+        LOG_ERROR("can't find interface ", iface_name_env ? iface_name_env : "", " to get host IP");
+        return KVS_STATUS_FAILURE;
     }
 
     memset(local_host_ip, 0, CCL_IP_LEN);
 
     char* kvs_prefer_ipv6 = std::getenv(CCL_KVS_PREFER_IPV6_ENV.c_str());
-    size_t is_kvs_prefer_ipv6 = kvs_prefer_ipv6 ? safe_strtol(kvs_prefer_ipv6, nullptr, 10) : 0;
+    size_t is_kvs_prefer_ipv6 = 0;
+    if (kvs_prefer_ipv6) {
+        KVS_CHECK_STATUS(safe_strtol(kvs_prefer_ipv6, is_kvs_prefer_ipv6),
+                         "failed to set prefer_ip6");
+    }
 
     if (is_kvs_prefer_ipv6) {
         if (!local_host_ipv6s.empty()) {
@@ -480,25 +483,22 @@ int internal_kvs::fill_local_host_ip() {
     LOG_DEBUG("use ", address_family == AF_INET ? "ipv4" : "ipv6", ": ", local_host_ip);
 
     freeifaddrs(ifaddr);
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t internal_kvs::kvs_main_server_address_reserve(char* main_address) {
+kvs_status_t internal_kvs::kvs_main_server_address_reserve(char* main_address) {
     if (!server_address.empty())
-        return 0;
+        return KVS_STATUS_SUCCESS;
 
-    if (fill_local_host_ip() < 0) {
-        LOG_ERROR("reserve_main_address: failed to get local host IP");
-        exit(EXIT_FAILURE);
-    }
+    KVS_CHECK_STATUS(fill_local_host_ip(), "failed to get local host IP");
 
     if ((server_listen_sock = socket(address_family, SOCK_STREAM, 0)) < 0) {
-        LOG_ERROR("reserve_main_address: server_listen_sock init");
-        exit(EXIT_FAILURE);
+        LOG_ERROR("server_listen_sock init");
+        return KVS_STATUS_FAILURE;
     }
 
-    main_server_address->set_sin_addr(local_host_ip);
-    local_server_address->set_sin_addr(local_host_ip);
+    KVS_CHECK_STATUS(main_server_address->set_sin_addr(local_host_ip), "failed to set local_ip");
+    KVS_CHECK_STATUS(local_server_address->set_sin_addr(local_host_ip), "failed to set local_ip");
     size_t sin_port = main_server_address->get_sin_port();
 
     while (bind(server_listen_sock,
@@ -516,17 +516,14 @@ size_t internal_kvs::kvs_main_server_address_reserve(char* main_address) {
              "_%d",
              main_server_address->get_sin_port());
 
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t internal_kvs::init_main_server_address(const char* main_addr) {
+kvs_status_t internal_kvs::init_main_server_address(const char* main_addr) {
     char* ip_getting_type = std::getenv(CCL_KVS_IP_EXCHANGE_ENV.c_str());
 
     if (local_host_ips.empty()) {
-        if (fill_local_host_ip() < 0) {
-            LOG_ERROR("init_main_server_address: failed to get local host ip");
-            exit(EXIT_FAILURE);
-        }
+        KVS_CHECK_STATUS(fill_local_host_ip(), "failed to get local host ip");
     }
 
     if (ip_getting_type) {
@@ -538,28 +535,29 @@ size_t internal_kvs::init_main_server_address(const char* main_addr) {
         }
         else {
             LOG_ERROR("unknown ", CCL_KVS_IP_EXCHANGE_ENV, ": ", ip_getting_type);
-            return 1;
+            return KVS_STATUS_FAILURE;
         }
     }
 
     if (server_address.empty()) {
         if (main_addr != NULL) {
             ip_getting_mode = IGT_ENV;
-            if (server_listen_sock == 0)
-                init_main_server_by_string(main_addr);
-            return 0;
+            if (server_listen_sock == 0) {
+                KVS_CHECK_STATUS(init_main_server_by_string(main_addr),
+                                 "failed to init main server");
+            }
+            return KVS_STATUS_SUCCESS;
         }
     }
     else {
         ip_getting_mode = IGT_ENV;
     }
 
-    local_server_address->set_sin_addr(local_host_ip);
+    KVS_CHECK_STATUS(local_server_address->set_sin_addr(local_host_ip), "failed to set local_ip");
 
     if ((server_listen_sock = socket(address_family, SOCK_STREAM, 0)) < 0) {
-        ;
-        LOG_ERROR("init_main_server_address: server_listen_sock init");
-        exit(EXIT_FAILURE);
+        LOG_ERROR("server_listen_sock init");
+        return KVS_STATUS_FAILURE;
     }
 
     switch (ip_getting_mode) {
@@ -576,11 +574,9 @@ size_t internal_kvs::init_main_server_address(const char* main_addr) {
             return init_main_server_by_k8s();
         }
         case IGT_ENV: {
-            int res = init_main_server_by_env();
             int is_master_node = 0;
 
-            if (res)
-                return res;
+            KVS_CHECK_STATUS(init_main_server_by_env(), "failed to init_main_server_by_env");
 
             if (strstr(local_host_ip, main_host_ip)) {
                 is_master_node = 1;
@@ -592,7 +588,8 @@ size_t internal_kvs::init_main_server_address(const char* main_addr) {
                     is_master_node = 1;
                     memset(local_host_ip, 0, CCL_IP_LEN);
                     kvs_str_copy_known_sizes(local_host_ip, main_host_ip, CCL_IP_LEN);
-                    local_server_address->set_sin_addr(local_host_ip);
+                    KVS_CHECK_STATUS(local_server_address->set_sin_addr(local_host_ip),
+                                     "failed to set local_ip");
                 }
             }
             if (is_master_node) {
@@ -622,16 +619,16 @@ size_t internal_kvs::init_main_server_address(const char* main_addr) {
                 }
             }
 
-            return res;
+            return KVS_STATUS_SUCCESS;
         }
         default: {
             LOG_ERROR("unknown ", CCL_KVS_IP_EXCHANGE_ENV);
-            return 1;
+            return KVS_STATUS_FAILURE;
         }
     }
 }
 
-size_t internal_kvs::kvs_init(const char* main_addr) {
+kvs_status_t internal_kvs::kvs_init(const char* main_addr) {
     int err;
     socklen_t len = 0;
     std::shared_ptr<isockaddr> addr;
@@ -639,32 +636,32 @@ size_t internal_kvs::kvs_init(const char* main_addr) {
     time_t start_time;
     time_t connection_time = 0;
 
-    if (init_main_server_address(main_addr)) {
-        LOG_ERROR("kvs_init: init main server address error");
+    if (init_main_server_address(main_addr) != KVS_STATUS_SUCCESS) {
+        LOG_ERROR("init main server address error");
         close(client_op_sock);
         close(server_control_sock);
         client_op_sock = 0;
         server_control_sock = 0;
-        return 1;
+        return KVS_STATUS_FAILURE;
     }
 
     if (address_family == AF_INET) {
         addr = std::shared_ptr<isockaddr>(new sockaddr_v4());
-        addr->set_sin_addr("127.0.0.1");
+        KVS_CHECK_STATUS(addr->set_sin_addr("127.0.0.1"), "failed to set sin_addr(\"127.0.0.1\")");
     }
     else {
         addr = std::shared_ptr<isockaddr>(new sockaddr_v6());
-        addr->set_sin_addr("::1");
+        KVS_CHECK_STATUS(addr->set_sin_addr("::1"), "failed to set sin_addr(\"::1\")");
     }
 
     if ((client_op_sock = socket(address_family, SOCK_STREAM, 0)) < 0) {
-        LOG_ERROR("kvs_init: client_op_sock init");
-        return 1;
+        LOG_ERROR("client_op_sock init");
+        return KVS_STATUS_FAILURE;
     }
 
     if ((server_control_sock = socket(address_family, SOCK_STREAM, 0)) < 0) {
-        LOG_ERROR("kvs_init: server_control_sock init");
-        return 1;
+        LOG_ERROR("server_control_sock init");
+        return KVS_STATUS_FAILURE;
     }
 
     size_t sin_port = addr->get_sin_port();
@@ -674,8 +671,8 @@ size_t internal_kvs::kvs_init(const char* main_addr) {
     }
 
     if (listen(server_control_sock, 1) < 0) {
-        LOG_ERROR("kvs_init: server_control_sock listen");
-        exit(EXIT_FAILURE);
+        LOG_ERROR("server_control_sock listen");
+        return KVS_STATUS_FAILURE;
     }
 
     getsockname(server_control_sock, addr->get_sock_addr_ptr(), &len);
@@ -684,13 +681,13 @@ size_t internal_kvs::kvs_init(const char* main_addr) {
     args.sock_listener = server_listen_sock;
     err = pthread_create(&kvs_thread, nullptr, kvs_server_init, &args);
     if (err) {
-        LOG_ERROR("kvs_init: failed to create kvs server thread, pthread_create returns ", err);
-        return 1;
+        LOG_ERROR("failed to create kvs server thread, pthread_create returns ", err);
+        return KVS_STATUS_FAILURE;
     }
 
     if ((client_control_sock = accept(server_control_sock, nullptr, nullptr)) < 0) {
-        LOG_ERROR("kvs_init: server_control_sock accept");
-        exit(EXIT_FAILURE);
+        LOG_ERROR("server_control_sock accept");
+        return KVS_STATUS_FAILURE;
     }
 
     /* Wait connection to master */
@@ -702,11 +699,8 @@ size_t internal_kvs::kvs_init(const char* main_addr) {
     } while ((err < 0) && (connection_time < CONNECTION_TIMEOUT));
 
     if (connection_time >= CONNECTION_TIMEOUT) {
-        LOG_ERROR("kvs_init: connection error: timeout limit (",
-                  connection_time,
-                  " > ",
-                  CONNECTION_TIMEOUT);
-        exit(EXIT_FAILURE);
+        LOG_ERROR("connection time (", connection_time, ") >= limit (", CONNECTION_TIMEOUT, ")");
+        return KVS_STATUS_FAILURE;
     }
 
     if (strstr(main_host_ip, local_host_ip) && local_port == main_port) {
@@ -714,13 +708,11 @@ size_t internal_kvs::kvs_init(const char* main_addr) {
     }
     is_inited = true;
 
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t internal_kvs::kvs_finalize(void) {
+kvs_status_t internal_kvs::kvs_finalize(void) {
     kvs_request_t request;
-    memset(&request, 0, sizeof(kvs_request_t));
-
     close(client_op_sock);
     client_op_sock = 0;
     if (kvs_thread != 0) {
@@ -743,7 +735,8 @@ size_t internal_kvs::kvs_finalize(void) {
 
         err = pthread_join(kvs_thread, &exit_code);
         if (err) {
-            LOG_ERROR("kvs_finalize: failed to stop kvs server thread, pthread_join returns ", err);
+            LOG_ERROR("failed to stop kvs server thread, pthread_join returns ", err);
+            return KVS_STATUS_FAILURE;
         }
 
         kvs_thread = 0;
@@ -755,19 +748,21 @@ size_t internal_kvs::kvs_finalize(void) {
         server_control_sock = 0;
     }
 
-    if (ip_getting_mode == IGT_K8S)
-        request_k8s_kvs_finalize(is_master);
+    if (ip_getting_mode == IGT_K8S) {
+        KVS_CHECK_STATUS(request_k8s_kvs_finalize(is_master), "failed to finalize k8s kvs");
+    }
     is_inited = false;
 
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
 internal_kvs::~internal_kvs() {
-    if (is_inited)
-        kvs_finalize();
+    if (is_inited && kvs_finalize() != KVS_STATUS_SUCCESS) {
+        LOG_ERROR("failed to finalize kvs"); // never throw from a destructor
+    }
 }
 
-void sockaddr_v4::set_sin_addr(const char* src) {
+kvs_status_t sockaddr_v4::set_sin_addr(const char* src) {
     int ret = inet_pton(addr.sin_family, src, &(addr.sin_addr));
     if (ret <= 0) {
         if (ret == 0) {
@@ -782,17 +777,19 @@ void sockaddr_v4::set_sin_addr(const char* src) {
                       ", error: ",
                       strerror(errno));
         }
-        exit(1);
+        return KVS_STATUS_FAILURE;
     }
+    return KVS_STATUS_SUCCESS;
 }
 
-void sockaddr_v6::set_sin_addr(const char* src) {
+kvs_status_t sockaddr_v6::set_sin_addr(const char* src) {
     char src_copy[internal_kvs::CCL_IP_LEN] = { 0 };
     kvs_str_copy(src_copy, src, internal_kvs::CCL_IP_LEN);
 
     char* scope_id_ptr = nullptr;
     if ((scope_id_ptr = strchr(src_copy, internal_kvs::SCOPE_ID_DELIM))) {
-        addr.sin6_scope_id = safe_strtol(scope_id_ptr + 1, nullptr, 10);
+        KVS_CHECK_STATUS(safe_strtol(scope_id_ptr + 1, addr.sin6_scope_id),
+                         "failed to get sin6_scope_id");
         *scope_id_ptr = '\0';
     }
 
@@ -812,8 +809,9 @@ void sockaddr_v6::set_sin_addr(const char* src) {
                       ", error: ",
                       strerror(errno));
         }
-        exit(1);
+        return KVS_STATUS_FAILURE;
     }
 
-    LOG_DEBUG("addr: ", src_copy, ", scope_id: ", addr.sin6_scope_id);
+    LOG_DEBUG("addr: ", src_copy, ", scope_id: ", addr.sin6_scope_id);
+    return KVS_STATUS_SUCCESS;
 }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.h
index 7460426d2..01a56fffa 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.h
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.h
@@ -28,7 +28,7 @@ class isockaddr {
     virtual in_port_t get_sin_port() = 0;
     virtual void set_sin_port(in_port_t) = 0;
     virtual const void* get_sin_addr_ptr() = 0;
-    virtual void set_sin_addr(const char*) = 0;
+    virtual kvs_status_t set_sin_addr(const char*) = 0;
     virtual struct sockaddr* get_sock_addr_ptr() = 0;
     virtual sa_family_t sin_family() = 0;
     virtual size_t size() = 0;
@@ -40,35 +40,40 @@ class isockaddr {
 
 class internal_kvs final : public ikvs_wrapper {
 public:
-    size_t kvs_set_value(const char* kvs_name, const char* kvs_key, const char* kvs_val) override;
+    kvs_status_t kvs_set_value(const char* kvs_name,
+                               const char* kvs_key,
+                               const char* kvs_val) override;
 
-    size_t kvs_remove_name_key(const char* kvs_name, const char* kvs_key) override;
+    kvs_status_t kvs_remove_name_key(const char* kvs_name, const char* kvs_key) override;
 
-    size_t kvs_get_value_by_name_key(const char* kvs_name,
-                                     const char* kvs_key,
-                                     char* kvs_val) override;
+    kvs_status_t kvs_get_value_by_name_key(const char* kvs_name,
+                                           const char* kvs_key,
+                                           char* kvs_val) override;
 
-    size_t kvs_register(const char* kvs_name, const char* kvs_key, char* kvs_val);
+    kvs_status_t kvs_register(const char* kvs_name, const char* kvs_key, char* kvs_val);
 
-    size_t kvs_set_size(const char* kvs_name, const char* kvs_key, const char* kvs_val);
+    kvs_status_t kvs_set_size(const char* kvs_name, const char* kvs_key, const char* kvs_val);
 
-    size_t kvs_barrier_register(const char* kvs_name, const char* kvs_key, const char* kvs_val);
+    kvs_status_t kvs_barrier_register(const char* kvs_name,
+                                      const char* kvs_key,
+                                      const char* kvs_val);
 
-    void kvs_barrier(const char* kvs_name, const char* kvs_key, const char* kvs_val);
+    kvs_status_t kvs_barrier(const char* kvs_name, const char* kvs_key, const char* kvs_val);
 
-    size_t kvs_init(const char* main_addr) override;
+    kvs_status_t kvs_init(const char* main_addr) override;
 
-    size_t kvs_main_server_address_reserve(char* main_addr) override;
+    kvs_status_t kvs_main_server_address_reserve(char* main_addr) override;
 
-    size_t kvs_get_count_names(const char* kvs_name) override;
+    kvs_status_t kvs_get_count_names(const char* kvs_name, int& count_names) override;
 
-    size_t kvs_finalize() override;
+    kvs_status_t kvs_finalize() override;
 
-    size_t kvs_get_keys_values_by_name(const char* kvs_name,
-                                       char*** kvs_keys,
-                                       char*** kvs_values) override;
+    kvs_status_t kvs_get_keys_values_by_name(const char* kvs_name,
+                                             char*** kvs_keys,
+                                             char*** kvs_values,
+                                             size_t& count) override;
 
-    size_t kvs_get_replica_size() override;
+    kvs_status_t kvs_get_replica_size(size_t& replica_size) override;
 
     ~internal_kvs() override;
 
@@ -80,11 +85,11 @@ class internal_kvs final : public ikvs_wrapper {
     static const char SCOPE_ID_DELIM = '%';
 
 private:
-    size_t init_main_server_by_string(const char* main_addr);
-    size_t init_main_server_by_env();
-    size_t init_main_server_by_k8s();
-    size_t init_main_server_address(const char* main_addr);
-    int fill_local_host_ip();
+    kvs_status_t init_main_server_by_string(const char* main_addr);
+    kvs_status_t init_main_server_by_env();
+    kvs_status_t init_main_server_by_k8s();
+    kvs_status_t init_main_server_address(const char* main_addr);
+    kvs_status_t fill_local_host_ip();
     bool is_inited{ false };
 
     pthread_t kvs_thread = 0;
@@ -151,7 +156,7 @@ class sockaddr_v4 : public isockaddr {
     const void* get_sin_addr_ptr() override {
         return &(addr.sin_addr);
     }
-    void set_sin_addr(const char* src) override;
+    kvs_status_t set_sin_addr(const char* src) override;
     sa_family_t sin_family() override {
         return addr.sin_family;
     }
@@ -180,7 +185,7 @@ class sockaddr_v6 : public isockaddr {
     const void* get_sin_addr_ptr() override {
         return &(addr.sin6_addr);
     }
-    void set_sin_addr(const char* src) override;
+    kvs_status_t set_sin_addr(const char* src) override;
     struct sockaddr* get_sock_addr_ptr() override {
         return (struct sockaddr*)&addr;
     }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.cpp
index 7a4cba47f..6ac605e6a 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.cpp
@@ -32,10 +32,10 @@
 class server {
 public:
     server() = default;
-    void run(void*);
-    bool check_finalize();
-    void make_client_request(int& socket);
-    void try_to_connect_new();
+    kvs_status_t run(void*);
+    kvs_status_t check_finalize(bool& to_finalize);
+    kvs_status_t make_client_request(int& socket);
+    kvs_status_t try_to_connect_new();
 
 private:
     struct clients_info {
@@ -82,7 +82,7 @@ class server {
     sa_family_t address_family{ AF_UNSPEC };
 };
 
-void server::try_to_connect_new() {
+kvs_status_t server::try_to_connect_new() {
     if (poll_fds[FDI_LISTENER].revents != 0) {
         std::shared_ptr<isockaddr> addr;
 
@@ -98,8 +98,8 @@ void server::try_to_connect_new() {
         if ((new_socket = accept(poll_fds[FDI_LISTENER].fd,
                                  addr->get_sock_addr_ptr(),
                                  (socklen_t*)&peer_addr_size)) < 0) {
-            perror("server: server_listen_sock accept");
-            exit(EXIT_FAILURE);
+            LOG_ERROR("server_listen_sock accept, ", strerror(errno));
+            return KVS_STATUS_FAILURE;
         }
         for (size_t i = FDI_LAST; i < poll_fds.size(); i++) {
             if (poll_fds[i].fd == free_socket) {
@@ -117,16 +117,17 @@ void server::try_to_connect_new() {
             }
         }
     }
+    return KVS_STATUS_SUCCESS;
 }
 
-void server::make_client_request(int& socket) {
+kvs_status_t server::make_client_request(int& socket) {
     DO_RW_OP_1(
         read, socket, &request, sizeof(kvs_request_t), ret, "server: get command from client");
     if (ret == 0) {
         close(socket);
         socket = free_socket;
         client_count--;
-        return;
+        return KVS_STATUS_SUCCESS;
     }
 
     switch (request.mode) {
@@ -184,8 +185,10 @@ void server::make_client_request(int& socket) {
         }
         case AM_GET_REPLICA: {
             char* replica_size_str = getenv(CCL_WORLD_SIZE_ENV);
-            count = (replica_size_str != nullptr) ? safe_strtol(replica_size_str, nullptr, 10)
-                                                  : client_count;
+            count = client_count;
+            if (replica_size_str != nullptr) {
+                KVS_CHECK_STATUS(safe_strtol(replica_size_str, count), "failed to convert count");
+            }
             DO_RW_OP(
                 write, socket, &count, sizeof(size_t), server_memory_mutex, "server: get_replica");
             break;
@@ -236,8 +239,8 @@ void server::make_client_request(int& socket) {
                                           });
             if (client_it == clients.end()) {
                 // TODO: Look deeper to fix this error
-                printf("Server error: Unregister Barrier request!");
-                exit(1);
+                LOG_ERROR("Server error: Unregister Barrier request!");
+                return KVS_STATUS_FAILURE;
             }
             auto client_inf = client_it->get();
             client_inf->in_barrier = true;
@@ -275,9 +278,14 @@ void server::make_client_request(int& socket) {
             else {
                 local_size[0] = '\0';
                 local_size++;
-                barrier.local_size += safe_strtol(local_size, nullptr, 10);
+                size_t local_size_tmp;
+
+                KVS_CHECK_STATUS(safe_strtol(local_size, local_size_tmp),
+                                 "failed to convert local_size");
+                barrier.local_size += local_size_tmp;
             }
-            barrier.global_size = safe_strtol(glob_size, nullptr, 10);
+            KVS_CHECK_STATUS(safe_strtol(glob_size, barrier.global_size),
+                             "failed to convert global_size");
 
             barrier.clients.push_back(
                 std::shared_ptr<clients_info>(new clients_info(socket, false)));
@@ -285,7 +293,8 @@ void server::make_client_request(int& socket) {
         }
         case AM_SET_SIZE: {
             char* glob_size = request.val;
-            communicators[request.key].global_size = safe_strtol(glob_size, nullptr, 10);
+            KVS_CHECK_STATUS(safe_strtol(glob_size, communicators[request.key].global_size),
+                             "failed to convert global_size");
 
             break;
         }
@@ -301,7 +310,9 @@ void server::make_client_request(int& socket) {
             char* thread_id = strstr(proc_id, "_");
             thread_id[0] = '\0';
             thread_id++;
-            size_t rank_count = safe_strtol(rank_count_str, nullptr, 10);
+            size_t rank_count;
+            KVS_CHECK_STATUS(safe_strtol(rank_count_str, rank_count),
+                             "failed to convert rank_count");
             communicators[request.key].local_size += rank_count;
             socket_info sock_info{ socket, proc_id, { rank, rank_count, thread_id } };
             communicator.processes[proc_id].push_back(sock_info.process_info);
@@ -351,15 +362,16 @@ void server::make_client_request(int& socket) {
         }
         default: {
             if (request.name[0] == '\0')
-                return;
-            printf("server: unknown request mode - %d.\n", request.mode);
-            exit(EXIT_FAILURE);
+                return KVS_STATUS_SUCCESS;
+            LOG_ERROR("unknown request mode - ", static_cast<int>(request.mode));
+            return KVS_STATUS_FAILURE;
         }
     }
+    return KVS_STATUS_SUCCESS;
 }
 
-bool server::check_finalize() {
-    bool to_finalize = false;
+kvs_status_t server::check_finalize(bool& to_finalize) {
+    to_finalize = false;
     if (poll_fds[FDI_CONTROL].revents != 0) {
         DO_RW_OP_1(read,
                    poll_fds[FDI_CONTROL].fd,
@@ -372,15 +384,15 @@ bool server::check_finalize() {
             poll_fds[FDI_CONTROL].fd = free_socket;
         }
         if (request.mode != AM_FINALIZE) {
-            printf("server: invalid access mode for local socket\n");
-            exit(EXIT_FAILURE);
+            LOG_ERROR("invalid access mode for local socket");
+            return KVS_STATUS_FAILURE;
         }
         to_finalize = true;
     }
-    return to_finalize;
+    return KVS_STATUS_SUCCESS;
 }
 
-void server::run(void* args) {
+kvs_status_t server::run(void* args) {
     bool should_stop = false;
     int so_reuse = 1;
     poll_fds.resize(client_count_increase);
@@ -398,13 +410,13 @@ void server::run(void* args) {
 #endif
 
     if (listen(poll_fds[FDI_LISTENER].fd, max_client_queue_size) < 0) {
-        LOG_ERROR("server: server_listen_sock listen");
-        exit(EXIT_FAILURE);
+        LOG_ERROR("server_listen_sock listen(", strerror(errno), ")");
+        return KVS_STATUS_FAILURE;
     }
 
     if ((poll_fds[FDI_CONTROL].fd = socket(address_family, SOCK_STREAM, 0)) < 0) {
-        perror("server: server_control_sock init");
-        exit(EXIT_FAILURE);
+        LOG_ERROR("server_control_sock init(", strerror(errno), ")");
+        return KVS_STATUS_FAILURE;
     }
 
     while (connect(poll_fds[FDI_CONTROL].fd,
@@ -414,8 +426,8 @@ void server::run(void* args) {
     while (!should_stop || client_count > 0) {
         if (poll(poll_fds.data(), poll_fds.size(), -1) < 0) {
             if (errno != EINTR) {
-                perror("server: poll");
-                exit(EXIT_FAILURE);
+                LOG_ERROR("poll(", strerror(errno), ")");
+                return KVS_STATUS_FAILURE;
             }
             else {
                 /* restart select */
@@ -425,12 +437,12 @@ void server::run(void* args) {
 
         for (size_t i = FDI_LAST; i < poll_fds.size(); i++) {
             if (poll_fds[i].fd != free_socket && poll_fds[i].revents != 0) {
-                make_client_request(poll_fds[i].fd);
+                KVS_CHECK_STATUS(make_client_request(poll_fds[i].fd), "failed to make request");
             }
         }
-        try_to_connect_new();
+        KVS_CHECK_STATUS(try_to_connect_new(), "failed to connect new");
         if (!should_stop) {
-            should_stop = check_finalize();
+            KVS_CHECK_STATUS(check_finalize(should_stop), "failed to check finalize");
         }
     }
 
@@ -455,12 +467,15 @@ void server::run(void* args) {
 
     close(poll_fds[FDI_LISTENER].fd);
     poll_fds[FDI_LISTENER].fd = free_socket;
+    return KVS_STATUS_SUCCESS;
 }
 
 void* kvs_server_init(void* args) {
     server s;
 
-    s.run(args);
+    if (s.run(args) != KVS_STATUS_SUCCESS) {
+        LOG_ERROR("failed");
+    }
 
     return nullptr;
 }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.hpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.hpp
index 12590aa54..2c8d17cd5 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.hpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.hpp
@@ -32,10 +32,10 @@ typedef enum kvs_access_mode {
 } kvs_access_mode_t;
 
 typedef struct kvs_request {
-    kvs_access_mode_t mode;
-    char name[MAX_KVS_NAME_LENGTH];
-    char key[MAX_KVS_KEY_LENGTH];
-    char val[MAX_KVS_VAL_LENGTH];
+    kvs_access_mode_t mode{ AM_PUT };
+    char name[MAX_KVS_NAME_LENGTH]{};
+    char key[MAX_KVS_KEY_LENGTH]{};
+    char val[MAX_KVS_VAL_LENGTH]{};
 } kvs_request_t;
 
 typedef struct server_args {
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.cpp
index e648f1fbc..ff039a5f3 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.cpp
@@ -20,70 +20,80 @@
 
 users_kvs::users_kvs(std::shared_ptr<ccl::kvs_interface> kvs) : kvs(kvs) {}
 
-size_t users_kvs::kvs_set_value(const char* kvs_name, const char* kvs_key, const char* kvs_val) {
+kvs_status_t users_kvs::kvs_set_value(const char* kvs_name,
+                                      const char* kvs_key,
+                                      const char* kvs_val) {
     ccl::string_class name(kvs_name), key(kvs_key);
     ccl::vector_class<char> vec_val(kvs_val, kvs_val + strlen(kvs_val) + 1);
     vec_val[strlen(kvs_val)] = '\0';
     kvs->set(name + key, vec_val);
 
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t users_kvs::kvs_remove_name_key(const char* kvs_name, const char* kvs_key) {
+kvs_status_t users_kvs::kvs_remove_name_key(const char* kvs_name, const char* kvs_key) {
     ccl::vector_class<char> kvs_val = { '\0' };
     ccl::string_class name(kvs_name), key(kvs_key);
     kvs->set(name + key, kvs_val);
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t users_kvs::kvs_get_value_by_name_key(const char* kvs_name,
-                                            const char* kvs_key,
-                                            char* kvs_val) {
+kvs_status_t users_kvs::kvs_get_value_by_name_key(const char* kvs_name,
+                                                  const char* kvs_key,
+                                                  char* kvs_val) {
     ccl::string_class name(kvs_name), key(kvs_key);
     ccl::vector_class<char> res = kvs->get(name + key);
 
+    memset(kvs_val, 0, MAX_KVS_VAL_LENGTH);
     if (res.data())
         SET_STR(kvs_val, MAX_KVS_VAL_LENGTH, "%s", res.data());
     else
         SET_STR(kvs_val, MAX_KVS_VAL_LENGTH, "%s", "");
 
-    return strlen(kvs_val);
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t users_kvs::kvs_get_count_names(const char* kvs_name) {
+kvs_status_t users_kvs::kvs_get_count_names(const char* kvs_name, int& count_names) {
     /*TODO: Unsupported*/
     (void)kvs_name;
-    return 0;
+    LOG_ERROR("unsupported");
+    return KVS_STATUS_UNSUPPORTED;
 }
 
-size_t users_kvs::kvs_get_keys_values_by_name(const char* kvs_name,
-                                              char*** kvs_keys,
-                                              char*** kvs_values) {
+kvs_status_t users_kvs::kvs_get_keys_values_by_name(const char* kvs_name,
+                                                    char*** kvs_keys,
+                                                    char*** kvs_values,
+                                                    size_t& count) {
     /*TODO: Unsupported*/
     (void)kvs_name;
     (void)kvs_keys;
     (void)kvs_values;
-    return 0;
+    LOG_ERROR("unsupported");
+    return KVS_STATUS_UNSUPPORTED;
 }
 
-size_t users_kvs::kvs_get_replica_size(void) {
+kvs_status_t users_kvs::kvs_get_replica_size(size_t& replica_size) {
     /*TODO: Unsupported*/
-    return 0;
+    LOG_ERROR("unsupported");
+    return KVS_STATUS_UNSUPPORTED;
 }
 
-size_t users_kvs::kvs_main_server_address_reserve(char* main_address) {
+kvs_status_t users_kvs::kvs_main_server_address_reserve(char* main_address) {
     /*TODO: Unsupported*/
     (void)main_address;
-    return 0;
+    LOG_ERROR("unsupported");
+    return KVS_STATUS_UNSUPPORTED;
 }
 
-size_t users_kvs::kvs_init(const char* main_addr) {
+kvs_status_t users_kvs::kvs_init(const char* main_addr) {
     /*TODO: Unsupported*/
     (void)main_addr;
-    return 0;
+    LOG_ERROR("unsupported");
+    return KVS_STATUS_UNSUPPORTED;
 }
 
-size_t users_kvs::kvs_finalize(void) {
+kvs_status_t users_kvs::kvs_finalize(void) {
     /*TODO: Unsupported*/
-    return 0;
+    LOG_ERROR("unsupported");
+    return KVS_STATUS_UNSUPPORTED;
 }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.h
index 1d220ebff..6d180e764 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.h
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.h
@@ -27,27 +27,30 @@ class users_kvs final : public ikvs_wrapper {
 
     ~users_kvs() = default;
 
-    size_t kvs_set_value(const char* kvs_name, const char* kvs_key, const char* kvs_val) override;
+    kvs_status_t kvs_set_value(const char* kvs_name,
+                               const char* kvs_key,
+                               const char* kvs_val) override;
 
-    size_t kvs_remove_name_key(const char* kvs_name, const char* kvs_key) override;
+    kvs_status_t kvs_remove_name_key(const char* kvs_name, const char* kvs_key) override;
 
-    size_t kvs_get_value_by_name_key(const char* kvs_name,
-                                     const char* kvs_key,
-                                     char* kvs_val) override;
+    kvs_status_t kvs_get_value_by_name_key(const char* kvs_name,
+                                           const char* kvs_key,
+                                           char* kvs_val) override;
 
-    size_t kvs_init(const char* main_addr) override;
+    kvs_status_t kvs_init(const char* main_addr) override;
 
-    size_t kvs_main_server_address_reserve(char* main_addr) override;
+    kvs_status_t kvs_main_server_address_reserve(char* main_addr) override;
 
-    size_t kvs_get_count_names(const char* kvs_name) override;
+    kvs_status_t kvs_get_count_names(const char* kvs_name, int& count_names) override;
 
-    size_t kvs_finalize(void) override;
+    kvs_status_t kvs_finalize(void) override;
 
-    size_t kvs_get_keys_values_by_name(const char* kvs_name,
-                                       char*** kvs_keys,
-                                       char*** kvs_values) override;
+    kvs_status_t kvs_get_keys_values_by_name(const char* kvs_name,
+                                             char*** kvs_keys,
+                                             char*** kvs_values,
+                                             size_t& count) override;
 
-    size_t kvs_get_replica_size(void) override;
+    kvs_status_t kvs_get_replica_size(size_t& replica_size) override;
 
 private:
     std::shared_ptr<ccl::kvs_interface> kvs;
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp
index 6f1ea920b..cb2e4d157 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp
@@ -29,11 +29,6 @@
 
 #define LISTENER_TIMEOUT 5
 
-enum return_status {
-    get_new = 0,
-    timeout = 1,
-};
-
 static int sock_sender;
 static size_t num_listeners;
 static int sock_listener = -1;
@@ -45,10 +40,10 @@ void pmi_listener::set_applied_count(int count) {
     num_changes -= count;
 }
 
-int pmi_listener::collect_sock_addr(std::shared_ptr<helper> h) {
+kvs_status_t pmi_listener::collect_sock_addr(std::shared_ptr<helper> h) {
     FILE* fp;
     size_t i, j;
-    int res = 0;
+    kvs_status_t res = KVS_STATUS_SUCCESS;
     size_t glob_num_listeners;
     char** sock_addr_str = NULL;
     char** hosts_names_str = NULL;
@@ -56,8 +51,8 @@ int pmi_listener::collect_sock_addr(std::shared_ptr<helper> h) {
     char* point_to_space;
 
     if ((fp = popen(GET_IP_CMD, READ_ONLY)) == NULL) {
-        printf("Can't get host IP\n");
-        exit(1);
+        LOG_ERROR("Can't get host IP");
+        return KVS_STATUS_FAILURE;
     }
     CHECK_FGETS(fgets(my_ip, MAX_KVS_VAL_LENGTH, fp), my_ip);
     pclose(fp);
@@ -66,7 +61,9 @@ int pmi_listener::collect_sock_addr(std::shared_ptr<helper> h) {
     if ((point_to_space = strstr(my_ip, " ")) != NULL)
         point_to_space[0] = NULL_CHAR;
 
-    glob_num_listeners = h->get_keys_values_by_name(KVS_LISTENER, &hosts_names_str, &sock_addr_str);
+    KVS_CHECK_STATUS(h->get_keys_values_by_name(
+                         KVS_LISTENER, &hosts_names_str, &sock_addr_str, glob_num_listeners),
+                     "failed to get sock info");
     num_listeners = glob_num_listeners;
 
     for (i = 0; i < num_listeners; i++) {
@@ -77,13 +74,13 @@ int pmi_listener::collect_sock_addr(std::shared_ptr<helper> h) {
     }
 
     if (num_listeners == 0) {
-        res = 0;
+        res = KVS_STATUS_SUCCESS;
         goto exit;
     }
 
     if ((sock_sender = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
-        printf("\n Socket creation error \n");
-        res = -1;
+        LOG_ERROR("Socket creation error");
+        res = KVS_STATUS_FAILURE;
         goto exit;
     }
 
@@ -93,8 +90,8 @@ int pmi_listener::collect_sock_addr(std::shared_ptr<helper> h) {
 
     server_addresses = (struct sockaddr_in*)malloc((num_listeners) * sizeof(struct sockaddr_in));
     if (server_addresses == NULL) {
-        printf("\nmemory allocation failed \n");
-        res = -1;
+        LOG_ERROR("memory allocation failed");
+        res = KVS_STATUS_FAILURE;
         goto exit;
     }
 
@@ -102,8 +99,8 @@ int pmi_listener::collect_sock_addr(std::shared_ptr<helper> h) {
     for (i = 0, j = 0; i < num_listeners; i++, j++) {
         char* point_to_port = strstr(sock_addr_str[j], "_");
         if (point_to_port == NULL) {
-            printf("\nlistener: Wrong address_port record: %s\n", sock_addr_str[j]);
-            res = -1;
+            LOG_ERROR("Wrong address_port record: ", sock_addr_str[j]);
+            res = KVS_STATUS_FAILURE;
             goto exit;
         }
         point_to_port[0] = NULL_CHAR;
@@ -113,12 +110,16 @@ int pmi_listener::collect_sock_addr(std::shared_ptr<helper> h) {
             continue;
         }
 
-        server_addresses[i].sin_port = safe_strtol(point_to_port, NULL, 10);
+        if (safe_strtol(point_to_port, server_addresses[i].sin_port) != KVS_STATUS_SUCCESS) {
+            LOG_ERROR("failed to convert sin_port");
+            res = KVS_STATUS_FAILURE;
+            goto exit;
+        }
         server_addresses[i].sin_family = AF_INET;
 
         if (inet_pton(AF_INET, sock_addr_str[j], &(server_addresses[i].sin_addr)) <= 0) {
-            printf("\nlist: Invalid address/ Address not supported: %s\n", sock_addr_str[j]);
-            res = -1;
+            LOG_ERROR("Invalid address/ Address not supported: ", sock_addr_str[j]);
+            res = KVS_STATUS_FAILURE;
             goto exit;
         }
     }
@@ -132,16 +133,17 @@ int pmi_listener::collect_sock_addr(std::shared_ptr<helper> h) {
     return res;
 }
 
-void pmi_listener::clean_listener(std::shared_ptr<helper> h) {
-    h->remove_name_key(KVS_LISTENER, my_hostname);
+kvs_status_t pmi_listener::clean_listener(std::shared_ptr<helper> h) {
+    KVS_CHECK_STATUS(h->remove_name_key(KVS_LISTENER, my_hostname), "failed to remove host info");
     close(sock_listener);
+    return KVS_STATUS_SUCCESS;
 }
 
-void pmi_listener::send_notification(int sig, std::shared_ptr<helper> h) {
+kvs_status_t pmi_listener::send_notification(int sig, std::shared_ptr<helper> h) {
     size_t i;
     char message[INT_STR_SIZE];
 
-    collect_sock_addr(h);
+    KVS_CHECK_STATUS(collect_sock_addr(h), "failed to collect sock info");
 
     SET_STR(message, INT_STR_SIZE, "%s", "Update!");
     for (i = 0; i < num_listeners; ++i) {
@@ -152,11 +154,13 @@ void pmi_listener::send_notification(int sig, std::shared_ptr<helper> h) {
                (const struct sockaddr*)&(server_addresses[i]),
                sizeof(server_addresses[i]));
     }
-    if (sig)
-        clean_listener(h);
+    if (sig) {
+        KVS_CHECK_STATUS(clean_listener(h), "failed to clean listener");
+    }
+    return KVS_STATUS_SUCCESS;
 }
 
-int pmi_listener::run_listener(std::shared_ptr<helper> h) {
+kvs_status_t pmi_listener::run_listener(std::shared_ptr<helper> h) {
     socklen_t len = 0;
     char recv_buf[INT_STR_SIZE];
     memset(recv_buf, 0, INT_STR_SIZE);
@@ -181,8 +185,10 @@ int pmi_listener::run_listener(std::shared_ptr<helper> h) {
             my_ip[strlen(my_ip) - 1] = '\0';
         if ((point_to_space = strstr(my_ip, " ")) != NULL)
             point_to_space[0] = NULL_CHAR;
-        if ((sock_listener = socket(AF_INET, SOCK_DGRAM, 0)) < 0)
-            return 1;
+        if ((sock_listener = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
+            LOG_ERROR("socket error(", strerror(errno), ")");
+            return KVS_STATUS_FAILURE;
+        }
 
         memset(&addr, 0, sizeof(addr));
 
@@ -190,14 +196,17 @@ int pmi_listener::run_listener(std::shared_ptr<helper> h) {
         addr.sin_addr.s_addr = INADDR_ANY;
         addr.sin_port = 0;
 
-        if (bind(sock_listener, (const struct sockaddr*)&addr, sizeof(addr)) < 0)
-            return 1;
+        if (bind(sock_listener, (const struct sockaddr*)&addr, sizeof(addr)) < 0) {
+            LOG_ERROR("bind error(", strerror(errno), ")");
+            return KVS_STATUS_FAILURE;
+        }
 
         getsockname(sock_listener, (struct sockaddr*)&addr, (socklen_t*)&addr_len);
 
         SET_STR(
             addr_for_kvs, REQUEST_POSTFIX_SIZE, KVS_NAME_TEMPLATE_I, my_ip, (size_t)addr.sin_port);
-        h->set_value(KVS_LISTENER, my_hostname, addr_for_kvs);
+        KVS_CHECK_STATUS(h->set_value(KVS_LISTENER, my_hostname, addr_for_kvs),
+                         "failed to set addr info");
         if (setsockopt(sock_listener, SOL_SOCKET, SO_RCVTIMEO, &timeout, sizeof(timeout)) < 0) {
             perror("Error");
         }
@@ -213,14 +222,15 @@ int pmi_listener::run_listener(std::shared_ptr<helper> h) {
                            &len);
         if (ret == -1) {
             if (errno == EAGAIN) {
-                return timeout;
+                return KVS_STATUS_SUCCESS;
             }
             if (errno != EINTR) {
-                printf("listner: accept error: %s\n", strerror(errno));
+                LOG_ERROR("listener: accept error: ", strerror(errno));
+                return KVS_STATUS_FAILURE;
             }
         }
         num_changes++;
     }
 
-    return get_new;
+    return KVS_STATUS_SUCCESS;
 }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.hpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.hpp
index e845ea50d..7616dafbb 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.hpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.hpp
@@ -20,14 +20,14 @@
 
 class pmi_listener {
 public:
-    void send_notification(int sig, std::shared_ptr<helper> h);
+    kvs_status_t send_notification(int sig, std::shared_ptr<helper> h);
 
     void set_applied_count(int count);
 
-    int run_listener(std::shared_ptr<helper> h);
+    kvs_status_t run_listener(std::shared_ptr<helper> h);
 
 private:
-    int collect_sock_addr(std::shared_ptr<helper> h);
-    void clean_listener(std::shared_ptr<helper> h);
+    kvs_status_t collect_sock_addr(std::shared_ptr<helper> h);
+    kvs_status_t clean_listener(std::shared_ptr<helper> h);
 };
 #endif
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.cpp
deleted file mode 100644
index 712c69562..000000000
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "rank_list.hpp"
-
-void rank_list_sort(rank_list_t* list) {
-    rank_list_t* left = list;
-    rank_list_t* right;
-
-    while (left != NULL) {
-        right = left->next;
-        while (right != NULL) {
-            if (left->rank > right->rank) {
-                int tmp_i = left->rank;
-                left->rank = right->rank;
-                right->rank = tmp_i;
-            }
-            right = right->next;
-        }
-        left = left->next;
-    }
-}
-
-void rank_list_clean(rank_list_t** list) {
-    rank_list_t* cur_list = *list;
-    rank_list_t* node_to_remove;
-
-    while (cur_list != NULL) {
-        node_to_remove = cur_list;
-        cur_list = cur_list->next;
-        free(node_to_remove);
-    }
-    *list = NULL;
-}
-
-size_t rank_list_contains(rank_list_t* list, int rank) {
-    rank_list_t* cur_list = list;
-
-    while (cur_list != NULL) {
-        if (cur_list->rank == rank)
-            return 1;
-        cur_list = cur_list->next;
-    }
-    return 0;
-}
-
-void rank_list_keep_first_n(rank_list_t** origin_list, size_t n) {
-    rank_list_t* cur_node = (*origin_list);
-    rank_list_t* tmp_node = NULL;
-    size_t i;
-
-    for (i = 0; i < n; i++) {
-        tmp_node = cur_node;
-        cur_node = cur_node->next;
-    }
-
-    if (tmp_node != NULL)
-        tmp_node->next = NULL;
-
-    while (cur_node != NULL) {
-        tmp_node = cur_node;
-        cur_node = cur_node->next;
-        free(tmp_node);
-    }
-    if (n == 0)
-        (*origin_list) = NULL;
-}
-
-void rank_list_add(rank_list_t** origin_list, int rank) {
-    if ((*origin_list) == NULL) {
-        (*origin_list) = (rank_list_t*)malloc(sizeof(rank_list_t));
-        if ((*origin_list) == NULL) {
-            printf("Memory allocation failed\n");
-            return;
-        }
-        (*origin_list)->next = NULL;
-        (*origin_list)->rank = rank;
-    }
-    else {
-        rank_list_t* cur_list;
-        cur_list = (*origin_list);
-        while (cur_list->next != NULL)
-            cur_list = cur_list->next;
-        cur_list->next = (rank_list_t*)malloc(sizeof(rank_list_t));
-        cur_list = cur_list->next;
-        cur_list->next = NULL;
-        cur_list->rank = rank;
-    }
-}
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.hpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.hpp
deleted file mode 100644
index 064e244d6..000000000
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#ifndef INT_LIST_H_INCLUDED
-#define INT_LIST_H_INCLUDED
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-typedef struct rank_list {
-    int rank;
-    struct rank_list* next;
-} rank_list_t;
-
-size_t rank_list_contains(rank_list_t* list, int rank);
-
-void rank_list_clean(rank_list_t** list);
-
-void rank_list_sort(rank_list_t* list);
-
-void rank_list_keep_first_n(rank_list_t** origin_list, size_t n);
-
-void rank_list_add(rank_list_t** origin_list, int rank);
-
-#ifdef __cplusplus
-}
-#endif
-#endif
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.cpp
index 7d525b7e5..6886e4732 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.cpp
@@ -55,6 +55,14 @@ char master_addr[MAX_KVS_NAME_LENGTH];
 #define GET_KEY "| sed -r 's/\"[a-zA-Z0-9_]*-|: \"[a-zA-Z0-9_-]*|,|\"| |//g'"
 #define GET_VAL "| sed -r 's/[a-zA-Z0-9_-]*\":|,|\"| |//g'"
 
+#define CHECK_STR(expr, str) \
+    do { \
+        if (!(expr)) { \
+            LOG_ERROR("wrong str: ", str); \
+            return KVS_STATUS_FAILURE; \
+        } \
+    } while (0)
+
 char run_get_template[RUN_TEMPLATE_SIZE];
 char run_set_template[RUN_TEMPLATE_SIZE];
 char job_name[MAX_KVS_NAME_LENGTH];
@@ -66,19 +74,22 @@ typedef enum manager_type {
 
 manager_type_t manager;
 
-size_t request_k8s_get_keys_values_by_name(const char* kvs_name,
-                                           char*** kvs_key,
-                                           char*** kvs_values);
+kvs_status_t request_k8s_get_keys_values_by_name(const char* kvs_name,
+                                                 char*** kvs_key,
+                                                 char*** kvs_values,
+                                                 int& values_count);
 
-size_t request_k8s_get_count_names(const char* kvs_name);
+kvs_status_t request_k8s_get_count_names(const char* kvs_name, size_t& res);
 
-size_t request_k8s_get_val_by_name_key(const char* kvs_name, const char* kvs_key, char* kvs_val);
+kvs_status_t request_k8s_get_val_by_name_key(const char* kvs_name,
+                                             const char* kvs_key,
+                                             char* kvs_val);
 
-size_t request_k8s_remove_name_key(const char* kvs_name, const char* kvs_key);
+kvs_status_t request_k8s_remove_name_key(const char* kvs_name, const char* kvs_key);
 
-size_t request_k8s_set_val(const char* kvs_name, const char* kvs_key, const char* kvs_val);
+kvs_status_t request_k8s_set_val(const char* kvs_name, const char* kvs_key, const char* kvs_val);
 
-void json_get_val(FILE* fp, const char** keys, size_t keys_count, char* val) {
+kvs_status_t json_get_val(FILE* fp, const char** keys, size_t keys_count, char* val) {
     char cur_kvs_str[MAX_KVS_STR_LENGTH];
     char* res;
     char last_char;
@@ -101,26 +112,33 @@ void json_get_val(FILE* fp, const char** keys, size_t keys_count, char* val) {
                 wrong_namespace_depth--;
         }
     }
-    res = strstr(cur_kvs_str, ":");
-    res++;
-    while (res[0] == ' ')
+    CHECK_STR(res = strstr(cur_kvs_str, ":"), cur_kvs_str);
+    do {
         res++;
+        CHECK_STR(res, cur_kvs_str);
+    } while (res[0] == ' ');
 
-    if (res[0] == '"' || res[0] == '\'')
+    if (res[0] == '"' || res[0] == '\'') {
         res++;
+        CHECK_STR(res, cur_kvs_str);
+    }
 
-    last_char = res[strlen(res) - 1];
+    int str_len = strlen(res) - 1;
+    last_char = res[str_len];
     while (last_char == '\n' || last_char == ',' || last_char == ' ' || last_char == '"' ||
            last_char == ' ') {
-        res[strlen(res) - 1] = '\0';
-        last_char = res[strlen(res) - 1];
+        res[str_len] = '\0';
+        str_len--;
+        CHECK_STR(str_len >= 0, cur_kvs_str); // index 0 is still a valid last character
+        last_char = res[str_len];
     }
     kvs_str_copy(val, res, MAX_KVS_VAL_LENGTH);
     while (fgets(cur_kvs_str, MAX_KVS_STR_LENGTH, fp)) {
     }
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t k8s_init_with_manager() {
+kvs_status_t k8s_init_with_manager() {
     FILE* fp;
     FILE* fp_name;
     FILE* fp_type;
@@ -137,16 +155,21 @@ size_t k8s_init_with_manager() {
     char pod_name[MAX_KVS_VAL_LENGTH];
     memset(pod_name, '\0', MAX_KVS_VAL_LENGTH);
     if ((fp = popen("hostname", READ_ONLY)) == NULL) {
-        printf("Can't get hostname\n");
-        exit(1);
+        LOG_ERROR("Can't get hostname\n");
+        return KVS_STATUS_FAILURE;
     }
     CHECK_FGETS(fgets(pod_name, MAX_KVS_VAL_LENGTH, fp), pod_name);
     pclose(fp);
-    while (pod_name[strlen(pod_name) - 1] == '\n' || pod_name[strlen(pod_name) - 1] == ' ')
-        pod_name[strlen(pod_name) - 1] = '\0';
+    int str_len = strlen(pod_name) - 1; // index of the last character, -1 for an empty string
+    CHECK_STR(str_len >= 0, "hostname");
+    while (pod_name[str_len] == '\n' || pod_name[str_len] == ' ') {
+        pod_name[str_len] = '\0';
+        str_len--;
+        CHECK_STR(str_len >= 0, "hostname"); // fail only when nothing but whitespace was read
+    }
     if (kube_api_addr == NULL) {
-        printf("%s not set\n", CCL_K8S_API_ADDR_ENV);
-        return 1;
+        LOG_ERROR("%s not set\n", CCL_K8S_API_ADDR_ENV);
+        return KVS_STATUS_FAILURE;
     }
 
     SET_STR(connect_api_template, RUN_TEMPLATE_SIZE, ADDR_STR_V1_TEMPLATE, kube_api_addr);
@@ -156,10 +179,10 @@ size_t k8s_init_with_manager() {
 
     memset(kind_type, NULL_CHAR, MAX_KVS_NAME_LENGTH);
     if ((fp_name = popen(run_str, READ_ONLY)) == NULL) {
-        printf("Can't get kind_type\n");
-        exit(1);
+        LOG_ERROR("Can't get kind_type\n");
+        return KVS_STATUS_FAILURE;
     }
-    json_get_val(fp_name, kind_type_key, 3, kind_type);
+    KVS_CHECK_STATUS(json_get_val(fp_name, kind_type_key, 3, kind_type), "failed to get type");
 
     /*we must use the plural to access to statefulset/deployment KVS*/
     kind_type_size = strlen(kind_type);
@@ -170,10 +193,10 @@ size_t k8s_init_with_manager() {
 
     memset(kind_name, NULL_CHAR, MAX_KVS_NAME_LENGTH);
     if ((fp_type = popen(run_str, READ_ONLY)) == NULL) {
-        printf("Can't get kind_name\n");
-        exit(1);
+        LOG_ERROR("Can't get kind_name\n");
+        return KVS_STATUS_FAILURE;
     }
-    json_get_val(fp_type, kind_name_key, 3, kind_name);
+    KVS_CHECK_STATUS(json_get_val(fp_type, kind_name_key, 3, kind_name), "filed to get name");
 
     SET_STR(kind_path, MAX_KVS_NAME_LENGTH, "%s/%s", kind_type, kind_name);
     SET_STR(connect_api_template, RUN_TEMPLATE_SIZE, ADDR_STR_V2_TEMPLATE, kube_api_addr);
@@ -193,10 +216,10 @@ size_t k8s_init_with_manager() {
 
     pclose(fp_name);
     pclose(fp_type);
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-void get_my_job_name(const char* connect_api_template) {
+kvs_status_t get_my_job_name(const char* connect_api_template) {
     FILE* fp;
     char run_str[RUN_REQUEST_SIZE];
     char grep_kvs_name_key[REQUEST_POSTFIX_SIZE];
@@ -204,13 +227,18 @@ void get_my_job_name(const char* connect_api_template) {
     char pod_name[MAX_KVS_VAL_LENGTH];
     memset(pod_name, '\0', MAX_KVS_VAL_LENGTH);
     if ((fp = popen("hostname", READ_ONLY)) == NULL) {
-        printf("Can't get hostname\n");
-        exit(1);
+        LOG_ERROR("Can't get hostname\n");
+        return KVS_STATUS_FAILURE;
     }
     CHECK_FGETS(fgets(pod_name, MAX_KVS_VAL_LENGTH, fp), pod_name);
     pclose(fp);
-    while (pod_name[strlen(pod_name) - 1] == '\n' || pod_name[strlen(pod_name) - 1] == ' ')
-        pod_name[strlen(pod_name) - 1] = '\0';
+    int str_len = strlen(pod_name) - 1; // index of the last character, -1 for an empty string
+    CHECK_STR(str_len >= 0, "hostname");
+    while (pod_name[str_len] == '\n' || pod_name[str_len] == ' ') {
+        pod_name[str_len] = '\0';
+        str_len--;
+        CHECK_STR(str_len >= 0, "hostname"); // fail only when nothing but whitespace was read
+    }
 
     SET_STR(grep_kvs_name_key, REQUEST_POSTFIX_SIZE, GREP_TEMPLATE, JOB_NAME);
     SET_STR(
@@ -224,8 +252,8 @@ void get_my_job_name(const char* connect_api_template) {
             get_kvs_val);
 
     if ((fp = popen(run_str, READ_ONLY)) == NULL) {
-        printf("Can't get %s", strerror(errno));
-        exit(1);
+        LOG_ERROR("Can't get %s", strerror(errno));
+        return KVS_STATUS_FAILURE;
     }
     CHECK_FGETS(fgets(job_name, MAX_KVS_NAME_LENGTH, fp), job_name);
     pclose(fp);
@@ -236,26 +264,32 @@ void get_my_job_name(const char* connect_api_template) {
     else {
         job_name[strlen(job_name) - 1] = '_';
     }
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t k8s_init_without_manager() {
+kvs_status_t k8s_init_without_manager() {
     FILE* fp;
     char* kube_api_addr = getenv(CCL_K8S_API_ADDR_ENV);
     char connect_api_template[RUN_TEMPLATE_SIZE];
     char pod_name[MAX_KVS_VAL_LENGTH];
     memset(pod_name, '\0', MAX_KVS_VAL_LENGTH);
     if ((fp = popen("hostname", READ_ONLY)) == NULL) {
-        printf("Can't get hostname\n");
-        exit(1);
+        LOG_ERROR("Can't get hostname\n");
+        return KVS_STATUS_FAILURE;
     }
     CHECK_FGETS(fgets(pod_name, MAX_KVS_VAL_LENGTH, fp), pod_name);
     pclose(fp);
-    while (pod_name[strlen(pod_name) - 1] == '\n' || pod_name[strlen(pod_name) - 1] == ' ')
-        pod_name[strlen(pod_name) - 1] = '\0';
+    int str_len = strlen(pod_name) - 1; // index of the last character, -1 for an empty string
+    CHECK_STR(str_len >= 0, "hostname");
+    while (pod_name[str_len] == '\n' || pod_name[str_len] == ' ') {
+        pod_name[str_len] = '\0';
+        str_len--;
+        CHECK_STR(str_len >= 0, "hostname"); // fail only when nothing but whitespace was read
+    }
 
     if (kube_api_addr == NULL) {
-        printf("%s not set\n", CCL_K8S_API_ADDR_ENV);
-        return 1;
+        LOG_ERROR("%s not set\n", CCL_K8S_API_ADDR_ENV);
+        return KVS_STATUS_FAILURE;
     }
 
     SET_STR(connect_api_template, RUN_TEMPLATE_SIZE, ADDR_STR_V1_TEMPLATE, kube_api_addr);
@@ -272,13 +306,12 @@ size_t k8s_init_without_manager() {
             pod_name,
             "%s");
 
-    get_my_job_name(connect_api_template);
+    KVS_CHECK_STATUS(get_my_job_name(connect_api_template), "failed to get job name");
 
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t request_k8s_kvs_init() {
-    size_t res = 1;
+kvs_status_t request_k8s_kvs_init() {
     char* manager_type_env = getenv(CCL_K8S_MANAGER_TYPE_ENV);
 
     if (!manager_type_env || strstr(manager_type_env, "none")) {
@@ -288,7 +321,7 @@ size_t request_k8s_kvs_init() {
         manager = MT_K8S;
     }
     else {
-        printf(
+        LOG_WARN(
             "Unknown %s = %s, running with \"none\"\n", CCL_K8S_MANAGER_TYPE_ENV, manager_type_env);
         manager = MT_NONE;
     }
@@ -296,8 +329,11 @@ size_t request_k8s_kvs_init() {
     memset(job_name, NULL_CHAR, MAX_KVS_NAME_LENGTH);
 
     switch (manager) {
-        case MT_NONE: res = k8s_init_without_manager(); break;
-        case MT_K8S: res = k8s_init_with_manager(); break;
+        case MT_NONE:
+            KVS_CHECK_STATUS(k8s_init_without_manager(), "failed to initialize k8z");
+            break;
+        case MT_K8S: KVS_CHECK_STATUS(k8s_init_with_manager(), "failed to initialize k8z"); break;
+        default: LOG_ERROR("unknown k8s manager"); return KVS_STATUS_FAILURE;
     }
 
     memset(ccl_kvs_ip, NULL_CHAR, MAX_KVS_NAME_LENGTH);
@@ -310,35 +346,51 @@ size_t request_k8s_kvs_init() {
     SET_STR(req_kvs_ip, MAX_KVS_NAME_LENGTH, KVS_NAME_TEMPLATE_S, job_name, REQ_KVS_IP);
     SET_STR(master_addr, MAX_KVS_NAME_LENGTH, KVS_NAME_TEMPLATE_S, job_name, MASTER_ADDR);
 
-    return res;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t request_k8s_kvs_get_master(const char* local_host_ip, char* main_host_ip, char* port_str) {
+kvs_status_t request_k8s_kvs_get_master(const char* local_host_ip,
+                                        char* main_host_ip,
+                                        char* port_str) {
     char** kvs_values = NULL;
     char** kvs_keys = NULL;
     int values_count = 0;
 
-    request_k8s_set_val(ccl_kvs_ip, my_hostname, local_host_ip);
-    request_k8s_set_val(ccl_kvs_port, my_hostname, port_str);
-
-    if (!request_k8s_get_count_names(master_addr)) {
-        values_count = request_k8s_get_keys_values_by_name(ccl_kvs_ip, &kvs_keys, &kvs_values);
+    KVS_CHECK_STATUS(request_k8s_set_val(ccl_kvs_ip, my_hostname, local_host_ip),
+                     "failed to set IP");
+    KVS_CHECK_STATUS(request_k8s_set_val(ccl_kvs_port, my_hostname, port_str),
+                     "failed to set port");
+    size_t count;
+    KVS_CHECK_STATUS(request_k8s_get_count_names(master_addr, count), "failed to get names count");
+    if (count == 0) {
+        KVS_CHECK_STATUS(
+            request_k8s_get_keys_values_by_name(ccl_kvs_ip, &kvs_keys, &kvs_values, values_count),
+            "failed to get keys");
         if (strstr(kvs_keys[0], my_hostname)) {
-            request_k8s_set_val(req_kvs_ip, my_hostname, local_host_ip);
-            while (!request_k8s_get_count_names(master_addr)) {
-                values_count =
-                    request_k8s_get_keys_values_by_name(req_kvs_ip, &kvs_keys, &kvs_values);
+            KVS_CHECK_STATUS(request_k8s_set_val(req_kvs_ip, my_hostname, local_host_ip),
+                             "failed to set IP");
+            KVS_CHECK_STATUS(request_k8s_get_count_names(master_addr, count),
+                             "failed to get names count");
+            while (count == 0) {
+                KVS_CHECK_STATUS(request_k8s_get_keys_values_by_name(
+                                     req_kvs_ip, &kvs_keys, &kvs_values, values_count),
+                                 "failed to get keys values");
                 if (values_count > 1) {
                     if (!strstr(kvs_keys[0], my_hostname)) {
                         break;
                     }
                 }
                 else {
-                    request_k8s_set_val(master_addr, KVS_IP, local_host_ip);
-                    request_k8s_set_val(master_addr, KVS_PORT, port_str);
+                    KVS_CHECK_STATUS(request_k8s_set_val(master_addr, KVS_IP, local_host_ip),
+                                     "failed to set IP");
+                    KVS_CHECK_STATUS(request_k8s_set_val(master_addr, KVS_PORT, port_str),
+                                     "failed to set port");
                 }
+                KVS_CHECK_STATUS(request_k8s_get_count_names(master_addr, count),
+                                 "failed to get names count");
             }
-            request_k8s_remove_name_key(req_kvs_ip, my_hostname);
+            KVS_CHECK_STATUS(request_k8s_remove_name_key(req_kvs_ip, my_hostname),
+                             "failed to remove host info");
         }
         if (kvs_keys != NULL) {
             for (int i = 0; i < values_count; i++) {
@@ -353,29 +405,36 @@ size_t request_k8s_kvs_get_master(const char* local_host_ip, char* main_host_ip,
             free(kvs_values);
         }
     }
-    while (!request_k8s_get_count_names(master_addr)) {
+    do {
+        KVS_CHECK_STATUS(request_k8s_get_count_names(master_addr, count),
+                         "failed to get names count");
         sleep(1);
-    }
-    request_k8s_get_val_by_name_key(master_addr, KVS_IP, main_host_ip);
-    request_k8s_get_val_by_name_key(master_addr, KVS_PORT, port_str);
-    return 0;
+    } while (count == 0);
+    KVS_CHECK_STATUS(request_k8s_get_val_by_name_key(master_addr, KVS_IP, main_host_ip),
+                     "failed to get IP");
+    KVS_CHECK_STATUS(request_k8s_get_val_by_name_key(master_addr, KVS_PORT, port_str),
+                     "failed to get port");
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t request_k8s_kvs_finalize(size_t is_master) {
-    request_k8s_remove_name_key(ccl_kvs_ip, my_hostname);
-    request_k8s_remove_name_key(ccl_kvs_port, my_hostname);
+kvs_status_t request_k8s_kvs_finalize(size_t is_master) {
+    KVS_CHECK_STATUS(request_k8s_remove_name_key(ccl_kvs_ip, my_hostname), "failed to remove IP");
+    KVS_CHECK_STATUS(request_k8s_remove_name_key(ccl_kvs_port, my_hostname),
+                     "failed to remove port");
     if (is_master) {
-        request_k8s_remove_name_key(master_addr, KVS_IP);
-        request_k8s_remove_name_key(master_addr, KVS_PORT);
+        KVS_CHECK_STATUS(request_k8s_remove_name_key(master_addr, KVS_IP),
+                         "failed to remove master IP");
+        KVS_CHECK_STATUS(request_k8s_remove_name_key(master_addr, KVS_PORT),
+                         "failed to remove master port");
     }
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t get_by_template(char*** kvs_entry,
-                       const char* request,
-                       const char* template_str,
-                       int count,
-                       int max_count) {
+kvs_status_t get_by_template(char*** kvs_entry,
+                             const char* request,
+                             const char* template_str,
+                             int count,
+                             int max_count) {
     FILE* fp;
     char get_val[REQUEST_POSTFIX_SIZE];
     char run_str[RUN_REQUEST_SIZE];
@@ -386,14 +445,14 @@ size_t get_by_template(char*** kvs_entry,
 
     *kvs_entry = (char**)malloc(sizeof(char*) * count);
     if (*kvs_entry == NULL) {
-        printf("Memory allocation failed\n");
-        exit(1);
+        LOG_ERROR("Memory allocation failed\n");
+        return KVS_STATUS_FAILURE;
     }
     for (i = 0; i < count; i++) {
         (*kvs_entry)[i] = (char*)malloc(sizeof(char) * max_count);
         if ((*kvs_entry)[i] == NULL) {
-            printf("Memory allocation failed\n");
-            exit(1);
+            LOG_ERROR("Memory allocation failed\n");
+            return KVS_STATUS_FAILURE;
         }
     }
 
@@ -402,8 +461,8 @@ size_t get_by_template(char*** kvs_entry,
     SET_STR(get_val, REQUEST_POSTFIX_SIZE, CONCAT_TWO_COMMAND_TEMPLATE, request, template_str);
     SET_STR(run_str, RUN_REQUEST_SIZE, run_get_template, get_val);
     if ((fp = popen(run_str, READ_ONLY)) == NULL) {
-        printf("Can't get by template\n");
-        exit(1);
+        LOG_ERROR("Can't get by template\n");
+        return KVS_STATUS_FAILURE;
     }
     while ((fgets((*kvs_entry)[i], max_count, fp) != NULL) && (i < count)) {
         while ((*kvs_entry)[i][strlen((*kvs_entry)[i]) - 1] == '\n' ||
@@ -412,18 +471,19 @@ size_t get_by_template(char*** kvs_entry,
         i++;
     }
     pclose(fp);
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t request_k8s_get_keys_values_by_name(const char* kvs_name,
-                                           char*** kvs_keys,
-                                           char*** kvs_values) {
+kvs_status_t request_k8s_get_keys_values_by_name(const char* kvs_name,
+                                                 char*** kvs_keys,
+                                                 char*** kvs_values,
+                                                 int& values_count) {
     FILE* fp;
     char run_str[RUN_REQUEST_SIZE];
     char grep_name_str[REQUEST_POSTFIX_SIZE];
     char get_name_count[REQUEST_POSTFIX_SIZE];
     char values_count_str[INT_STR_SIZE];
-    size_t values_count;
+    values_count = 0;
 
     SET_STR(get_name_count, REQUEST_POSTFIX_SIZE, GREP_COUNT_TEMPLATE, kvs_name);
 
@@ -431,30 +491,36 @@ size_t request_k8s_get_keys_values_by_name(const char* kvs_name,
     SET_STR(run_str, RUN_REQUEST_SIZE, run_get_template, get_name_count);
 
     if ((fp = popen(run_str, READ_ONLY)) == NULL) {
-        printf("Can't get keys-values by name: %s\n", kvs_name);
-        exit(1);
+        LOG_ERROR("Can't get keys-values by name: %s\n", kvs_name);
+        return KVS_STATUS_FAILURE;
     }
     CHECK_FGETS(fgets(values_count_str, INT_STR_SIZE, fp), values_count_str);
     pclose(fp);
 
-    if ((values_count = safe_strtol(values_count_str, NULL, 10)) == 0)
-        return 0;
+    KVS_CHECK_STATUS(safe_strtol(values_count_str, values_count), "failed to convert count");
+    if (values_count == 0)
+        return KVS_STATUS_SUCCESS;
 
     SET_STR(grep_name_str, REQUEST_POSTFIX_SIZE, GREP_TEMPLATE, kvs_name);
     if (kvs_values != NULL) {
-        get_by_template(kvs_values, grep_name_str, GET_VAL, values_count, MAX_KVS_VAL_LENGTH);
+        KVS_CHECK_STATUS(
+            get_by_template(kvs_values, grep_name_str, GET_VAL, values_count, MAX_KVS_VAL_LENGTH),
+            "failed to get val");
     }
     if (kvs_keys != NULL) {
-        get_by_template(kvs_keys, grep_name_str, GET_KEY, values_count, MAX_KVS_KEY_LENGTH);
+        KVS_CHECK_STATUS(
+            get_by_template(kvs_keys, grep_name_str, GET_KEY, values_count, MAX_KVS_KEY_LENGTH),
+            "failed to get key");
     }
-    return values_count;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t request_k8s_get_count_names(const char* kvs_name) {
+kvs_status_t request_k8s_get_count_names(const char* kvs_name, size_t& res) {
     FILE* fp;
     char run_str[RUN_REQUEST_SIZE];
     char get_count_str[REQUEST_POSTFIX_SIZE];
     char count_names[INT_STR_SIZE];
+    res = 0;
 
     SET_STR(get_count_str, REQUEST_POSTFIX_SIZE, GREP_COUNT_TEMPLATE, kvs_name);
 
@@ -462,16 +528,19 @@ size_t request_k8s_get_count_names(const char* kvs_name) {
     SET_STR(run_str, RUN_REQUEST_SIZE, run_get_template, get_count_str);
 
     if ((fp = popen(run_str, READ_ONLY)) == NULL) {
-        printf("Can't get names count: %s\n", kvs_name);
-        exit(1);
+        LOG_ERROR("Can't get names count: %s\n", kvs_name);
+        return KVS_STATUS_FAILURE;
     }
     CHECK_FGETS(fgets(count_names, INT_STR_SIZE, fp), count_names);
     pclose(fp);
 
-    return safe_strtol(count_names, NULL, 10);
+    KVS_CHECK_STATUS(safe_strtol(count_names, res), "failed to convert cont names");
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t request_k8s_get_val_by_name_key(const char* kvs_name, const char* kvs_key, char* kvs_val) {
+kvs_status_t request_k8s_get_val_by_name_key(const char* kvs_name,
+                                             const char* kvs_key,
+                                             char* kvs_val) {
     FILE* fp;
     char run_str[RUN_REQUEST_SIZE];
     char grep_kvs_name_key[REQUEST_POSTFIX_SIZE];
@@ -487,16 +556,16 @@ size_t request_k8s_get_val_by_name_key(const char* kvs_name, const char* kvs_key
     SET_STR(run_str, RUN_REQUEST_SIZE, run_get_template, get_kvs_val);
 
     if ((fp = popen(run_str, READ_ONLY)) == NULL) {
-        printf("Can't get value by name-key: %s\n", kvs_name_key);
-        exit(1);
+        LOG_ERROR("Can't get value by name-key: %s\n", kvs_name_key);
+        return KVS_STATUS_FAILURE;
     }
     CHECK_FGETS(fgets(kvs_val, MAX_KVS_VAL_LENGTH, fp), kvs_val);
     pclose(fp);
     kvs_val[strlen(kvs_val) - 1] = NULL_CHAR;
-    return strlen(kvs_val);
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t request_k8s_remove_name_key(const char* kvs_name, const char* kvs_key) {
+kvs_status_t request_k8s_remove_name_key(const char* kvs_name, const char* kvs_key) {
     FILE* fp;
     char run_str[RUN_REQUEST_SIZE];
     char patch[REQUEST_POSTFIX_SIZE];
@@ -509,14 +578,14 @@ size_t request_k8s_remove_name_key(const char* kvs_name, const char* kvs_key) {
     SET_STR(run_str, RUN_REQUEST_SIZE, run_set_template, patch);
 
     if ((fp = popen(run_str, READ_ONLY)) == NULL) {
-        printf("Can't remove name-key: %s\n", kvs_name_key);
-        exit(1);
+        LOG_ERROR("Can't remove name-key: %s\n", kvs_name_key);
+        return KVS_STATUS_FAILURE;
     }
     pclose(fp);
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t request_k8s_set_val(const char* kvs_name, const char* kvs_key, const char* kvs_val) {
+kvs_status_t request_k8s_set_val(const char* kvs_name, const char* kvs_key, const char* kvs_val) {
     FILE* fp;
     char run_str[RUN_REQUEST_SIZE];
     char patch[REQUEST_POSTFIX_SIZE];
@@ -529,21 +598,21 @@ size_t request_k8s_set_val(const char* kvs_name, const char* kvs_key, const char
     SET_STR(run_str, RUN_REQUEST_SIZE, run_set_template, patch);
 
     if ((fp = popen(run_str, READ_ONLY)) == NULL) {
-        printf("Can't set name-key-val: %s-%s\n", kvs_name_key, kvs_val);
-        exit(1);
+        LOG_ERROR("Can't set name-key-val: %s-%s\n", kvs_name_key, kvs_val);
+        return KVS_STATUS_FAILURE;
     }
     pclose(fp);
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-size_t request_k8s_get_replica_size(void) {
+kvs_status_t request_k8s_get_replica_size(size_t& res) {
     FILE* fp;
     char run_str[RUN_REQUEST_SIZE];
     char replica_size_str[MAX_KVS_VAL_LENGTH];
     const char* replica_keys[] = { "spec", "replicas" };
 
     switch (manager) {
-        case MT_NONE: return request_k8s_get_count_names(ccl_kvs_ip);
+        case MT_NONE: return request_k8s_get_count_names(ccl_kvs_ip, res);
         case MT_K8S:
             /*get full output*/
             SET_STR(run_str, RUN_REQUEST_SIZE, run_get_template, "");
@@ -552,9 +621,12 @@ size_t request_k8s_get_replica_size(void) {
                 printf("Can't get replica size\n");
                 exit(1);
             }
-            json_get_val(fp, replica_keys, 2, replica_size_str);
+            KVS_CHECK_STATUS(json_get_val(fp, replica_keys, 2, replica_size_str),
+                             "failed to get replica size");
             pclose(fp);
-            return safe_strtol(replica_size_str, NULL, 10);
+            KVS_CHECK_STATUS(safe_strtol(replica_size_str, res), "failed to convert replica size");
+            return KVS_STATUS_SUCCESS;
+        default: LOG_ERROR("unknown k8s manager"); return KVS_STATUS_FAILURE;
     }
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.hpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.hpp
index 86bdb7705..f9eaf6400 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.hpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.hpp
@@ -21,13 +21,15 @@ extern "C" {
 #endif
 #include <stddef.h>
 
-size_t request_k8s_kvs_init(void);
+kvs_status_t request_k8s_kvs_init(void);
 
-size_t request_k8s_kvs_get_master(const char* local_host_ip, char* main_host_ip, char* port_str);
+kvs_status_t request_k8s_kvs_get_master(const char* local_host_ip,
+                                        char* main_host_ip,
+                                        char* port_str);
 
-size_t request_k8s_kvs_finalize(size_t is_master);
+kvs_status_t request_k8s_kvs_finalize(size_t is_master);
 
-size_t request_k8s_get_replica_size(void);
+kvs_status_t request_k8s_get_replica_size(size_t& res);
 
 #ifdef __cplusplus
 }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp
index 085bf0e8d..e5266535d 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp
@@ -27,21 +27,31 @@ char my_hostname[MAX_KVS_VAL_LENGTH];
 // TODO: rework it for multi kvs
 static pmi_resizable* pmi_object;
 
-void Call_Hard_finilize(int sig) {
-    pmi_object->Hard_finilize(sig);
+void call_hard_finalize(int sig) {
+    if (pmi_object->hard_finalize(sig) != KVS_STATUS_SUCCESS) {
+        LOG_ERROR("failed to hard finalize");
+    }
 }
 
 kvs_resize_action_t pmi_resizable::default_checker(int comm_size) {
     char* comm_size_to_start_env;
-    int comm_size_to_start;
+    size_t comm_size_to_start;
 
     comm_size_to_start_env = getenv(CCL_WORLD_SIZE_ENV);
 
-    if (comm_size_to_start_env != NULL)
-        comm_size_to_start = safe_strtol(comm_size_to_start_env, NULL, 10);
-    else
-        comm_size_to_start = h->get_replica_size();
-    if (comm_size >= comm_size_to_start)
+    if (comm_size_to_start_env != NULL) {
+        if (safe_strtol(comm_size_to_start_env, comm_size_to_start) != KVS_STATUS_SUCCESS) {
+            LOG_ERROR("failed to convert comm_size");
+            return KVS_RA_FINALIZE;
+        }
+    }
+    else {
+        if (h->get_replica_size(comm_size_to_start) != KVS_STATUS_SUCCESS) {
+            LOG_ERROR("failed to get comm_size");
+            return KVS_RA_FINALIZE;
+        }
+    }
+    if (comm_size >= static_cast<int>(comm_size_to_start))
         return KVS_RA_RUN;
 
     return KVS_RA_WAIT;
@@ -54,27 +64,29 @@ kvs_resize_action_t pmi_resizable::call_resize_fn(int comm_size) {
     return default_checker(comm_size);
 }
 
-int pmi_resizable::PMIR_Update(void) {
+kvs_status_t pmi_resizable::PMIR_Update(void) {
     char up_idx_str[MAX_KVS_VAL_LENGTH];
     int prev_new_ranks_count = 0;
     int prev_killed_ranks_count = 0;
     int prev_idx = -1;
     kvs_resize_action_t answer;
-    rank_list_t* dead_up_idx = NULL;
-    shift_list_t* list = NULL;
+    std::list<int> dead_up_idx{};
+    std::list<shift_rank_t> list{};
 
     new_ranks_count = 0;
     killed_ranks_count = 0;
     if (finalized == 1) {
-        return 1;
+        LOG_ERROR("is finalized");
+        return KVS_STATUS_FAILURE;
     }
     if (applied == 1) {
         size_t is_wait = 1;
         size_t is_first_collect = 0;
 
-        h->get_value_by_name_key(KVS_UP, KVS_IDX, up_idx_str);
+        KVS_CHECK_STATUS(h->get_value_by_name_key(KVS_UP, KVS_IDX, up_idx_str),
+                         "failed to get KVS IDx");
 
-        up_idx = safe_strtol(up_idx_str, NULL, 10);
+        KVS_CHECK_STATUS(safe_strtol(up_idx_str, up_idx), "failed to convert KVS IDx");
         if (up_idx == 0)
             is_first_collect = 1;
 
@@ -84,9 +96,10 @@ int pmi_resizable::PMIR_Update(void) {
             do {
                 /*Waiting new pods*/
                 usleep(10000);
-                h->get_value_by_name_key(KVS_UP, KVS_IDX, up_idx_str);
+                KVS_CHECK_STATUS(h->get_value_by_name_key(KVS_UP, KVS_IDX, up_idx_str),
+                                 "failed to get KVS IDx");
 
-                up_idx = safe_strtol(up_idx_str, NULL, 10);
+                KVS_CHECK_STATUS(safe_strtol(up_idx_str, up_idx), "failed to convert KVS IDx");
                 if (prev_idx == (int)up_idx) {
                     count_clean_checks = 0;
 
@@ -101,7 +114,7 @@ int pmi_resizable::PMIR_Update(void) {
                     //                    while (int_list_is_contained(killed_ranks, root_rank) == 1)
                     {
                         int old_root = root_rank;
-                        h->get_new_root(&root_rank);
+                        KVS_CHECK_STATUS(h->get_new_root(&root_rank), "failed to new root rank");
 
                         if (my_rank == root_rank && old_root != root_rank)
                             is_new_root = 1;
@@ -114,27 +127,29 @@ int pmi_resizable::PMIR_Update(void) {
                     prev_new_ranks_count = new_ranks_count;
                     prev_killed_ranks_count = killed_ranks_count;
 
-                    h->get_update_ranks();
+                    KVS_CHECK_STATUS(h->get_update_ranks(), "failed to update ranks");
                     if (killed_ranks_count != prev_killed_ranks_count)
-                        rank_list_add(&dead_up_idx, up_idx);
+                        dead_up_idx.push_back(up_idx);
                 }
-                PMIR_Barrier();
+                KVS_CHECK_STATUS(PMIR_Barrier(), "barrier failed");
                 if (my_rank == root_rank && is_new_root == 0) {
                     up_idx++;
                     if (up_idx > 0 && up_idx > MAX_UP_IDX)
                         up_idx = 1;
 
                     SET_STR(up_idx_str, INT_STR_SIZE, SIZE_T_TEMPLATE, up_idx);
-                    h->set_value(KVS_UP, KVS_IDX, up_idx_str);
-                    h->up_kvs_new_and_dead();
+                    KVS_CHECK_STATUS(h->set_value(KVS_UP, KVS_IDX, up_idx_str),
+                                     "failed to set KVS IDx");
+                    KVS_CHECK_STATUS(h->up_kvs_new_and_dead(), "failed to update KVS");
                 }
-                PMIR_Barrier();
+                KVS_CHECK_STATUS(PMIR_Barrier(), "barrier failed");
 
                 if (finalized == 1) {
-                    rank_list_clean(&killed_ranks);
-                    rank_list_clean(&new_ranks);
-                    rank_list_clean(&dead_up_idx);
-                    return 1;
+                    killed_ranks.clear();
+                    new_ranks.clear();
+                    dead_up_idx.clear();
+                    LOG_ERROR("is finalized");
+                    return KVS_STATUS_FAILURE;
                 }
 
                 is_new_root = 0;
@@ -151,7 +166,9 @@ int pmi_resizable::PMIR_Update(void) {
             if (!is_first_collect || ask_only_framework == 1)
                 answer = call_resize_fn(count_pods - killed_ranks_count + new_ranks_count);
             else {
-                if ((int)(h->get_replica_size()) !=
+                size_t replica_size;
+                KVS_CHECK_STATUS(h->get_replica_size(replica_size), "failed to get replica size");
+                if (static_cast<int>(replica_size) !=
                     count_pods - killed_ranks_count + new_ranks_count)
                     answer = KVS_RA_WAIT;
                 else
@@ -167,60 +184,62 @@ int pmi_resizable::PMIR_Update(void) {
                     break;
                 }
                 case KVS_RA_FINALIZE: {
-                    PMIR_Finalize();
-                    return 1;
+                    KVS_CHECK_STATUS(PMIR_Finalize(), "failed to finalize");
+                    // Do not fall through into the "unknown action" branch.
+                    return KVS_STATUS_FAILURE;
                 }
                 default: {
-                    printf("Unknown resize action: %d\n", answer);
-                    PMIR_Finalize();
-                    return 1;
+                    LOG_ERROR("Unknown resize action: %d\n", answer);
+                    KVS_CHECK_STATUS(PMIR_Finalize(), "failed to finalize");
+                    return KVS_STATUS_FAILURE;
                 }
             }
             listener.set_applied_count(count_applied_changes);
         } while (is_wait == 1);
     }
     else {
-        listener.send_notification(0, h);
-        h->wait_accept();
+        KVS_CHECK_STATUS(listener.send_notification(0, h), "failed to send notification");
+        KVS_CHECK_STATUS(h->wait_accept(), "failed to wait accept");
     }
 
-    h->get_shift(&list);
+    h->get_shift(list);
     count_pods = count_pods - killed_ranks_count + new_ranks_count;
-    h->update(&list, &dead_up_idx, root_rank);
+    KVS_CHECK_STATUS(h->update(list, dead_up_idx, root_rank), "failed to update root");
 
     root_rank = 0;
 
-    PMIR_Barrier();
-    h->up_pods_count();
+    KVS_CHECK_STATUS(PMIR_Barrier(), "barrier failed");
+    KVS_CHECK_STATUS(h->up_pods_count(), "failed to update pods count");
 
-    rank_list_clean(&killed_ranks);
-    rank_list_clean(&new_ranks);
-    rank_list_clean(&dead_up_idx);
-    shift_list_clean(&list);
-    return 0;
+    killed_ranks.clear();
+    new_ranks.clear();
+    dead_up_idx.clear();
+    list.clear();
+    return KVS_STATUS_SUCCESS;
 }
 
-void pmi_resizable::Hard_finilize(int sig) {
+kvs_status_t pmi_resizable::hard_finalize(int sig) {
     char rank_str[INT_STR_SIZE];
 
     SET_STR(rank_str, INT_STR_SIZE, RANK_TEMPLATE, my_rank);
 
-    h->set_value(KVS_DEAD_POD, my_hostname, rank_str);
+    KVS_CHECK_STATUS(h->set_value(KVS_DEAD_POD, my_hostname, rank_str), "failed to set dead rank");
 
-    listener.send_notification(sig, h);
+    KVS_CHECK_STATUS(listener.send_notification(sig, h), "failed to send notification");
 
     extreme_finalize = 1;
-    PMIR_Finalize();
+    KVS_CHECK_STATUS(PMIR_Finalize(), "failed to finalize");
     if (old_act.sa_handler != NULL)
         old_act.sa_handler(sig);
+
+    return KVS_STATUS_SUCCESS;
 }
 
-int pmi_resizable::PMIR_Main_Addr_Reserve(char* main_addr) {
-    h->main_server_address_reserve(main_addr);
-    return 0;
+kvs_status_t pmi_resizable::PMIR_Main_Addr_Reserve(char* main_addr) {
+    return h->main_server_address_reserve(main_addr);
 }
 
-int pmi_resizable::PMIR_Init(const char* main_addr) {
+kvs_status_t pmi_resizable::PMIR_Init(const char* main_addr) {
     struct sigaction act;
     FILE* fp;
     finalized = 0;
@@ -240,34 +257,34 @@ int pmi_resizable::PMIR_Init(const char* main_addr) {
             "-%d",
             getpid());
 
-    if (h->init(main_addr)) {
-        return 1;
-    }
+    KVS_CHECK_STATUS(h->init(main_addr), "failed to init");
 
-    h->reg_rank();
+    KVS_CHECK_STATUS(h->reg_rank(), "failed to rank register");
 
-    h->up_pods_count();
+    KVS_CHECK_STATUS(h->up_pods_count(), "failed to update pods count");
 
     // TODO: rework it for multi kvs
     pmi_object = this;
     memset(&act, 0, sizeof(act));
-    act.sa_handler = &Call_Hard_finilize;
+    act.sa_handler = &call_hard_finalize;
     act.sa_flags = 0;
     sigaction(SIGTERM, &act, &old_act);
 
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-int pmi_resizable::PMIR_Finalize(void) {
+kvs_status_t pmi_resizable::PMIR_Finalize(void) {
     char kvs_name[MAX_KVS_NAME_LENGTH];
     char kvs_key[MAX_KVS_KEY_LENGTH];
     char kvs_val[MAX_KVS_VAL_LENGTH];
     char rank_str[INT_STR_SIZE];
-    if (finalized)
-        return 0;
+    if (finalized) {
+        return KVS_STATUS_SUCCESS;
+    }
 
-    if (my_rank == 0)
-        PMIR_Barrier();
+    if (my_rank == 0) {
+        KVS_CHECK_STATUS(PMIR_Barrier(), "barrier failed");
+    }
 
     finalized = 1;
 
@@ -275,101 +292,106 @@ int pmi_resizable::PMIR_Finalize(void) {
 
     SET_STR(rank_str, INT_STR_SIZE, RANK_TEMPLATE, my_rank);
 
-    h->remove_name_key(KVS_POD_NUM, rank_str);
+    KVS_CHECK_STATUS(h->remove_name_key(KVS_POD_NUM, rank_str), "failed to remove rank");
 
     while (cut_head(kvs_name, kvs_key, kvs_val, ST_CLIENT)) {
-        h->remove_name_key(kvs_name, kvs_key);
+        KVS_CHECK_STATUS(h->remove_name_key(kvs_name, kvs_key), "failed to remove info");
     }
 
     if (my_rank == 0 && extreme_finalize != 1) {
-        h->remove_name_key(KVS_UP, KVS_IDX);
+        KVS_CHECK_STATUS(h->remove_name_key(KVS_UP, KVS_IDX), "failed to remove IDx");
     }
-    h->remove_name_key(KVS_BARRIER, my_hostname);
+    KVS_CHECK_STATUS(h->remove_name_key(KVS_BARRIER, my_hostname), "failed to remove barrier info");
 
-    h->finalize();
+    KVS_CHECK_STATUS(h->finalize(), "failed to finalize");
 
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-int pmi_resizable::PMIR_Barrier(void) {
+kvs_status_t pmi_resizable::PMIR_Barrier(void) {
     size_t min_barrier_num;
     char barrier_num_str[INT_STR_SIZE];
 
     if (finalized)
-        return 0;
+        return KVS_STATUS_SUCCESS;
 
     SET_STR(barrier_num_str, INT_STR_SIZE, SIZE_T_TEMPLATE, barrier_num);
 
-    h->set_value(KVS_BARRIER, my_hostname, barrier_num_str);
+    KVS_CHECK_STATUS(h->set_value(KVS_BARRIER, my_hostname, barrier_num_str),
+                     "failed to set barrier info");
 
-    min_barrier_num = h->get_barrier_idx();
+    KVS_CHECK_STATUS(h->get_barrier_idx(min_barrier_num), "failed to get barrier IDx");
     while (min_barrier_num != barrier_num && finalized != 1) {
-        min_barrier_num = h->get_barrier_idx();
+        KVS_CHECK_STATUS(h->get_barrier_idx(min_barrier_num), "failed to get barrier IDx");
     }
 
     barrier_num++;
     if (barrier_num > BARRIER_NUM_MAX)
         barrier_num = 0;
 
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-int pmi_resizable::PMIR_Get_size(int* size) {
+kvs_status_t pmi_resizable::PMIR_Get_size(int* size) {
     *size = count_pods;
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-int pmi_resizable::PMIR_Get_rank(int* rank) {
+kvs_status_t pmi_resizable::PMIR_Get_rank(int* rank) {
     *rank = my_rank;
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-int pmi_resizable::PMIR_KVS_Get_my_name(char* kvs_name, size_t length) {
+kvs_status_t pmi_resizable::PMIR_KVS_Get_my_name(char* kvs_name, size_t length) {
     kvs_str_copy(kvs_name, KVS_NAME, length);
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-int pmi_resizable::PMIR_KVS_Get_name_length_max(size_t* length) {
+kvs_status_t pmi_resizable::PMIR_KVS_Get_name_length_max(size_t* length) {
     *length = MAX_KVS_NAME_LENGTH;
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-int pmi_resizable::PMIR_KVS_Get_key_length_max(size_t* length) {
+kvs_status_t pmi_resizable::PMIR_KVS_Get_key_length_max(size_t* length) {
     *length = MAX_KVS_KEY_LENGTH;
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-int pmi_resizable::PMIR_KVS_Get_value_length_max(size_t* length) {
+kvs_status_t pmi_resizable::PMIR_KVS_Get_value_length_max(size_t* length) {
     *length = MAX_KVS_VAL_LENGTH;
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-int pmi_resizable::PMIR_KVS_Commit(const char* kvs_name) {
+kvs_status_t pmi_resizable::PMIR_KVS_Commit(const char* kvs_name) {
     (void)kvs_name;
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-int pmi_resizable::PMIR_KVS_Put(const char* kvs_name, const char* key, const char* value) {
+kvs_status_t pmi_resizable::PMIR_KVS_Put(const char* kvs_name, const char* key, const char* value) {
     put_key(kvs_name, key, value, ST_CLIENT);
 
-    h->set_value(kvs_name, key, value);
-    return 0;
+    KVS_CHECK_STATUS(h->set_value(kvs_name, key, value), "failed to set value");
+    return KVS_STATUS_SUCCESS;
 }
 
-int pmi_resizable::PMIR_KVS_Get(const char* kvs_name, const char* key, char* value, size_t length) {
+kvs_status_t pmi_resizable::PMIR_KVS_Get(const char* kvs_name,
+                                         const char* key,
+                                         char* value,
+                                         size_t length) {
     (void)length;
-    while (h->get_value_by_name_key(kvs_name, key, value) == 0) {
-    }
+    do {
+        KVS_CHECK_STATUS(h->get_value_by_name_key(kvs_name, key, value), "failed to get value");
+    } while (strlen(value) == 0);
 
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-int pmi_resizable::PMIR_set_resize_function(pmir_resize_fn_t resize_fn) {
+kvs_status_t pmi_resizable::PMIR_set_resize_function(pmir_resize_fn_t resize_fn) {
     resize_function = resize_fn;
-    return 0;
+    return KVS_STATUS_SUCCESS;
 }
 
-int pmi_resizable::PMIR_Wait_notification(void) {
+kvs_status_t pmi_resizable::PMIR_Wait_notification(void) {
     return listener.run_listener(h);
 }
 
@@ -384,11 +406,15 @@ int pmi_resizable::get_size() {
 size_t pmi_resizable::get_local_thread_idx() {
     return 0;
 }
-size_t pmi_resizable::get_local_kvs_id() {
-    return 0;
+atl_status_t pmi_resizable::get_local_kvs_id(size_t& res) {
+    res = 0;
+    return ATL_STATUS_SUCCESS;
+}
+atl_status_t pmi_resizable::set_local_kvs_id(size_t local_kvs_id) {
+    return ATL_STATUS_SUCCESS;
 }
-void pmi_resizable::set_local_kvs_id(size_t local_kvs_id) {}
 pmi_resizable::~pmi_resizable() {
-    if (!is_finalized)
-        pmrt_finalize();
+    if (!is_finalized) {
+        CCL_THROW_IF_NOT(pmrt_finalize() == ATL_STATUS_SUCCESS, "pmi finalize failed");
+    }
 }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.h
deleted file mode 100644
index b9ef14a62..000000000
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#ifndef PMIR_H_INCLUDED
-#define PMIR_H_INCLUDED
-
-#include <stdlib.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-#define PMIR_API __attribute__((visibility("default")))
-
-#define PMIR_SUCCESS                0
-#define PMIR_FAIL                   -1
-#define PMIR_ERR_INIT               1
-#define PMIR_ERR_NOMEM              2
-#define PMIR_ERR_INVALID_ARG        3
-#define PMIR_ERR_INVALID_KEY        4
-#define PMIR_ERR_INVALID_KEY_LENGTH 5
-#define PMIR_ERR_INVALID_VAL        6
-#define PMIR_ERR_INVALID_VAL_LENGTH 7
-#define PMIR_ERR_INVALID_LENGTH     8
-#define PMIR_ERR_INVALID_NUM_ARGS   9
-#define PMIR_ERR_INVALID_ARGS       10
-#define PMIR_ERR_INVALID_NUM_PARSED 11
-#define PMIR_ERR_INVALID_KEYVALP    12
-#define PMIR_ERR_INVALID_SIZE       13
-
-typedef enum {
-    KVS_RA_WAIT = 0,
-    KVS_RA_RUN = 1,
-    KVS_RA_FINALIZE = 2,
-} kvs_resize_action_t;
-typedef kvs_resize_action_t (*pmir_resize_fn_t)(int comm_size);
-
-int PMIR_API PMIR_Main_Addr_Reserve(char* main_addr);
-
-int PMIR_API PMIR_Init(const char* main_addr);
-
-int PMIR_API PMIR_Finalize(void);
-
-int PMIR_API PMIR_Get_size(int* size);
-
-int PMIR_API PMIR_Get_rank(int* rank);
-
-int PMIR_API PMIR_KVS_Get_my_name(char* kvs_name, size_t length);
-
-int PMIR_API PMIR_KVS_Get_name_length_max(size_t* length);
-
-int PMIR_API PMIR_Barrier(void);
-
-int PMIR_API PMIR_Update(void);
-
-int PMIR_API PMIR_KVS_Get_key_length_max(size_t* length);
-
-int PMIR_API PMIR_KVS_Get_value_length_max(size_t* length);
-
-int PMIR_API PMIR_KVS_Put(const char* kvs_name, const char* key, const char* value);
-
-int PMIR_API PMIR_KVS_Commit(const char* kvs_name);
-
-int PMIR_API PMIR_KVS_Get(const char* kvs_name, const char* key, char* value, size_t length);
-
-int PMIR_API PMIR_set_resize_function(pmir_resize_fn_t resize_fn);
-
-int PMIR_API PMIR_Wait_notification(void);
-
-#ifdef __cplusplus
-}
-#endif
-#endif
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.cpp
deleted file mode 100644
index 30fe48722..000000000
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "shift_list.hpp"
-
-void shift_list_clean(shift_list_t** list) {
-    shift_list_t* cur_list = (*list);
-    shift_list_t* node_to_remove;
-    while (cur_list != NULL) {
-        node_to_remove = cur_list;
-        cur_list = cur_list->next;
-        free(node_to_remove);
-    }
-    (*list) = NULL;
-}
-
-void shift_list_add(shift_list_t** list, int old_rank, int new_rank, change_type_t type) {
-    shift_list_t* cur_list;
-    if ((*list) == NULL) {
-        (*list) = (shift_list_t*)malloc(sizeof(shift_list_t));
-        if ((*list) == NULL) {
-            printf("Memory allocation failed\n");
-            return;
-        }
-        cur_list = (*list);
-    }
-    else {
-        cur_list = (*list);
-        while (cur_list->next != NULL)
-            cur_list = cur_list->next;
-        cur_list->next = (shift_list_t*)malloc(sizeof(shift_list_t));
-        cur_list = cur_list->next;
-    }
-    cur_list->shift.old_rank = old_rank;
-    cur_list->shift.new_rank = new_rank;
-    cur_list->shift.type = type;
-    cur_list->next = NULL;
-}
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.hpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.hpp
index 59f3d6a5e..876cc1e06 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.hpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.hpp
@@ -13,12 +13,7 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#ifndef SHIFT_LIST_H_INCLUDED
-#define SHIFT_LIST_H_INCLUDED
-
-#ifdef __cplusplus
-extern "C" {
-#endif
+#pragma once
 typedef enum change_type {
     CH_T_SHIFT = 0,
     CH_T_DEAD = 1,
@@ -31,17 +26,3 @@ typedef struct shift_rank {
     int new_rank;
     change_type_t type;
 } shift_rank_t;
-
-typedef struct shift_list {
-    shift_rank_t shift;
-    struct shift_list* next;
-} shift_list_t;
-
-void shift_list_clean(shift_list_t** list);
-
-void shift_list_add(shift_list_t** list, int old_rank, int new_rank, change_type_t type);
-
-#ifdef __cplusplus
-}
-#endif
-#endif
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_rt.c b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_rt.c
deleted file mode 100644
index 4cfc14dc3..000000000
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_rt.c
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "pm_rt_codec.h"
-
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-
-#include "pmi_resizable/resizable_pmi.h"
-
-#include "pm_rt.h"
-
-#define RESIZABLE_PMI_RT_KEY_FORMAT "%s-%d"
-
-typedef struct resizable_pm_rt_context {
-    pm_rt_desc_t pmrt_desc;
-    struct {
-        size_t initialized;
-        size_t ref_cnt;
-        size_t max_keylen;
-        size_t max_vallen;
-        char *key_storage;
-        char *val_storage;
-        char *kvsname;
-    } resizablert_main;
-} resizable_pm_context_t;
-
-/* Ensures that this is allocated/initialized only once per process */
-static resizable_pm_context_t resizable_ctx_singleton;
-
-static void resizable_pmirt_finalize(pm_rt_desc_t *pmrt_desc) {
-    resizable_pm_context_t *ctx = container_of(pmrt_desc, resizable_pm_context_t, pmrt_desc);
-    if (!ctx->resizablert_main.initialized)
-        return;
-
-    if (--ctx->resizablert_main.ref_cnt)
-        return;
-
-    free(ctx->resizablert_main.kvsname);
-    free(ctx->resizablert_main.key_storage);
-    free(ctx->resizablert_main.val_storage);
-
-    PMIR_Finalize();
-
-    memset(ctx, 0, sizeof(*ctx));
-}
-
-static void resizable_pmirt_barrier(pm_rt_desc_t *pmrt_desc) {
-    resizable_pm_context_t *ctx = container_of(pmrt_desc, resizable_pm_context_t, pmrt_desc);
-
-    if (!ctx->resizablert_main.initialized)
-        return;
-
-    PMIR_Barrier();
-}
-
-static atl_status_t resizable_pmirt_kvs_put(pm_rt_desc_t *pmrt_desc,
-                                            char *kvs_key,
-                                            int proc_idx,
-                                            const void *kvs_val,
-                                            size_t kvs_val_len) {
-    int ret;
-    resizable_pm_context_t *ctx = container_of(pmrt_desc, resizable_pm_context_t, pmrt_desc);
-
-    if (!ctx->resizablert_main.initialized)
-        return ATL_STATUS_FAILURE;
-
-    if (kvs_val_len > ctx->resizablert_main.max_vallen)
-        return ATL_STATUS_FAILURE;
-
-    ret = snprintf(ctx->resizablert_main.key_storage,
-                   ctx->resizablert_main.max_keylen - 1,
-                   RESIZABLE_PMI_RT_KEY_FORMAT,
-                   kvs_key,
-                   proc_idx);
-    if (ret < 0)
-        return ATL_STATUS_FAILURE;
-
-    ret = encode(
-        kvs_val, kvs_val_len, ctx->resizablert_main.val_storage, ctx->resizablert_main.max_vallen);
-    if (ret)
-        return ATL_STATUS_FAILURE;
-
-    ret = PMIR_KVS_Put(ctx->resizablert_main.kvsname,
-                       ctx->resizablert_main.key_storage,
-                       ctx->resizablert_main.val_storage);
-    if (ret != PMIR_SUCCESS)
-        return ATL_STATUS_FAILURE;
-
-    ret = PMIR_KVS_Commit(ctx->resizablert_main.kvsname);
-    if (ret != PMIR_SUCCESS)
-        return ATL_STATUS_FAILURE;
-
-    return ATL_STATUS_SUCCESS;
-}
-
-static atl_status_t resizable_pmirt_kvs_get(pm_rt_desc_t *pmrt_desc,
-                                            char *kvs_key,
-                                            int proc_idx,
-                                            void *kvs_val,
-                                            size_t kvs_val_len) {
-    int ret;
-    resizable_pm_context_t *ctx = container_of(pmrt_desc, resizable_pm_context_t, pmrt_desc);
-
-    if (!ctx->resizablert_main.initialized)
-        return ATL_STATUS_FAILURE;
-
-    ret = snprintf(ctx->resizablert_main.key_storage,
-                   ctx->resizablert_main.max_keylen - 1,
-                   RESIZABLE_PMI_RT_KEY_FORMAT,
-                   kvs_key,
-                   proc_idx);
-    if (ret < 0)
-        return ATL_STATUS_FAILURE;
-
-    ret = PMIR_KVS_Get(ctx->resizablert_main.kvsname,
-                       ctx->resizablert_main.key_storage,
-                       ctx->resizablert_main.val_storage,
-                       ctx->resizablert_main.max_vallen);
-    if (ret != PMIR_SUCCESS)
-        return ATL_STATUS_FAILURE;
-
-    ret = decode(ctx->resizablert_main.val_storage, kvs_val, kvs_val_len);
-    if (ret)
-        return ATL_STATUS_FAILURE;
-
-    return ATL_STATUS_SUCCESS;
-}
-
-static atl_status_t resizable_pmirt_update(int *proc_idx, int *proc_count) {
-    int ret;
-    ret = PMIR_Update();
-    if (ret != PMIR_SUCCESS)
-        goto err_resizable;
-
-    ret = PMIR_Get_size(proc_count);
-    if (ret != PMIR_SUCCESS)
-        goto err_resizable;
-
-    ret = PMIR_Get_rank(proc_idx);
-    if (ret != PMIR_SUCCESS)
-        goto err_resizable;
-
-    return ATL_STATUS_SUCCESS;
-
-err_resizable:
-    PMIR_Finalize();
-    return ATL_STATUS_FAILURE;
-}
-
-atl_status_t resizable_pmirt_wait_notification() {
-    int ret;
-
-    ret = PMIR_Wait_notification();
-
-    if (ret != PMIR_SUCCESS)
-        return ATL_STATUS_FAILURE;
-
-    return ATL_STATUS_SUCCESS;
-}
-
-pm_rt_ops_t resizable_ops = {
-    .finalize = resizable_pmirt_finalize,
-    .barrier = resizable_pmirt_barrier,
-    .update = resizable_pmirt_update,
-    .wait_notification = resizable_pmirt_wait_notification,
-};
-
-pm_rt_kvs_ops_t resizable_kvs_ops = {
-    .put = resizable_pmirt_kvs_put,
-    .get = resizable_pmirt_kvs_get,
-};
-
-atl_status_t resizable_pmirt_init(int *proc_idx,
-                                  int *proc_count,
-                                  pm_rt_desc_t **pmrt_desc,
-                                  const char *main_addr) {
-    int ret;
-    size_t max_kvsnamelen;
-
-    if (resizable_ctx_singleton.resizablert_main.initialized) {
-        PMIR_Get_size(proc_idx);
-        PMIR_Get_rank(proc_count);
-        *pmrt_desc = &resizable_ctx_singleton.pmrt_desc;
-        resizable_ctx_singleton.resizablert_main.ref_cnt++;
-        return ATL_STATUS_SUCCESS;
-    }
-
-    ret = PMIR_Init(main_addr);
-    if (ret != PMIR_SUCCESS)
-        return ATL_STATUS_FAILURE;
-
-    ret = PMIR_Update();
-    if (ret != PMIR_SUCCESS)
-        return ATL_STATUS_FAILURE;
-
-    ret = PMIR_Get_size(proc_count);
-    if (ret != PMIR_SUCCESS)
-        goto err_resizable;
-    ret = PMIR_Get_rank(proc_idx);
-    if (ret != PMIR_SUCCESS)
-        goto err_resizable;
-
-    ret = PMIR_KVS_Get_name_length_max(&max_kvsnamelen);
-    if (ret != PMIR_SUCCESS)
-        goto err_resizable;
-
-    resizable_ctx_singleton.resizablert_main.kvsname = calloc(1, max_kvsnamelen);
-    if (!resizable_ctx_singleton.resizablert_main.kvsname)
-        goto err_resizable;
-
-    ret = PMIR_KVS_Get_my_name(resizable_ctx_singleton.resizablert_main.kvsname, max_kvsnamelen);
-    if (ret != PMIR_SUCCESS)
-        goto err_alloc_key;
-
-    ret = PMIR_KVS_Get_key_length_max(&resizable_ctx_singleton.resizablert_main.max_keylen);
-    if (ret != PMIR_SUCCESS)
-        goto err_alloc_key;
-
-    resizable_ctx_singleton.resizablert_main.key_storage =
-        (char *)calloc(1, resizable_ctx_singleton.resizablert_main.max_keylen);
-    if (!resizable_ctx_singleton.resizablert_main.key_storage)
-        goto err_alloc_key;
-
-    ret = PMIR_KVS_Get_value_length_max(&resizable_ctx_singleton.resizablert_main.max_vallen);
-    if (ret != PMIR_SUCCESS)
-        goto err_alloc_val;
-
-    resizable_ctx_singleton.resizablert_main.val_storage =
-        (char *)calloc(1, resizable_ctx_singleton.resizablert_main.max_vallen);
-    if (!resizable_ctx_singleton.resizablert_main.val_storage)
-        goto err_alloc_val;
-
-    resizable_ctx_singleton.resizablert_main.initialized = 1;
-    resizable_ctx_singleton.resizablert_main.ref_cnt = 1;
-    resizable_ctx_singleton.pmrt_desc.ops = &resizable_ops;
-    resizable_ctx_singleton.pmrt_desc.kvs_ops = &resizable_kvs_ops;
-    *pmrt_desc = &resizable_ctx_singleton.pmrt_desc;
-
-    return ATL_STATUS_SUCCESS;
-err_alloc_val:
-    free(resizable_ctx_singleton.resizablert_main.key_storage);
-err_alloc_key:
-    free(resizable_ctx_singleton.resizablert_main.kvsname);
-err_resizable:
-    PMIR_Finalize();
-    return ATL_STATUS_FAILURE;
-}
-
-atl_status_t resizable_pmirt_main_addr_reserve(char *main_addr) {
-    int ret = PMIR_Main_Addr_Reserve(main_addr);
-
-    if (ret)
-        return ATL_STATUS_FAILURE;
-
-    return ATL_STATUS_SUCCESS;
-}
-
-atl_status_t resizable_pmirt_set_resize_function(atl_resize_fn_t resize_fn) {
-    int ret = PMIR_set_resize_function((pmir_resize_fn_t)resize_fn);
-
-    if (ret)
-        return ATL_STATUS_FAILURE;
-
-    return ATL_STATUS_SUCCESS;
-}
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.cpp
index 19c562854..0d8815bfd 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.cpp
@@ -35,17 +35,17 @@ pmi_resizable_simple::pmi_resizable_simple(int size,
                                            const char* main_addr)
         : total_rank_count(size),
           ranks(ranks),
-          k(k) {
+          k(k),
+          main_addr(main_addr) {
     max_keylen = MAX_KVS_KEY_LENGTH;
     max_vallen = MAX_KVS_VAL_LENGTH;
-    pmrt_init(main_addr);
 }
 
 int pmi_resizable_simple::is_pm_resize_enabled() {
     return 0;
 }
 
-atl_status_t pmi_resizable_simple::pmrt_init(const char* main_addr) {
+atl_status_t pmi_resizable_simple::pmrt_init() {
     (void)main_addr;
 
     char* kvs_get_timeout_str = getenv("CCL_KVS_GET_TIMEOUT");
@@ -55,31 +55,36 @@ atl_status_t pmi_resizable_simple::pmrt_init(const char* main_addr) {
 
     local_id = 0;
     val_storage = (char*)calloc(1, max_vallen);
-    if (!val_storage)
+    if (!val_storage) {
+        LOG_ERROR("mem alloc failed");
         return ATL_STATUS_FAILURE;
+    }
     /*TODO: add sort, ranks should increase continiusly*/
     if (ranks[0] == 0) {
-        size_t tmp_local_id = get_local_kvs_id();
+        size_t tmp_local_id;
+        ATL_CHECK_STATUS(get_local_kvs_id(tmp_local_id), "failed to get local id");
         tmp_local_id++;
-        set_local_kvs_id(tmp_local_id);
+        ATL_CHECK_STATUS(set_local_kvs_id(tmp_local_id), "failed to set local id");
     }
-    make_requested_info();
+
+    ATL_CHECK_STATUS(make_requested_info(), "failed to make requested info");
     /* extension */
     //    make_map_requested2global();
     /**/
     return ATL_STATUS_SUCCESS;
 }
 
-void pmi_resizable_simple::make_requested_info() {
-    register_first_rank_idx_and_rank_count();
-    assign_thread_idx_and_fill_ranks_per_thread_map();
+atl_status_t pmi_resizable_simple::make_requested_info() {
+    ATL_CHECK_STATUS(register_first_rank_idx_and_rank_count(), "failed to register ranks");
+    ATL_CHECK_STATUS(assign_thread_idx_and_fill_ranks_per_thread_map(), "failed to fill map");
 
-    local_id = get_local_kvs_id();
-    register_my_proc_name();
-    get_my_proc_idx_and_proc_count();
+    ATL_CHECK_STATUS(get_local_kvs_id(local_id), "failed to get local id");
+    ATL_CHECK_STATUS(register_my_proc_name(), "failed to register proc name");
+    ATL_CHECK_STATUS(get_my_proc_idx_and_proc_count(), "failed to get proc idx");
     calculate_local_thread_idx();
-    remove_initial_data();
-    pmrt_barrier_full();
+    ATL_CHECK_STATUS(remove_initial_data(), "failed to remove initial data");
+    ATL_CHECK_STATUS(pmrt_barrier_full(), "full barrier failed");
+    return ATL_STATUS_SUCCESS;
 }
 
 atl_status_t pmi_resizable_simple::pmrt_main_addr_reserve(char* main_addr) {
@@ -98,14 +103,13 @@ atl_status_t pmi_resizable_simple::pmrt_wait_notification() {
     return ATL_STATUS_UNSUPPORTED;
 }
 
-void pmi_resizable_simple::pmrt_finalize() {
+atl_status_t pmi_resizable_simple::pmrt_finalize() {
     is_finalized = true;
     free(val_storage);
 
     if (getenv("CCL_PMI_FORCE_FINALIZE")) {
-        printf("skip pmi_resizable_simple::pmrt_finalize\n");
-        fflush(stdout);
-        return;
+        LOG_WARN("skip pmi_resizable_simple::pmrt_finalize\n");
+        return ATL_STATUS_SUCCESS;
     }
 
     char kvs_name[MAX_KVS_NAME_LENGTH];
@@ -113,63 +117,74 @@ void pmi_resizable_simple::pmrt_finalize() {
     char kvs_val[MAX_KVS_VAL_LENGTH];
 
     while (cut_head(kvs_name, kvs_key, kvs_val, ST_CLIENT)) {
-        k->kvs_remove_name_key(kvs_name, kvs_key);
+        KVS_2_ATL_CHECK_STATUS(k->kvs_remove_name_key(kvs_name, kvs_key), "failed to remove info");
     }
+    return ATL_STATUS_SUCCESS;
 }
 
-void pmi_resizable_simple::pmrt_barrier() {
+atl_status_t pmi_resizable_simple::pmrt_barrier() {
     size_t min_barrier_num;
     char barrier_num_str[INT_STR_SIZE];
 
-    SET_STR(barrier_num_str, INT_STR_SIZE, SIZE_T_TEMPLATE, barrier_num);
+    ATL_SET_STR(barrier_num_str, INT_STR_SIZE, SIZE_T_TEMPLATE, barrier_num);
 
-    kvs_set_value(KVS_BARRIER, std::to_string(assigned_proc_idx).c_str(), barrier_num_str);
+    ATL_CHECK_STATUS(
+        kvs_set_value(KVS_BARRIER, std::to_string(assigned_proc_idx).c_str(), barrier_num_str),
+        "failed to set barrier num");
 
-    min_barrier_num = get_barrier_idx();
-    while (min_barrier_num != barrier_num) {
-        min_barrier_num = get_barrier_idx();
-    }
+    do {
+        ATL_CHECK_STATUS(get_barrier_idx(min_barrier_num), "failed to get barrier num");
+    } while (min_barrier_num != barrier_num);
 
     barrier_num++;
     if (barrier_num > BARRIER_NUM_MAX)
         barrier_num = 0;
+    return ATL_STATUS_SUCCESS;
 }
-void pmi_resizable_simple::pmrt_barrier_full() {
+atl_status_t pmi_resizable_simple::pmrt_barrier_full() {
     size_t min_barrier_num;
     char barrier_num_str[INT_STR_SIZE];
 
-    SET_STR(barrier_num_str, INT_STR_SIZE, SIZE_T_TEMPLATE, barrier_num_full);
+    ATL_SET_STR(barrier_num_str, INT_STR_SIZE, SIZE_T_TEMPLATE, barrier_num_full);
 
-    kvs_set_value(KVS_BARRIER_FULL, std::to_string(assigned_thread_idx).c_str(), barrier_num_str);
+    ATL_CHECK_STATUS(
+        kvs_set_value(
+            KVS_BARRIER_FULL, std::to_string(assigned_thread_idx).c_str(), barrier_num_str),
+        "failed to set barrier num");
 
-    min_barrier_num = get_barrier_full_idx();
+    ATL_CHECK_STATUS(get_barrier_full_idx(min_barrier_num), "failed to get barrier num");
     while (min_barrier_num != barrier_num) {
-        min_barrier_num = get_barrier_idx();
+        ATL_CHECK_STATUS(get_barrier_idx(min_barrier_num), "failed to get barrier num");
     }
 
     barrier_num_full++;
     if (barrier_num_full > BARRIER_NUM_MAX)
         barrier_num_full = 0;
+    return ATL_STATUS_SUCCESS;
 }
 
-size_t pmi_resizable_simple::get_barrier_full_idx() {
+atl_status_t pmi_resizable_simple::get_barrier_full_idx(size_t& res) {
+    res = 0;
     size_t thread_count = ranks_per_thread_map.size();
 
-    kvs_get_value(KVS_BARRIER_FULL, std::to_string(0).c_str(), val_storage);
+    ATL_CHECK_STATUS(kvs_get_value(KVS_BARRIER_FULL, std::to_string(0).c_str(), val_storage),
+                     "failed to get barrier idx");
 
     size_t min_barrier_idx = atoi(val_storage);
     size_t barrier_idx;
     for (size_t i = 1; i < thread_count; i++) {
-        kvs_get_value(KVS_BARRIER_FULL, std::to_string(i).c_str(), val_storage);
+        ATL_CHECK_STATUS(kvs_get_value(KVS_BARRIER_FULL, std::to_string(i).c_str(), val_storage),
+                         "failed to get barrier idx");
 
         barrier_idx = atoi(val_storage);
 
         if (min_barrier_idx > barrier_idx)
             min_barrier_idx = barrier_idx;
     }
-
-    return min_barrier_idx;
+    res = min_barrier_idx;
+    return ATL_STATUS_SUCCESS;
 }
+
 atl_status_t pmi_resizable_simple::pmrt_kvs_put(char* kvs_key,
                                                 int proc_idx,
                                                 const void* kvs_val,
@@ -180,14 +195,18 @@ atl_status_t pmi_resizable_simple::pmrt_kvs_put(char* kvs_key,
         return ATL_STATUS_FAILURE;
 
     ret = snprintf(key_storage, max_keylen - 1, RESIZABLE_PMI_RT_KEY_FORMAT, kvs_key, proc_idx);
-    if (ret < 0)
+    if (ret < 0) {
+        LOG_ERROR("snprintf failed");
         return ATL_STATUS_FAILURE;
+    }
 
     ret = encode(kvs_val, kvs_val_len, val_storage, max_vallen);
-    if (ret)
+    if (ret) {
+        LOG_ERROR("encode failed");
         return ATL_STATUS_FAILURE;
+    }
 
-    kvs_set_value(KVS_NAME, key_storage, val_storage);
+    ATL_CHECK_STATUS(kvs_set_value(KVS_NAME, key_storage, val_storage), "failed to set val");
 
     return ATL_STATUS_SUCCESS;
 }
@@ -200,14 +219,18 @@ atl_status_t pmi_resizable_simple::pmrt_kvs_get(char* kvs_key,
     char key_storage[max_keylen];
 
     ret = snprintf(key_storage, max_keylen - 1, RESIZABLE_PMI_RT_KEY_FORMAT, kvs_key, proc_idx);
-    if (ret < 0)
+    if (ret < 0) {
+        LOG_ERROR("snprintf failed");
         return ATL_STATUS_FAILURE;
+    }
 
-    kvs_get_value(KVS_NAME, key_storage, val_storage);
+    ATL_CHECK_STATUS(kvs_get_value(KVS_NAME, key_storage, val_storage), "failed to get val");
 
     ret = decode(val_storage, kvs_val, kvs_val_len);
-    if (ret)
+    if (ret) {
+        LOG_ERROR("decode failed");
         return ATL_STATUS_FAILURE;
+    }
 
     return ATL_STATUS_SUCCESS;
 }
@@ -224,100 +247,118 @@ size_t pmi_resizable_simple::get_local_thread_idx() {
     return local_thread_idx;
 }
 
-int pmi_resizable_simple::kvs_set_value(const char* kvs_name, const char* key, const char* value) {
+atl_status_t pmi_resizable_simple::kvs_set_value(const char* kvs_name,
+                                                 const char* key,
+                                                 const char* value) {
     std::string result_kvs_name = std::string(kvs_name) + std::to_string(local_id);
     put_key(result_kvs_name.c_str(), key, value, ST_CLIENT);
 
-    return k->kvs_set_value(result_kvs_name.c_str(), key, value);
+    return (k->kvs_set_value(result_kvs_name.c_str(), key, value) == KVS_STATUS_SUCCESS)
+               ? ATL_STATUS_SUCCESS
+               : ATL_STATUS_FAILURE;
 }
 
-int pmi_resizable_simple::kvs_get_value(const char* kvs_name, const char* key, char* value) {
+atl_status_t pmi_resizable_simple::kvs_get_value(const char* kvs_name,
+                                                 const char* key,
+                                                 char* value) {
     std::string result_kvs_name = std::string(kvs_name) + std::to_string(local_id);
 
     time_t start_time = time(NULL);
     size_t kvs_get_time = 0;
 
-    while (k->kvs_get_value_by_name_key(result_kvs_name.c_str(), key, value) == 0 &&
-           kvs_get_time < kvs_get_timeout) {
+    do {
+        KVS_2_ATL_CHECK_STATUS(k->kvs_get_value_by_name_key(result_kvs_name.c_str(), key, value),
+                               "failed to get value");
         kvs_get_time = time(NULL) - start_time;
-    }
+    } while (strlen(value) == 0 && kvs_get_time < kvs_get_timeout);
 
     if (kvs_get_time >= kvs_get_timeout) {
-        printf("KVS get error: timeout limit (%zu > %zu), prefix: %s, key %s\n",
-               kvs_get_time,
-               kvs_get_timeout,
-               result_kvs_name.c_str(),
-               key);
-        exit(1);
+        LOG_ERROR("KVS get error: timeout limit (%zu > %zu), prefix: %s, key %s\n",
+                  kvs_get_time,
+                  kvs_get_timeout,
+                  result_kvs_name.c_str(),
+                  key);
+        return ATL_STATUS_FAILURE;
     }
 
     return ATL_STATUS_SUCCESS;
 }
 
-int pmi_resizable_simple::kvs_iget_value(const char* kvs_name, const char* key, char* value) {
+atl_status_t pmi_resizable_simple::kvs_iget_value(const char* kvs_name,
+                                                  const char* key,
+                                                  char* value) {
     std::string result_kvs_name = std::string(kvs_name) + std::to_string(local_id);
-    return k->kvs_get_value_by_name_key(result_kvs_name.c_str(), key, value);
+    return k->kvs_get_value_by_name_key(result_kvs_name.c_str(), key, value) == KVS_STATUS_SUCCESS
+               ? ATL_STATUS_SUCCESS
+               : ATL_STATUS_FAILURE;
 }
-size_t pmi_resizable_simple::get_barrier_idx() {
+atl_status_t pmi_resizable_simple::get_barrier_idx(size_t& barrier_num_out) {
     size_t proc_count = threads_per_proc.size();
+    barrier_num_out = 0;
 
-    kvs_get_value(KVS_BARRIER, std::to_string(0).c_str(), val_storage);
+    ATL_CHECK_STATUS(kvs_get_value(KVS_BARRIER, std::to_string(0).c_str(), val_storage),
+                     "failed to get barrier");
 
     size_t min_barrier_idx = atoi(val_storage);
     size_t barrier_idx;
     for (size_t i = 1; i < proc_count; i++) {
-        kvs_get_value(KVS_BARRIER, std::to_string(i).c_str(), val_storage);
-
+        ATL_CHECK_STATUS(kvs_get_value(KVS_BARRIER, std::to_string(i).c_str(), val_storage),
+                         "failed to get barrier");
         barrier_idx = atoi(val_storage);
 
         if (min_barrier_idx > barrier_idx)
             min_barrier_idx = barrier_idx;
     }
 
-    return min_barrier_idx;
+    barrier_num_out = min_barrier_idx;
+    return ATL_STATUS_SUCCESS;
 }
 
-void pmi_resizable_simple::register_first_rank_idx_and_rank_count() {
-    kvs_set_value(
+atl_status_t pmi_resizable_simple::register_first_rank_idx_and_rank_count() {
+    return kvs_set_value(
         RANKS_PER_THREAD, std::to_string(ranks[0]).c_str(), std::to_string(ranks.size()).c_str());
 }
 
-void pmi_resizable_simple::assign_thread_idx_and_fill_ranks_per_thread_map() {
+atl_status_t pmi_resizable_simple::assign_thread_idx_and_fill_ranks_per_thread_map() {
     int rank_count = 0;
     int ranks_per_thread;
     while (rank_count < total_rank_count) {
         if (rank_count == ranks[0]) {
             assigned_thread_idx = ranks_per_thread_map.size();
         }
-        kvs_get_value(RANKS_PER_THREAD, std::to_string(rank_count).c_str(), val_storage);
+        ATL_CHECK_STATUS(
+            kvs_get_value(RANKS_PER_THREAD, std::to_string(rank_count).c_str(), val_storage),
+            "failed to get ranks");
 
-        ranks_per_thread = safe_strtol(val_storage, NULL, 10);
+        ranks_per_thread = std::atoi(val_storage);
         ranks_per_thread_map.push_back(ranks_per_thread);
         rank_count += ranks_per_thread;
     }
+    return ATL_STATUS_SUCCESS;
 }
 
-void pmi_resizable_simple::register_my_proc_name() {
+atl_status_t pmi_resizable_simple::register_my_proc_name() {
     int my_pid = getpid();
     const int hostname_len = 1024;
     char hostname[hostname_len];
     int ret = gethostname(hostname, hostname_len);
     if (ret) {
-        printf("gethostname error: %s\n", strerror(errno));
-        exit(EXIT_FAILURE);
+        LOG_ERROR("gethostname error: %s\n", strerror(errno));
+        return ATL_STATUS_FAILURE;
     }
     my_proccess_name = std::string(hostname) + std::to_string(my_pid);
 
-    kvs_set_value(
+    return kvs_set_value(
         PROCESS_THREAD_NAME, std::to_string(assigned_thread_idx).c_str(), my_proccess_name.c_str());
 }
 
-void pmi_resizable_simple::get_my_proc_idx_and_proc_count() {
+atl_status_t pmi_resizable_simple::get_my_proc_idx_and_proc_count() {
     std::map<std::string, int> proc_name_to_rank;
     std::map<std::string, int>::iterator it;
     int rank;
     for (size_t i = 0; i < ranks_per_thread_map.size(); i++) {
-        kvs_get_value(PROCESS_THREAD_NAME, std::to_string(i).c_str(), val_storage);
+        ATL_CHECK_STATUS(kvs_get_value(PROCESS_THREAD_NAME, std::to_string(i).c_str(), val_storage),
+                         "failed to get proc name");
 
         it = proc_name_to_rank.find(val_storage);
         if (it == proc_name_to_rank.end()) {
@@ -325,9 +366,10 @@ void pmi_resizable_simple::get_my_proc_idx_and_proc_count() {
             if (!my_proccess_name.compare(val_storage)) {
                 assigned_proc_idx = rank;
                 if (assigned_thread_idx == i) {
-                    kvs_set_value(REQUESTED_RANK_TO_NAME,
-                                  std::to_string(assigned_proc_idx).c_str(),
-                                  my_proccess_name.c_str());
+                    ATL_CHECK_STATUS(kvs_set_value(REQUESTED_RANK_TO_NAME,
+                                                   std::to_string(assigned_proc_idx).c_str(),
+                                                   my_proccess_name.c_str()),
+                                     "failed to set proc name");
                 }
             }
             proc_name_to_rank[val_storage] = rank;
@@ -337,6 +379,7 @@ void pmi_resizable_simple::get_my_proc_idx_and_proc_count() {
             threads_per_proc[it->second].push_back(i);
         }
     }
+    return ATL_STATUS_SUCCESS;
 }
 
 void pmi_resizable_simple::calculate_local_thread_idx() {
@@ -350,55 +393,78 @@ void pmi_resizable_simple::calculate_local_thread_idx() {
     }
 }
 
-void pmi_resizable_simple::make_map_requested2global() {
+atl_status_t pmi_resizable_simple::make_map_requested2global() {
     char global_rank_str[MAX_KVS_VAL_LENGTH];
     char process_name[MAX_KVS_VAL_LENGTH];
     size_t size = get_size();
     requested2global.resize(size);
-    pmrt_barrier_full();
+    ATL_CHECK_STATUS(pmrt_barrier_full(), "make_map_requested2global: full barrier failed");
     for (size_t i = 0; i < size; i++) {
-        kvs_get_value(REQUESTED_RANK_TO_NAME, std::to_string(i).c_str(), process_name);
-        if (kvs_iget_value(GLOBAL_NAME_TO_RANK, process_name, global_rank_str) == 0) {
+        ATL_CHECK_STATUS(
+            kvs_get_value(REQUESTED_RANK_TO_NAME, std::to_string(i).c_str(), process_name),
+            "make_map_requested2global: failed to get proc name");
+        ATL_CHECK_STATUS(kvs_iget_value(GLOBAL_NAME_TO_RANK, process_name, global_rank_str),
+                         "make_map_requested2global: failed to get glob rank");
+        if (strlen(global_rank_str) == 0) {
             if (!my_proccess_name.compare(process_name)) {
                 int free_glob_rank = 0;
-                while (kvs_iget_value(GLOBAL_RANK_TO_NAME,
-                                      std::to_string(free_glob_rank).c_str(),
-                                      process_name) != 0) {
+                ATL_CHECK_STATUS(
+                    kvs_iget_value(
+                        GLOBAL_RANK_TO_NAME, std::to_string(free_glob_rank).c_str(), process_name),
+                    "make_map_requested2global: failed to get proc name");
+                while (strlen(process_name) != 0) {
                     free_glob_rank++;
+                    ATL_CHECK_STATUS(kvs_iget_value(GLOBAL_RANK_TO_NAME,
+                                                    std::to_string(free_glob_rank).c_str(),
+                                                    process_name),
+                                     "make_map_requested2global: failed to get proc name");
                 }
-                kvs_set_value(GLOBAL_RANK_TO_NAME,
-                              std::to_string(free_glob_rank).c_str(),
-                              my_proccess_name.c_str());
-                kvs_set_value(GLOBAL_NAME_TO_RANK,
-                              my_proccess_name.c_str(),
-                              std::to_string(free_glob_rank).c_str());
+                ATL_CHECK_STATUS(kvs_set_value(GLOBAL_RANK_TO_NAME,
+                                               std::to_string(free_glob_rank).c_str(),
+                                               my_proccess_name.c_str()),
+                                 "make_map_requested2global: failed to set proc name");
+                ATL_CHECK_STATUS(kvs_set_value(GLOBAL_NAME_TO_RANK,
+                                               my_proccess_name.c_str(),
+                                               std::to_string(free_glob_rank).c_str()),
+                                 "make_map_requested2global: failed to set free rank info");
             }
-            kvs_get_value(GLOBAL_NAME_TO_RANK, process_name, global_rank_str);
+            ATL_CHECK_STATUS(kvs_get_value(GLOBAL_NAME_TO_RANK, process_name, global_rank_str),
+                             "make_map_requested2global: failed to get rank info");
         }
         requested2global[i] = atoi(global_rank_str);
     }
-    pmrt_barrier_full();
+    ATL_CHECK_STATUS(pmrt_barrier_full(), "make_map_requested2global: full barrier failed");
+    return ATL_STATUS_SUCCESS;
 }
 
-size_t pmi_resizable_simple::get_local_kvs_id() {
+atl_status_t pmi_resizable_simple::get_local_kvs_id(size_t& res) {
     char local_kvs_id[MAX_KVS_VAL_LENGTH];
+    res = 0;
     /*TODO: change it for collect local_per_rank id, not global*/
-    if (k->kvs_get_value_by_name_key(LOCAL_KVS_ID, "ID", local_kvs_id) == 0)
-        return 0;
-    return atoi(local_kvs_id);
+    KVS_2_ATL_CHECK_STATUS(k->kvs_get_value_by_name_key(LOCAL_KVS_ID, "ID", local_kvs_id),
+                           "failed to get local kvs id");
+    res = atoi(local_kvs_id);
+    return ATL_STATUS_SUCCESS;
 }
 
-void pmi_resizable_simple::set_local_kvs_id(size_t local_kvs_id) {
+atl_status_t pmi_resizable_simple::set_local_kvs_id(size_t local_kvs_id) {
     /*TODO: change it for collect local_per_rank id, not global*/
     put_key(LOCAL_KVS_ID, "ID", std::to_string(local_kvs_id).c_str(), ST_CLIENT);
-    k->kvs_set_value(LOCAL_KVS_ID, "ID", std::to_string(local_kvs_id).c_str());
+    return (k->kvs_set_value(LOCAL_KVS_ID, "ID", std::to_string(local_kvs_id).c_str()) ==
+            KVS_STATUS_SUCCESS)
+               ? ATL_STATUS_SUCCESS
+               : ATL_STATUS_FAILURE;
 }
 pmi_resizable_simple::~pmi_resizable_simple() {
-    if (!is_finalized)
-        pmrt_finalize();
+    if (!is_finalized) {
+        CCL_THROW_IF_NOT(pmrt_finalize() == ATL_STATUS_SUCCESS, "~pmi_resizable_simple: failed");
+    }
 }
-void pmi_resizable_simple::remove_initial_data() {
+atl_status_t pmi_resizable_simple::remove_initial_data() {
     std::string result_kvs_name = std::string(RANKS_PER_THREAD) + std::to_string(0);
     remove_val(result_kvs_name.c_str(), std::to_string(ranks[0]).c_str(), ST_CLIENT);
-    k->kvs_remove_name_key(result_kvs_name.c_str(), std::to_string(ranks[0]).c_str());
+    return k->kvs_remove_name_key(result_kvs_name.c_str(), std::to_string(ranks[0]).c_str()) ==
+                   KVS_STATUS_SUCCESS
+               ? ATL_STATUS_SUCCESS
+               : ATL_STATUS_FAILURE;
 }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.h
index 3475bd2d9..8bb255883 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.h
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.h
@@ -45,7 +45,7 @@ class pmi_resizable_simple final : public ipmi {
     pmi_resizable_simple(int total_rank_count,
                          const std::vector<int>& ranks,
                          std::shared_ptr<ikvs_wrapper> k,
-                         const char* main_addr = nullptr);
+                         const char* main_addr = "");
 
     ~pmi_resizable_simple() override;
 
@@ -59,7 +59,7 @@ class pmi_resizable_simple final : public ipmi {
 
     atl_status_t pmrt_wait_notification() override;
 
-    void pmrt_barrier() override;
+    atl_status_t pmrt_barrier() override;
 
     atl_status_t pmrt_kvs_put(char* kvs_key,
                               int proc_idx,
@@ -77,9 +77,9 @@ class pmi_resizable_simple final : public ipmi {
 
     size_t get_local_thread_idx() override;
 
-    size_t get_local_kvs_id() override;
+    atl_status_t get_local_kvs_id(size_t& res) override;
 
-    void set_local_kvs_id(size_t local_kvs_id) override;
+    atl_status_t set_local_kvs_id(size_t local_kvs_id) override;
 
     size_t get_threads_per_process() override {
         return threads_per_proc[assigned_proc_idx].size();
@@ -94,28 +94,29 @@ class pmi_resizable_simple final : public ipmi {
         return res;
     }
 
-    void pmrt_finalize() override;
+    atl_status_t pmrt_finalize() override;
+
+    atl_status_t pmrt_init() override;
 
 private:
     bool is_finalized{ false };
-    atl_status_t pmrt_init(const char* main_addr = nullptr);
 
-    int kvs_set_value(const char* kvs_name, const char* key, const char* value);
-    int kvs_get_value(const char* kvs_name, const char* key, char* value);
-    int kvs_iget_value(const char* kvs_name, const char* key, char* value);
+    atl_status_t kvs_set_value(const char* kvs_name, const char* key, const char* value);
+    atl_status_t kvs_get_value(const char* kvs_name, const char* key, char* value);
+    atl_status_t kvs_iget_value(const char* kvs_name, const char* key, char* value);
 
-    size_t get_barrier_idx();
-    size_t get_barrier_full_idx();
+    atl_status_t get_barrier_idx(size_t& barrier_num_out);
+    atl_status_t get_barrier_full_idx(size_t& res);
 
     void calculate_local_thread_idx();
-    void register_first_rank_idx_and_rank_count();
-    void assign_thread_idx_and_fill_ranks_per_thread_map();
-    void register_my_proc_name();
-    void get_my_proc_idx_and_proc_count();
-    void make_requested_info();
-    void remove_initial_data();
-    void make_map_requested2global();
-    void pmrt_barrier_full();
+    atl_status_t register_first_rank_idx_and_rank_count();
+    atl_status_t assign_thread_idx_and_fill_ranks_per_thread_map();
+    atl_status_t register_my_proc_name();
+    atl_status_t get_my_proc_idx_and_proc_count();
+    atl_status_t make_requested_info();
+    atl_status_t remove_initial_data();
+    atl_status_t make_map_requested2global();
+    atl_status_t pmrt_barrier_full();
 
     int total_rank_count;
     int assigned_proc_idx;
@@ -127,6 +128,7 @@ class pmi_resizable_simple final : public ipmi {
     std::vector<size_t> ranks_per_thread_map;
     std::map<size_t, std::list<size_t>> threads_per_proc;
     std::shared_ptr<ikvs_wrapper> k;
+    std::string main_addr;
     size_t max_keylen;
     size_t max_vallen;
     char* val_storage = nullptr;
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.cpp
index 1975d368e..0397f54d4 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.cpp
@@ -38,17 +38,17 @@ pmi_resizable_simple_internal::pmi_resizable_simple_internal(int size,
                                                              const char* main_addr)
         : total_rank_count(size),
           ranks(ranks),
-          k(k) {
+          k(k),
+          main_addr(main_addr) {
     max_keylen = MAX_KVS_KEY_LENGTH;
     max_vallen = MAX_KVS_VAL_LENGTH;
-    pmrt_init(main_addr);
 }
 
 int pmi_resizable_simple_internal::is_pm_resize_enabled() {
     return 0;
 }
 
-atl_status_t pmi_resizable_simple_internal::pmrt_init(const char* main_addr) {
+atl_status_t pmi_resizable_simple_internal::pmrt_init() {
     (void)main_addr;
 
     char* kvs_get_timeout_str = getenv("CCL_KVS_GET_TIMEOUT");
@@ -58,26 +58,29 @@ atl_status_t pmi_resizable_simple_internal::pmrt_init(const char* main_addr) {
 
     local_id = 0;
     val_storage = (char*)calloc(1, max_vallen);
-    if (!val_storage)
+    if (!val_storage) {
+        LOG_ERROR("mem alloc failed");
         return ATL_STATUS_FAILURE;
-    local_id = get_local_kvs_id();
-    barrier_full_reg();
+    }
+    ATL_CHECK_STATUS(get_local_kvs_id(local_id), "failed to get local id");
+    ATL_CHECK_STATUS(barrier_full_reg(), "failed to full_barrier info register");
 
-    registration();
+    ATL_CHECK_STATUS(registration(), "registration failed");
 
     if (ranks[0] == 0) {
-        size_t tmp_local_id = get_local_kvs_id();
+        size_t tmp_local_id;
+        ATL_CHECK_STATUS(get_local_kvs_id(tmp_local_id), "failed to get local id");
         tmp_local_id++;
-        set_local_kvs_id(tmp_local_id);
+        ATL_CHECK_STATUS(set_local_kvs_id(tmp_local_id), "failed to set local id");
     }
     if (thread_num == 0) {
-        barrier_reg();
+        ATL_CHECK_STATUS(barrier_reg(), "failed to barrier info register");
     }
 
     return ATL_STATUS_SUCCESS;
 }
 
-void pmi_resizable_simple_internal::registration() {
+atl_status_t pmi_resizable_simple_internal::registration() {
     std::string total_local_rank_count_str = std::to_string(total_rank_count);
     std::string result_kvs_name = std::string(INTERNAL_REGISTRATION) + std::to_string(local_id);
     memset(val_storage, 0, max_vallen);
@@ -88,10 +91,14 @@ void pmi_resizable_simple_internal::registration() {
              ranks[0],
              getpid(),
              gettid());
-    k->kvs_set_size(
-        result_kvs_name.c_str(), result_kvs_name.c_str(), total_local_rank_count_str.c_str());
+    KVS_2_ATL_CHECK_STATUS(
+        k->kvs_set_size(
+            result_kvs_name.c_str(), result_kvs_name.c_str(), total_local_rank_count_str.c_str()),
+        "failed to set total rank count");
     /*return string: %PROC_COUNT%_%RANK_NUM%_%PROCESS_RANK_COUNT%_%THREADS_COUNT%_%THREAD_NUM% */
-    k->kvs_register(result_kvs_name.c_str(), result_kvs_name.c_str(), val_storage);
+    KVS_2_ATL_CHECK_STATUS(
+        k->kvs_register(result_kvs_name.c_str(), result_kvs_name.c_str(), val_storage),
+        "failed to register");
 
     char* proc_count_str = val_storage;
     char* rank_str = strstr(proc_count_str, "_");
@@ -112,53 +119,63 @@ void pmi_resizable_simple_internal::registration() {
     proc_rank_count = std::stoi(proc_rank_count_str);
     threads_count = std::stoi(threads_count_str);
     thread_num = std::stoi(thread_num_str);
+    return ATL_STATUS_SUCCESS;
 }
 
-void pmi_resizable_simple_internal::barrier_full_reg() {
+atl_status_t pmi_resizable_simple_internal::barrier_full_reg() {
     std::string empty_line("");
     std::string total_local_rank_count_str =
         std::to_string(total_rank_count) + "_" + std::to_string(ranks.size());
     std::string result_kvs_name = std::string(KVS_BARRIER_FULL) + std::to_string(local_id);
 
-    k->kvs_barrier_register(
-        result_kvs_name.c_str(), result_kvs_name.c_str(), total_local_rank_count_str.c_str());
-    pmrt_barrier_full();
+    KVS_2_ATL_CHECK_STATUS(
+        k->kvs_barrier_register(
+            result_kvs_name.c_str(), result_kvs_name.c_str(), total_local_rank_count_str.c_str()),
+        "registration failed");
+    ATL_CHECK_STATUS(pmrt_barrier_full(), "full barrier failed");
+    return ATL_STATUS_SUCCESS;
 }
 
-void pmi_resizable_simple_internal::barrier_reg() {
+atl_status_t pmi_resizable_simple_internal::barrier_reg() {
     std::string empty_line("");
     std::string proc_count_str = std::to_string(proc_count);
     std::string result_kvs_name = std::string(KVS_BARRIER) + std::to_string(local_id);
 
-    k->kvs_barrier_register(
-        result_kvs_name.c_str(), result_kvs_name.c_str(), proc_count_str.c_str());
-    pmrt_barrier_full();
+    KVS_2_ATL_CHECK_STATUS(
+        k->kvs_barrier_register(
+            result_kvs_name.c_str(), result_kvs_name.c_str(), proc_count_str.c_str()),
+        "registration failed");
+    ATL_CHECK_STATUS(pmrt_barrier_full(), "full barrier failed");
+    return ATL_STATUS_SUCCESS;
 }
 
 atl_status_t pmi_resizable_simple_internal::pmrt_main_addr_reserve(char* main_addr) {
+    LOG_ERROR("unsupported");
     return ATL_STATUS_UNSUPPORTED;
 }
 
 atl_status_t pmi_resizable_simple_internal::pmrt_set_resize_function(atl_resize_fn_t resize_fn) {
+    LOG_ERROR("unsupported");
     return ATL_STATUS_UNSUPPORTED;
 }
 
 atl_status_t pmi_resizable_simple_internal::pmrt_update() {
+    LOG_ERROR("unsupported");
     return ATL_STATUS_UNSUPPORTED;
 }
 
 atl_status_t pmi_resizable_simple_internal::pmrt_wait_notification() {
+    LOG_ERROR("unsupported");
     return ATL_STATUS_UNSUPPORTED;
 }
 
-void pmi_resizable_simple_internal::pmrt_finalize() {
+atl_status_t pmi_resizable_simple_internal::pmrt_finalize() {
     is_finalized = true;
     free(val_storage);
 
     if (getenv("CCL_PMI_FORCE_FINALIZE")) {
-        printf("skip pmi_resizable_simple::pmrt_finalize\n");
-        fflush(stdout);
-        return;
+        LOG_WARN("skip pmi_resizable_simple_internal::pmrt_finalize\n");
+        return ATL_STATUS_SUCCESS;
     }
 
     char kvs_name[MAX_KVS_NAME_LENGTH];
@@ -166,22 +183,29 @@ void pmi_resizable_simple_internal::pmrt_finalize() {
     char kvs_val[MAX_KVS_VAL_LENGTH];
 
     while (cut_head(kvs_name, kvs_key, kvs_val, ST_CLIENT)) {
-        k->kvs_remove_name_key(kvs_name, kvs_key);
+        KVS_2_ATL_CHECK_STATUS(k->kvs_remove_name_key(kvs_name, kvs_key), "failed to remove info");
     }
+    return ATL_STATUS_SUCCESS;
 }
 
-void pmi_resizable_simple_internal::pmrt_barrier() {
+atl_status_t pmi_resizable_simple_internal::pmrt_barrier() {
     std::string empty_line("");
     std::string result_kvs_name = std::string(KVS_BARRIER) + std::to_string(local_id);
 
-    k->kvs_barrier(result_kvs_name.c_str(), result_kvs_name.c_str(), empty_line.c_str());
+    return k->kvs_barrier(result_kvs_name.c_str(), result_kvs_name.c_str(), empty_line.c_str()) ==
+                   KVS_STATUS_SUCCESS
+               ? ATL_STATUS_SUCCESS
+               : ATL_STATUS_FAILURE;
 }
 
-void pmi_resizable_simple_internal::pmrt_barrier_full() {
+atl_status_t pmi_resizable_simple_internal::pmrt_barrier_full() {
     std::string empty_line("");
     std::string result_kvs_name = std::string(KVS_BARRIER_FULL) + std::to_string(local_id);
 
-    k->kvs_barrier(result_kvs_name.c_str(), result_kvs_name.c_str(), (empty_line.c_str()));
+    return k->kvs_barrier(result_kvs_name.c_str(), result_kvs_name.c_str(), (empty_line.c_str())) ==
+                   KVS_STATUS_SUCCESS
+               ? ATL_STATUS_SUCCESS
+               : ATL_STATUS_FAILURE;
 }
 
 atl_status_t pmi_resizable_simple_internal::pmrt_kvs_put(char* kvs_key,
@@ -190,18 +214,24 @@ atl_status_t pmi_resizable_simple_internal::pmrt_kvs_put(char* kvs_key,
                                                          size_t kvs_val_len) {
     int ret;
     char key_storage[max_keylen];
-    if (kvs_val_len > max_vallen)
+    if (kvs_val_len > max_vallen) {
+        LOG_ERROR("asked len > max len");
         return ATL_STATUS_FAILURE;
+    }
 
     ret = snprintf(key_storage, max_keylen - 1, RESIZABLE_PMI_RT_KEY_FORMAT, kvs_key, proc_idx);
-    if (ret < 0)
+    if (ret < 0) {
+        LOG_ERROR("snprintf failed");
         return ATL_STATUS_FAILURE;
+    }
 
     ret = encode(kvs_val, kvs_val_len, val_storage, max_vallen);
-    if (ret)
+    if (ret) {
+        LOG_ERROR("encode failed");
         return ATL_STATUS_FAILURE;
+    }
 
-    kvs_set_value(KVS_NAME, key_storage, val_storage);
+    ATL_CHECK_STATUS(kvs_set_value(KVS_NAME, key_storage, val_storage), "failed to set val");
 
     return ATL_STATUS_SUCCESS;
 }
@@ -214,14 +244,18 @@ atl_status_t pmi_resizable_simple_internal::pmrt_kvs_get(char* kvs_key,
     char key_storage[max_keylen];
 
     ret = snprintf(key_storage, max_keylen - 1, RESIZABLE_PMI_RT_KEY_FORMAT, kvs_key, proc_idx);
-    if (ret < 0)
+    if (ret < 0) {
+        LOG_ERROR("snprintf failed");
         return ATL_STATUS_FAILURE;
+    }
 
-    kvs_get_value(KVS_NAME, key_storage, val_storage);
+    ATL_CHECK_STATUS(kvs_get_value(KVS_NAME, key_storage, val_storage), "failed to get val");
 
     ret = decode(val_storage, kvs_val, kvs_val_len);
-    if (ret)
+    if (ret) {
+        LOG_ERROR("decode failed");
         return ATL_STATUS_FAILURE;
+    }
 
     return ATL_STATUS_SUCCESS;
 }
@@ -255,46 +289,53 @@ int pmi_resizable_simple_internal::kvs_set_value(const char* kvs_name,
     return k->kvs_set_value(result_kvs_name.c_str(), key, value);
 }
 
-int pmi_resizable_simple_internal::kvs_get_value(const char* kvs_name,
-                                                 const char* key,
-                                                 char* value) {
+// Polls the KVS for `key` until a non-empty value arrives or kvs_get_timeout is reached.
+// Returns ATL_STATUS_FAILURE on timeout; KVS errors are handled by KVS_2_ATL_CHECK_STATUS.
+atl_status_t pmi_resizable_simple_internal::kvs_get_value(const char* kvs_name,
+                                                          const char* key,
+                                                          char* value) {
     std::string result_kvs_name = std::string(kvs_name) + std::to_string(local_id);
 
     time_t start_time = time(NULL);
     size_t kvs_get_time = 0;
 
-    while (k->kvs_get_value_by_name_key(result_kvs_name.c_str(), key, value) == 0 &&
-           kvs_get_time < kvs_get_timeout) {
+    do {
+        KVS_2_ATL_CHECK_STATUS(k->kvs_get_value_by_name_key(result_kvs_name.c_str(), key, value),
+                               "failed to get value");
         kvs_get_time = time(NULL) - start_time;
-    }
+    } while (strlen(value) == 0 && kvs_get_time < kvs_get_timeout);
 
     if (kvs_get_time >= kvs_get_timeout) {
-        printf("KVS get error: timeout limit (%zu > %zu), prefix: %s, key %s\n",
-               kvs_get_time,
-               kvs_get_timeout,
-               result_kvs_name.c_str(),
-               key);
-        exit(1);
+        // LOG_ERROR joins its arguments; printf-style specifiers would be printed verbatim
+        LOG_ERROR("KVS get error: timeout limit (", kvs_get_time, " > ", kvs_get_timeout,
+                  "), prefix: ", result_kvs_name.c_str(), ", key ", key);
+        return ATL_STATUS_FAILURE;
     }
-
     return ATL_STATUS_SUCCESS;
 }
 
-size_t pmi_resizable_simple_internal::get_local_kvs_id() {
+atl_status_t pmi_resizable_simple_internal::get_local_kvs_id(size_t& res) {
     char local_kvs_id[MAX_KVS_VAL_LENGTH];
+    res = 0; // value left in res if the status check below returns early
     /*TODO: change it for collect local_per_rank id, not global*/
-    if (k->kvs_get_value_by_name_key(LOCAL_KVS_ID, "ID", local_kvs_id) == 0)
-        return 0;
-    return atoi(local_kvs_id);
+    KVS_2_ATL_CHECK_STATUS(k->kvs_get_value_by_name_key(LOCAL_KVS_ID, "ID", local_kvs_id),
+                           "failed to get local kvs id");
+    // the id is stored via std::to_string(size_t); parse it unsigned and full width, not via int
+    res = static_cast<size_t>(strtoull(local_kvs_id, nullptr, 10));
+    return ATL_STATUS_SUCCESS;
 }
 
-void pmi_resizable_simple_internal::set_local_kvs_id(size_t local_kvs_id) {
+atl_status_t pmi_resizable_simple_internal::set_local_kvs_id(size_t local_kvs_id) {
     /*TODO: change it for collect local_per_rank id, not global*/
     put_key(LOCAL_KVS_ID, "ID", std::to_string(local_kvs_id).c_str(), ST_CLIENT);
-    k->kvs_set_value(LOCAL_KVS_ID, "ID", std::to_string(local_kvs_id).c_str());
+    // store the id under LOCAL_KVS_ID/"ID" and map the KVS status onto the ATL status
+    const std::string id_str = std::to_string(local_kvs_id);
+    const auto set_status = k->kvs_set_value(LOCAL_KVS_ID, "ID", id_str.c_str());
+    return (set_status == KVS_STATUS_SUCCESS) ? ATL_STATUS_SUCCESS : ATL_STATUS_FAILURE;
 }
 
 pmi_resizable_simple_internal::~pmi_resizable_simple_internal() {
-    if (!is_finalized)
-        pmrt_finalize();
+    // never throw from a (noexcept) destructor: log a failed finalize instead
+    if (!is_finalized && pmrt_finalize() != ATL_STATUS_SUCCESS) {
+        LOG_ERROR("~pmi_resizable_simple_internal: failed");
+    }
 }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.h
index 566e5b371..00ed1ac33 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.h
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.h
@@ -45,7 +45,7 @@ class pmi_resizable_simple_internal final : public ipmi {
     pmi_resizable_simple_internal(int total_rank_count,
                                   const std::vector<int>& ranks,
                                   std::shared_ptr<internal_kvs> k,
-                                  const char* main_addr = nullptr);
+                                  const char* main_addr = "");
 
     ~pmi_resizable_simple_internal() override;
 
@@ -59,7 +59,7 @@ class pmi_resizable_simple_internal final : public ipmi {
 
     atl_status_t pmrt_wait_notification() override;
 
-    void pmrt_barrier() override;
+    atl_status_t pmrt_barrier() override;
 
     atl_status_t pmrt_kvs_put(char* kvs_key,
                               int proc_idx,
@@ -77,27 +77,28 @@ class pmi_resizable_simple_internal final : public ipmi {
 
     size_t get_local_thread_idx() override;
 
-    size_t get_local_kvs_id() override;
+    atl_status_t get_local_kvs_id(size_t& res) override;
 
-    void set_local_kvs_id(size_t local_kvs_id) override;
+    atl_status_t set_local_kvs_id(size_t local_kvs_id) override;
 
     size_t get_threads_per_process() override;
 
     size_t get_ranks_per_process() override;
 
-    void pmrt_finalize() override;
+    atl_status_t pmrt_finalize() override;
+
+    atl_status_t pmrt_init() override;
 
 private:
     bool is_finalized{ false };
-    atl_status_t pmrt_init(const char* main_addr = nullptr);
 
     int kvs_set_value(const char* kvs_name, const char* key, const char* value);
-    int kvs_get_value(const char* kvs_name, const char* key, char* value);
+    atl_status_t kvs_get_value(const char* kvs_name, const char* key, char* value);
 
-    void pmrt_barrier_full();
-    void barrier_full_reg();
-    void barrier_reg();
-    void registration();
+    atl_status_t pmrt_barrier_full();
+    atl_status_t barrier_full_reg();
+    atl_status_t barrier_reg();
+    atl_status_t registration();
 
     int proc_count = 0;
     int rank = 0;
@@ -109,6 +110,7 @@ class pmi_resizable_simple_internal final : public ipmi {
 
     std::vector<int> ranks;
     std::shared_ptr<internal_kvs> k;
+    std::string main_addr;
     size_t max_keylen;
     size_t max_vallen;
     char* val_storage = nullptr;
diff --git a/src/atl/util/pm/pmi_rt/pmi_simple.cpp b/src/atl/util/pm/pmi_rt/pmi_simple.cpp
index 6e14b8529..9526f900a 100644
--- a/src/atl/util/pm/pmi_rt/pmi_simple.cpp
+++ b/src/atl/util/pm/pmi_rt/pmi_simple.cpp
@@ -13,6 +13,7 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
+#include "common/log/log.hpp"
 #include "pmi_simple.h"
 #include "pmi_rt.c"
 
@@ -20,37 +21,41 @@ int pmi_simple::is_pm_resize_enabled() {
     return false;
 }
 
-pmi_simple::pmi_simple() {
-    pmirt_init(&rank, &size, &pmrt_desc);
+pmi_simple::pmi_simple() : rank(0), size(0) {} // real values are filled in by pmrt_init()
+
+atl_status_t pmi_simple::pmrt_init() { // forwards to pmirt_init and returns its status
+    return pmirt_init(&rank, &size, &pmrt_desc);
 }
 
 atl_status_t pmi_simple::pmrt_main_addr_reserve(char *main_addr) {
-    printf("Function main_addr_reserv unsupported yet for simple pmi\n");
+    LOG_ERROR("Function main_addr_reserv unsupported yet for simple pmi\n");
     return ATL_STATUS_FAILURE;
 }
 
 atl_status_t pmi_simple::pmrt_set_resize_function(atl_resize_fn_t resize_fn) {
-    printf("Function set_resize_function unsupported yet for simple pmi\n");
+    LOG_ERROR("Function set_resize_function unsupported yet for simple pmi\n");
     return ATL_STATUS_FAILURE;
 }
 
 atl_status_t pmi_simple::pmrt_update() {
-    printf("Function update unsupported yet for simple pmi\n");
+    LOG_ERROR("Function update unsupported yet for simple pmi\n");
     return ATL_STATUS_FAILURE;
 }
 
 atl_status_t pmi_simple::pmrt_wait_notification() {
-    printf("Function wait_notification unsupported yet for simple pmi\n");
+    LOG_ERROR("Function wait_notification unsupported yet for simple pmi\n");
     return ATL_STATUS_FAILURE;
 }
 
-void pmi_simple::pmrt_finalize() {
+atl_status_t pmi_simple::pmrt_finalize() { // sets is_finalized, then calls pmirt_finalize
     is_finalized = true;
     pmirt_finalize(pmrt_desc);
+    return ATL_STATUS_SUCCESS; // NOTE(review): always success; pmirt_finalize is not checked
 }
 
-void pmi_simple::pmrt_barrier() {
+atl_status_t pmi_simple::pmrt_barrier() { // calls pmirt_barrier on pmrt_desc
     pmirt_barrier(pmrt_desc);
+    return ATL_STATUS_SUCCESS; // NOTE(review): always success; pmirt_barrier is not checked
 }
 
 atl_status_t pmi_simple::pmrt_kvs_put(char *kvs_key,
@@ -78,12 +83,16 @@ int pmi_simple::get_size() {
 size_t pmi_simple::get_local_thread_idx() {
     return 0;
 }
-size_t pmi_simple::get_local_kvs_id() {
-    return 0;
+atl_status_t pmi_simple::get_local_kvs_id(size_t &res) { // always reports id 0
+    res = 0;
+    return ATL_STATUS_SUCCESS;
+}
+atl_status_t pmi_simple::set_local_kvs_id(size_t local_kvs_id) { // no-op: the id is ignored
+    return ATL_STATUS_SUCCESS;
 }
-void pmi_simple::set_local_kvs_id(size_t local_kvs_id) {}
 
 pmi_simple::~pmi_simple() {
-    if (!is_finalized)
-        pmrt_finalize();
+    if (!is_finalized && pmrt_finalize() != ATL_STATUS_SUCCESS) {
+        LOG_ERROR("~pmi_simple: failed"); // log, never throw from a (noexcept) destructor
+    }
 }
diff --git a/src/atl/util/pm/pmi_rt/pmi_simple.h b/src/atl/util/pm/pmi_rt/pmi_simple.h
index 27d8b0571..8ce68407b 100644
--- a/src/atl/util/pm/pmi_rt/pmi_simple.h
+++ b/src/atl/util/pm/pmi_rt/pmi_simple.h
@@ -31,9 +31,9 @@ class pmi_simple final : public ipmi {
 
     atl_status_t pmrt_wait_notification() override;
 
-    void pmrt_finalize() override;
+    atl_status_t pmrt_finalize() override;
 
-    void pmrt_barrier() override;
+    atl_status_t pmrt_barrier() override;
 
     atl_status_t pmrt_kvs_put(char *kvs_key,
                               int proc_idx,
@@ -51,9 +51,9 @@ class pmi_simple final : public ipmi {
 
     size_t get_local_thread_idx() override;
 
-    size_t get_local_kvs_id() override;
+    atl_status_t get_local_kvs_id(size_t &res) override;
 
-    void set_local_kvs_id(size_t local_kvs_id) override;
+    atl_status_t set_local_kvs_id(size_t local_kvs_id) override;
 
     size_t get_threads_per_process() override {
         return 1;
@@ -63,6 +63,8 @@ class pmi_simple final : public ipmi {
         return 1;
     }
 
+    atl_status_t pmrt_init() override;
+
 private:
     int rank;
     int size;
diff --git a/src/ccl_api_functions.cpp b/src/ccl_api_functions.cpp
index 4ae113e2b..1a48173cd 100644
--- a/src/ccl_api_functions.cpp
+++ b/src/ccl_api_functions.cpp
@@ -16,12 +16,12 @@
 #include "oneapi/ccl/types.hpp"
 #include "oneapi/ccl/environment.hpp"
 #include "oneapi/ccl/api_functions.hpp"
-#include "common/comm/host_communicator/host_communicator.hpp"
+#include "common/comm/comm.hpp"
 #include "oneapi/ccl/exception.hpp"
 
-#if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
+#if defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL)
 #include "common/comm/comm_interface.hpp"
-#endif //#if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
+#endif //#if defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL)
 
 #include "ccl_api_functions_generators.hpp"
 #include "common/global/global.hpp"
diff --git a/src/ccl_cpp_communicator.cpp b/src/ccl_cpp_communicator.cpp
index 7e608b59b..af93c5dac 100644
--- a/src/ccl_cpp_communicator.cpp
+++ b/src/ccl_cpp_communicator.cpp
@@ -46,14 +46,12 @@
 #include "oneapi/ccl/event.hpp"
 
 #include "oneapi/ccl/communicator.hpp"
-#include "common/comm/l0/comm_context_storage.hpp"
 
 #include "common/global/global.hpp"
 
 //TODO
 #include "common/comm/comm.hpp"
 
-#include "common/comm/l0/comm_context.hpp"
 #include "communicator_impl.hpp"
 
 namespace ccl {
diff --git a/src/ccl_cpp_environment.cpp b/src/ccl_cpp_environment.cpp
index 7c7297170..37fb94813 100644
--- a/src/ccl_cpp_environment.cpp
+++ b/src/ccl_cpp_environment.cpp
@@ -18,10 +18,9 @@
 #include "exec/exec.hpp"
 #include "common/utils/version.hpp"
 
-#if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
-#include "common/comm/l0/comm_context.hpp"
+#if defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL)
 #include "common/comm/comm_interface.hpp"
-#endif //#if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
+#endif //#if defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL)
 
 #include <memory>
 
diff --git a/src/ccl_cpp_kvs.cpp b/src/ccl_cpp_kvs.cpp
index 600e523c3..765f9f36b 100644
--- a/src/ccl_cpp_kvs.cpp
+++ b/src/ccl_cpp_kvs.cpp
@@ -53,8 +53,10 @@ kvs::address_type kvs_impl::get_addr() {
 }
 
 vector_class<char> kvs_impl::get(const string_class& key) {
-    char ret[128];
-    inter_kvs->kvs_get_value_by_name_key(prefix.c_str(), key.c_str(), ret);
+    char ret[MAX_KVS_VAL_LENGTH];
+    CCL_THROW_IF_NOT(inter_kvs->kvs_get_value_by_name_key(prefix.c_str(), key.c_str(), ret) ==
+                         KVS_STATUS_SUCCESS,
+                     "kvs get failed");
     size_t ret_len = strlen(ret);
     vector_class<char> ret_vec;
     if (ret_len != 0) {
diff --git a/src/coll/algorithms/algorithm_utils.cpp b/src/coll/algorithms/algorithm_utils.cpp
index 48b5c00cf..04ecd4525 100644
--- a/src/coll/algorithms/algorithm_utils.cpp
+++ b/src/coll/algorithms/algorithm_utils.cpp
@@ -13,16 +13,12 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "coll/algorithms/algorithms_enum.hpp"
+#include <algorithm>
+#include <numeric>
+#include <sstream>
 
-bool ccl_coll_type_is_reduction(ccl_coll_type ctype) {
-    switch (ctype) {
-        case ccl_coll_allreduce:
-        case ccl_coll_reduce:
-        case ccl_coll_reduce_scatter: return true;
-        default: return false;
-    }
-}
+#include "coll/algorithms/algorithm_utils.hpp"
+#include "common/log/log.hpp"
 
 const char* ccl_coll_type_to_str(ccl_coll_type type) {
     switch (type) {
@@ -35,9 +31,56 @@ const char* ccl_coll_type_to_str(ccl_coll_type type) {
         case ccl_coll_reduce: return "reduce";
         case ccl_coll_reduce_scatter: return "reduce_scatter";
         case ccl_coll_sparse_allreduce: return "sparse_allreduce";
-        case ccl_coll_internal: return "internal";
         case ccl_coll_partial: return "partial";
+        case ccl_coll_undefined: return "undefined";
         default: return "unknown";
     }
     return "unknown";
 }
+
+// Splits elem_count elements of dtype_size bytes each into segments of about
+// requested_seg_size bytes and stores the element count of every segment in
+// seg_sizes (cleared first; left empty when there is no data to split).
+// Segment sizes differ by at most one element and always add up to elem_count.
+void ccl_get_segment_sizes(size_t dtype_size,
+                           size_t elem_count,
+                           size_t requested_seg_size,
+                           std::vector<size_t>& seg_sizes) {
+    seg_sizes.clear();
+
+    if (dtype_size == 0 || elem_count == 0) {
+        return;
+    }
+    else if (dtype_size >= requested_seg_size) {
+        // a single element already fills a segment: emit one element per segment
+        seg_sizes.resize(elem_count, 1);
+    }
+    else {
+        // elements per segment, rounded up so a segment is never smaller than requested
+        size_t seg_size = (requested_seg_size + dtype_size - 1) / dtype_size;
+        size_t total_seg_count = std::max<size_t>((elem_count + seg_size - 1) / seg_size, 1);
+        size_t regular_seg_size = elem_count / total_seg_count;
+        size_t large_seg_size = regular_seg_size + ((elem_count % total_seg_count) != 0);
+        size_t regular_seg_count = total_seg_count * large_seg_size - elem_count;
+
+        // regular segments come first; the trailing large segments absorb the remainder
+        seg_sizes.resize(total_seg_count, regular_seg_size);
+        std::fill(seg_sizes.begin() + regular_seg_count, seg_sizes.end(), large_seg_size);
+
+        // accumulate in size_t: an int initial value would truncate large totals
+        size_t sum = std::accumulate(seg_sizes.begin(), seg_sizes.end(), size_t(0));
+        if (sum != elem_count) {
+            std::stringstream ss;
+            for (size_t idx = 0; idx < seg_sizes.size(); idx++) {
+                ss << seg_sizes[idx] << " ";
+            }
+            CCL_THROW_IF_NOT(false, "unexpected sum of seg_sizes ", sum,
+                             ", expected ", elem_count,
+                             ", total_seg_count ", total_seg_count,
+                             ", regular_seg_count ", regular_seg_count,
+                             ", regular_seg_size ", regular_seg_size,
+                             ", large_seg_size ", large_seg_size,
+                             ", all seg_sizes: ", ss.str());
+        }
+    }
+}
diff --git a/src/coll/algorithms/algorithms_enum.hpp b/src/coll/algorithms/algorithm_utils.hpp
similarity index 64%
rename from src/coll/algorithms/algorithms_enum.hpp
rename to src/coll/algorithms/algorithm_utils.hpp
index 880991e44..f90b43b2d 100644
--- a/src/coll/algorithms/algorithms_enum.hpp
+++ b/src/coll/algorithms/algorithm_utils.hpp
@@ -14,8 +14,10 @@
  limitations under the License.
 */
 #pragma once
-#include "common/utils/enums.hpp"
 
+#include <vector>
+
+#include "common/utils/enums.hpp"
 #include "oneapi/ccl/types.hpp"
 
 #define CCL_COLL_LIST \
@@ -30,7 +32,8 @@ enum ccl_coll_allgatherv_algo {
     ccl_coll_allgatherv_naive,
     ccl_coll_allgatherv_ring,
     ccl_coll_allgatherv_flat,
-    ccl_coll_allgatherv_multi_bcast
+    ccl_coll_allgatherv_multi_bcast,
+    ccl_coll_allgatherv_topo
 };
 
 enum ccl_coll_allreduce_algo {
@@ -38,13 +41,13 @@ enum ccl_coll_allreduce_algo {
 
     ccl_coll_allreduce_direct,
     ccl_coll_allreduce_rabenseifner,
-    ccl_coll_allreduce_starlike,
+    ccl_coll_allreduce_nreduce,
     ccl_coll_allreduce_ring,
     ccl_coll_allreduce_ring_rma,
     ccl_coll_allreduce_double_tree,
     ccl_coll_allreduce_recursive_doubling,
     ccl_coll_allreduce_2d,
-    ccl_coll_allreduce_topo_ring
+    ccl_coll_allreduce_topo
 };
 
 enum ccl_coll_alltoall_algo {
@@ -79,7 +82,7 @@ enum ccl_coll_bcast_algo {
     ccl_coll_bcast_ring,
     ccl_coll_bcast_double_tree,
     ccl_coll_bcast_naive,
-    ccl_coll_bcast_topo_ring
+    ccl_coll_bcast_topo
 };
 
 enum ccl_coll_reduce_algo {
@@ -89,14 +92,15 @@ enum ccl_coll_reduce_algo {
     ccl_coll_reduce_rabenseifner,
     ccl_coll_reduce_tree,
     ccl_coll_reduce_double_tree,
-    ccl_coll_reduce_topo_ring
+    ccl_coll_reduce_topo
 };
 
 enum ccl_coll_reduce_scatter_algo {
     ccl_coll_reduce_scatter_undefined = 0,
 
     ccl_coll_reduce_scatter_direct,
-    ccl_coll_reduce_scatter_ring
+    ccl_coll_reduce_scatter_ring,
+    ccl_coll_reduce_scatter_topo
 };
 
 enum ccl_coll_sparse_allreduce_algo {
@@ -136,54 +140,15 @@ enum ccl_coll_type {
     ccl_coll_sparse_allreduce,
     ccl_coll_last_regular = ccl_coll_sparse_allreduce,
 
-    ccl_coll_internal,
     ccl_coll_partial,
+    ccl_coll_undefined,
 
     ccl_coll_last_value
 };
 
-// Currently ccl_coll_type is used in both compile-time and run-time contexts, so
-// need to have both versions of the check.
-// It's possible to have a constexpr function, but it requires some features from c++14
-// (e.g. multiple returns in constexpr functions)
-
-template <ccl_coll_type ctype, class Enable = void>
-struct is_reduction_coll_type : std::false_type {};
-
-// Reduction types
-template <ccl_coll_type ctype>
-struct is_reduction_coll_type<
-    ctype,
-    typename std::enable_if<ctype == ccl_coll_allreduce || ctype == ccl_coll_reduce ||
-                            ctype == ccl_coll_reduce_scatter>::type> : std::true_type {};
-
-bool ccl_coll_type_is_reduction(ccl_coll_type ctype);
 const char* ccl_coll_type_to_str(ccl_coll_type type);
 
-#define CCL_COLL_TYPE_LIST \
-    ccl_coll_type::ccl_coll_allgatherv, ccl_coll_type::ccl_coll_allreduce, \
-        ccl_coll_type::ccl_coll_alltoall, ccl_coll_type::ccl_coll_alltoallv, \
-        ccl_coll_type::ccl_coll_barrier, ccl_coll_type::ccl_coll_bcast, \
-        ccl_coll_type::ccl_coll_reduce, ccl_coll_type::ccl_coll_reduce_scatter, \
-        ccl_coll_type::ccl_coll_sparse_allreduce
-
-enum ccl_coll_reduction {
-    sum,
-    prod,
-    min,
-    max,
-    //custom, TODO: make support of custom reduction in *.cl
-
-    last_value
-};
-
-#define REDUCE_TYPES \
-    ccl::reduction::sum, ccl::reduction::prod, ccl::reduction::min, \
-        ccl::reduction::max /*, ccl::reduction::custom*/
-
-using ccl_reductions =
-    utils::enum_to_str<static_cast<typename std::underlying_type<ccl::reduction>::type>(
-        ccl::reduction::custom)>;
-inline const std::string reduction_to_str(ccl::reduction reduction_type) {
-    return ccl_reductions({ "sum", "prod", "min", "max" }).choose(reduction_type, "INVALID_VALUE");
-}
+void ccl_get_segment_sizes(size_t dtype_size,
+                           size_t elem_count,
+                           size_t requested_seg_size,
+                           std::vector<size_t>& seg_sizes);
diff --git a/src/coll/algorithms/algorithms.hpp b/src/coll/algorithms/algorithms.hpp
index 712de99f8..0fdce349c 100644
--- a/src/coll/algorithms/algorithms.hpp
+++ b/src/coll/algorithms/algorithms.hpp
@@ -38,14 +38,14 @@ ccl::status ccl_coll_build_scatter_ring_allgather_bcast(ccl_sched* sched,
                                                         int root,
                                                         ccl_comm* comm);
 
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
 ccl::status ccl_coll_build_gpu_bcast(ccl_sched* sched,
                                      ccl_buffer buf,
                                      size_t count,
                                      const ccl_datatype& dtype,
                                      int root,
                                      ccl_comm* comm);
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 
 ccl::status ccl_coll_build_dissemination_barrier(ccl_sched* sched, ccl_comm* comm);
 
@@ -58,7 +58,7 @@ ccl::status ccl_coll_build_rabenseifner_reduce(ccl_sched* sched,
                                                int root,
                                                ccl_comm* comm);
 
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
 ccl::status ccl_coll_build_gpu_reduce(ccl_sched* sched,
                                       ccl_buffer send_buf,
                                       ccl_buffer recv_buf,
@@ -67,7 +67,7 @@ ccl::status ccl_coll_build_gpu_reduce(ccl_sched* sched,
                                       ccl::reduction reduction,
                                       int root,
                                       ccl_comm* comm);
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 
 ccl::status ccl_coll_build_rabenseifner_allreduce(ccl_sched* sched,
                                                   ccl_buffer send_buf,
@@ -110,23 +110,24 @@ ccl::status ccl_coll_build_recursive_doubling_allreduce(ccl_sched* sched,
                                                         ccl::reduction reduction,
                                                         ccl_comm* comm);
 
-ccl::status ccl_coll_build_starlike_allreduce(ccl_sched* sched,
-                                              ccl_buffer send_buf,
-                                              ccl_buffer recv_buf,
-                                              size_t count,
-                                              const ccl_datatype& dtype,
-                                              ccl::reduction reduction,
-                                              ccl_comm* comm);
+ccl::status ccl_coll_build_nreduce_allreduce(ccl_sched* sched,
+                                             ccl_buffer send_buf,
+                                             ccl_buffer recv_buf,
+                                             size_t count,
+                                             const ccl_datatype& dtype,
+                                             ccl::reduction reduction,
+                                             ccl_comm* comm);
 
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
-ccl::status ccl_coll_build_gpu_allreduce(ccl_sched* sched,
-                                         ccl_buffer send_buf,
-                                         ccl_buffer recv_buf,
-                                         size_t count,
-                                         const ccl_datatype& dtype,
-                                         ccl::reduction reduction,
-                                         ccl_comm* comm);
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+ccl::status ccl_coll_build_topo_allreduce(ccl_sched* sched,
+                                          ccl_buffer send_buf,
+                                          ccl_buffer recv_buf,
+                                          size_t count,
+                                          const ccl_datatype& dtype,
+                                          ccl::reduction reduction,
+                                          ccl_comm* comm);
+
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 
 ccl::status ccl_coll_build_naive_allgatherv(ccl_sched* sched,
                                             ccl_buffer send_buf,
@@ -225,6 +226,14 @@ ccl::status ccl_coll_build_multi_bcast_allgatherv(ccl_master_sched* main_sched,
                                                   const ccl_coll_param& coll_param,
                                                   size_t data_partition_count);
 
+ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* sched,
+                                           ccl_buffer send_buf,
+                                           size_t send_count,
+                                           ccl_buffer recv_buf,
+                                           const size_t* recv_counts,
+                                           const ccl_datatype& dtype,
+                                           ccl_comm* comm);
+
 ccl::status ccl_coll_build_naive_alltoallv(ccl_master_sched* main_sched,
                                            std::vector<ccl_sched*>& scheds,
                                            const ccl_coll_param& coll_param);
@@ -295,3 +304,11 @@ ccl::status ccl_coll_build_direct_reduce_scatter(ccl_sched* sched,
                                                  const ccl_datatype& dtype,
                                                  ccl::reduction reduction,
                                                  ccl_comm* comm);
+
+ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched,
+                                               ccl_buffer send_buf,
+                                               ccl_buffer recv_buf,
+                                               size_t send_count,
+                                               const ccl_datatype& dtype,
+                                               ccl::reduction reduction,
+                                               ccl_comm* comm);
diff --git a/src/coll/algorithms/allgatherv.cpp b/src/coll/algorithms/allgatherv.cpp
index 98d1bd160..e43a7474b 100644
--- a/src/coll/algorithms/allgatherv.cpp
+++ b/src/coll/algorithms/allgatherv.cpp
@@ -13,12 +13,16 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include <numeric>
-
 #include "coll/algorithms/algorithms.hpp"
+#include "common/comm/comm.hpp"
 #include "sched/entry/coll/coll_entry_helper.hpp"
 #include "sched/entry/factory/chunked_entry_factory.hpp"
 #include "sched/entry/factory/entry_factory.hpp"
+#if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL)
+#include "coll/coll_util.hpp"
+#endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL
+
+#include <numeric>
 
 ccl::status ccl_coll_build_direct_allgatherv(ccl_sched* sched,
                                              ccl_buffer send_buf,
@@ -29,7 +33,7 @@ ccl::status ccl_coll_build_direct_allgatherv(ccl_sched* sched,
                                              ccl_comm* comm) {
     LOG_DEBUG("build direct allgatherv");
 
-    entry_factory::make_entry<allgatherv_entry>(
+    entry_factory::create<allgatherv_entry>(
         sched, send_buf, send_count, recv_buf, recv_counts, dtype, comm);
     return ccl::status::success;
 }
@@ -43,35 +47,37 @@ ccl::status ccl_coll_build_naive_allgatherv(ccl_sched* sched,
                                             ccl_comm* comm) {
     LOG_DEBUG("build naive allgatherv");
 
+    ccl::status status = ccl::status::success;
+
     int comm_size = comm->size();
-    int this_rank = comm->rank();
+    int comm_rank = comm->rank();
     size_t dtype_size = dtype.size();
-    size_t* offsets = static_cast<size_t*>(CCL_MALLOC(comm_size * sizeof(size_t), "offsets"));
-    ccl::status status = ccl::status::success;
+    std::vector<size_t> offsets(comm_size);
 
     offsets[0] = 0;
-    for (int rank_idx = 1; rank_idx < comm_size; ++rank_idx) {
-        offsets[rank_idx] = offsets[rank_idx - 1] + recv_counts[rank_idx - 1] * dtype_size;
+    for (int rank = 1; rank < comm_size; rank++) {
+        offsets[rank] = offsets[rank - 1] + recv_counts[rank - 1] * dtype_size;
     }
 
     if (send_buf != recv_buf) {
         // out-of-place case
-        entry_factory::make_entry<copy_entry>(
-            sched, send_buf, recv_buf + offsets[this_rank], send_count, dtype);
+        entry_factory::create<copy_entry>(
+            sched, send_buf, recv_buf + offsets[comm_rank], send_count, dtype);
     }
 
-    for (int rank_idx = 0; rank_idx < comm_size; ++rank_idx) {
-        if (rank_idx != this_rank) {
-            // send own buffer to other ranks
-            entry_factory::make_chunked_send_entry(
-                sched, recv_buf + offsets[this_rank], send_count, dtype, rank_idx, comm);
-            // recv other's rank buffer
-            entry_factory::make_chunked_recv_entry(
-                sched, recv_buf + offsets[rank_idx], recv_counts[rank_idx], dtype, rank_idx, comm);
-        }
+    for (int idx = 1; idx < comm_size; idx++) {
+        int dst = (comm_rank + idx) % comm_size;
+        int src = (comm_rank - idx + comm_size) % comm_size;
+
+        // send own buffer to other ranks
+        entry_factory::create<send_entry>(
+            sched, recv_buf + offsets[comm_rank], send_count, dtype, dst, comm);
+
+        // recv other's rank buffer
+        entry_factory::create<recv_entry>(
+            sched, recv_buf + offsets[src], recv_counts[src], dtype, src, comm);
     }
 
-    CCL_FREE(offsets);
     return status;
 }
 
@@ -99,7 +105,7 @@ ccl::status ccl_coll_build_ring_allgatherv(ccl_sched* sched,
     }
 
     if (send_buf != recv_buf) {
-        entry_factory::make_entry<copy_entry>(
+        entry_factory::create<copy_entry>(
             sched, send_buf, recv_buf + offsets[rank], send_count, dtype);
     }
 
@@ -124,8 +130,8 @@ ccl::status ccl_coll_build_ring_allgatherv(ccl_sched* sched,
         sbuf = recv_buf + send_block_offset;
         rbuf = recv_buf + recv_block_offset;
 
-        entry_factory::make_entry<send_entry>(sched, sbuf, send_block_count, dtype, dst, comm);
-        entry_factory::make_entry<recv_entry>(sched, rbuf, recv_block_count, dtype, src, comm);
+        entry_factory::create<send_entry>(sched, sbuf, send_block_count, dtype, dst, comm);
+        entry_factory::create<recv_entry>(sched, rbuf, recv_block_count, dtype, src, comm);
         sched->add_barrier();
 
         block_idx = (comm_size + block_idx - 1) % comm_size; // move left
@@ -197,13 +203,13 @@ ccl::status ccl_coll_build_flat_allgatherv(ccl_master_sched* main_sched,
                                ccl_buffer_type::INDIRECT);
 
     if (!inplace) {
-        entry_factory::make_entry<copy_entry>(scheds[2 * comm_rank % sched_count],
-                                              ccl_buffer(coll_param.get_send_buf_ptr(),
-                                                         coll_param.get_send_count() * dtype_size,
-                                                         ccl_buffer_type::INDIRECT),
-                                              recv_bufs[comm_rank],
-                                              coll_param.get_recv_count(comm_rank),
-                                              dtype);
+        entry_factory::create<copy_entry>(scheds[2 * comm_rank % sched_count],
+                                          ccl_buffer(coll_param.get_send_buf_ptr(),
+                                                     coll_param.get_send_count() * dtype_size,
+                                                     ccl_buffer_type::INDIRECT),
+                                          recv_bufs[comm_rank],
+                                          coll_param.get_recv_count(comm_rank),
+                                          dtype);
     }
     else {
         size_t total_recv_bytes =
@@ -225,19 +231,19 @@ ccl::status ccl_coll_build_flat_allgatherv(ccl_master_sched* main_sched,
         if (static_cast<int>(idx) == comm_rank)
             continue;
 
-        entry_factory::make_entry<recv_entry>(scheds[(comm_rank + idx) % sched_count],
-                                              recv_bufs[idx],
-                                              coll_param.get_recv_count(idx),
-                                              dtype,
-                                              idx,
-                                              comm);
-
-        entry_factory::make_entry<send_entry>(scheds[(comm_rank + idx) % sched_count],
-                                              send_seg,
-                                              coll_param.get_recv_count(comm_rank),
-                                              dtype,
-                                              idx,
-                                              comm);
+        entry_factory::create<recv_entry>(scheds[(comm_rank + idx) % sched_count],
+                                          recv_bufs[idx],
+                                          coll_param.get_recv_count(idx),
+                                          dtype,
+                                          idx,
+                                          comm);
+
+        entry_factory::create<send_entry>(scheds[(comm_rank + idx) % sched_count],
+                                          send_seg,
+                                          coll_param.get_recv_count(comm_rank),
+                                          dtype,
+                                          idx,
+                                          comm);
     }
     main_sched->sync_partial_scheds();
 
@@ -279,15 +285,14 @@ ccl::status ccl_coll_build_multi_bcast_allgatherv(ccl_master_sched* main_sched,
         CCL_ASSERT(scheds.size() >= data_partition_count);
 
         for (size_t idx = 0; idx < data_partition_count; idx++) {
-            entry_factory::make_entry<copy_entry>(
-                scheds[idx],
-                ccl_buffer(coll_param.get_send_buf_ptr(),
-                           coll_param.get_send_count() * dtype_size,
-                           copy_offsets[idx],
-                           ccl_buffer_type::INDIRECT),
-                recv_bufs[comm_rank] + copy_offsets[idx],
-                copy_counts[idx],
-                dtype);
+            entry_factory::create<copy_entry>(scheds[idx],
+                                              ccl_buffer(coll_param.get_send_buf_ptr(),
+                                                         coll_param.get_send_count() * dtype_size,
+                                                         copy_offsets[idx],
+                                                         ccl_buffer_type::INDIRECT),
+                                              recv_bufs[comm_rank] + copy_offsets[idx],
+                                              copy_counts[idx],
+                                              dtype);
         }
         main_sched->sync_partial_scheds();
     }
@@ -306,3 +311,194 @@ ccl::status ccl_coll_build_multi_bcast_allgatherv(ccl_master_sched* main_sched,
 
     return ccl::status::success;
 }
+
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+
+/**
+ * Builds the topology-aware allgatherv schedule (compiled only when both
+ * CCL_ENABLE_SYCL and CCL_ENABLE_ZE are defined).
+ *
+ * Single-node case (comm->size() == node_comm->size()): a single
+ * ze_a2a_allgatherv_entry followed by a barrier on comm.
+ *
+ * Multi-node case, built only on the pair_comm rank equal to kernel_1s_lead:
+ *   1. allocate host staging buffers (send and recv side);
+ *   2. copy the pair's contributions from device to the host send buffer;
+ *   3. bcast each r2r peer's block over r2r_comm and copy it into recv_buf;
+ *   4. issue d2d copies of the gathered blocks for the other even_comm ranks;
+ *   5. issue a d2d copy of the complete recv_buf for the other pair_comm rank.
+ * All ranks finish with a barrier on pair_comm.
+ *
+ * @param sched       schedule the entries are appended to
+ * @param send_buf    local contribution; equal to recv_buf for in-place calls
+ * @param send_count  element count of the local contribution
+ * @param recv_buf    buffer receiving the gathered data
+ * @param recv_counts per-rank element counts, indexed by rank in comm
+ * @param dtype       element datatype
+ * @param comm        communicator supplying the pair/even/node/r2r sub-comms
+ * @return ccl::status::success
+ */
+ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* sched,
+                                           ccl_buffer send_buf,
+                                           size_t send_count,
+                                           ccl_buffer recv_buf,
+                                           const size_t* recv_counts,
+                                           const ccl_datatype& dtype,
+                                           ccl_comm* comm) {
+    LOG_DEBUG("build topo allgatherv");
+
+    ccl_comm* pair_comm = comm->get_pair_comm().get();
+    ccl_comm* even_comm = comm->get_even_comm().get();
+    ccl_comm* node_comm = comm->get_node_comm().get();
+    ccl_comm* r2r_comm = comm->get_r2r_comm().get();
+
+    int comm_size = comm->size();
+    int pair_comm_size = pair_comm->size();
+    int node_comm_size = node_comm->size();
+    int r2r_comm_size = r2r_comm->size();
+
+    bool is_inplace = send_buf == recv_buf;
+    bool is_single_node = comm_size == node_comm_size;
+
+    // buffers whose handles are exchanged inside node_comm; the positions in this
+    // list are the buffer indices passed to copy_attr / ze_a2a_allgatherv_entry below
+    const std::vector<ze_handle_exchange_entry::mem_desc_t> in_buffers{
+        { send_buf.get_ptr(), ccl::ze::ipc_mem_type::memory }, // 0
+        { recv_buf.get_ptr(), ccl::ze::ipc_mem_type::memory }, // 1
+    };
+
+    size_t send_buf_idx = 0;
+    size_t recv_buf_idx = 1;
+
+    ccl::add_handle_exchange(sched, node_comm, in_buffers);
+
+    if (is_single_node) {
+        std::vector<ze_event_handle_t> wait_events;
+        entry_factory::create<ze_a2a_allgatherv_entry>(sched,
+                                                       send_buf,
+                                                       send_count,
+                                                       recv_buf,
+                                                       recv_counts,
+                                                       dtype,
+                                                       comm,
+                                                       wait_events,
+                                                       recv_buf_idx);
+        sched->add_barrier();
+        ccl::add_comm_barrier(sched, comm);
+        return ccl::status::success;
+    }
+
+    // number of elements contributed by ranks [from, to); the sum is accumulated in
+    // size_t (not int) so that large counts are neither truncated nor overflowed
+    auto get_distance = [&](int from, int to) -> size_t {
+        CCL_THROW_IF_NOT(from >= 0, "from: ", from, " to: ", to);
+        CCL_THROW_IF_NOT(from <= to, "from: ", from, " to: ", to);
+        CCL_THROW_IF_NOT(to <= comm_size, "from: ", from, " to: ", to);
+        return std::accumulate(recv_counts + from, recv_counts + to, size_t{ 0 });
+    };
+
+    if (pair_comm->rank() == ccl::global_data::env().kernel_1s_lead) {
+        /* 1. allocate send && recv tmp host buffers for host bcast stage */
+        int pair_start = pair_comm->get_global_rank(0, true);
+        size_t host_send_buf_count = get_distance(pair_start, pair_start + pair_comm_size);
+        size_t host_send_buf_bytes = host_send_buf_count * dtype.size();
+
+        size_t host_recv_buf_count{}; // calculate max pair size in recv_count
+        for (int rank = 0; rank < comm_size; rank += pair_comm_size) {
+            size_t count = get_distance(rank, rank + pair_comm_size);
+            host_recv_buf_count = std::max(host_recv_buf_count, count);
+        }
+        size_t host_recv_buf_bytes = host_recv_buf_count * dtype.size();
+
+        LOG_DEBUG("alloc host tmp buffers for bcast: send_buf: ",
+                  host_send_buf_bytes,
+                  ", recv_buf: ",
+                  host_recv_buf_bytes);
+        ccl::alloc_param host_send_buf_alloc(
+            host_send_buf_bytes, ccl::buffer_type::regular, ccl::buffer_place::host);
+        ccl_buffer send_host_buf = sched->alloc_buffer(host_send_buf_alloc);
+
+        ccl::alloc_param host_recv_buf_alloc(
+            host_recv_buf_bytes, ccl::buffer_type::regular, ccl::buffer_place::host);
+        ccl_buffer recv_host_buf = sched->alloc_buffer(host_recv_buf_alloc);
+
+        /* 2. copy to host */
+        size_t host_offset = 0; // element offset of the next block inside send_host_buf
+        for (int peer_rank = 0; peer_rank < pair_comm_size; ++peer_rank) {
+            int global_rank = pair_comm->get_global_rank(peer_rank, true) -
+                              ccl::global_data::env().kernel_1s_lead;
+            size_t copy_count = recv_counts[global_rank];
+            ccl_buffer src{};
+            size_t src_offset = (is_inplace) ? get_distance(0, global_rank) : 0;
+            copy_attr attr(
+                peer_rank, send_buf_idx, copy_direction::d2h, pair_comm, src_offset, host_offset);
+            if (peer_rank == pair_comm->rank()) {
+                // own data is read from the local send_buf; keep the destination offset so
+                // it does not land on top of another peer's block in send_host_buf
+                src = send_buf;
+                attr = copy_attr(copy_direction::d2h, src_offset, host_offset);
+            }
+            LOG_DEBUG("copy to host: from global rank: ", global_rank, ", count: ", copy_count);
+            entry_factory::create<copy_entry>(sched, src, send_host_buf, copy_count, dtype, attr);
+            host_offset += copy_count;
+        }
+        sched->add_barrier();
+
+        /* 3. bcast between nodes */
+        for (int peer_rank = 0; peer_rank < r2r_comm_size; ++peer_rank) {
+            ccl_buffer buf = recv_host_buf;
+            if (peer_rank == r2r_comm->rank()) {
+                buf = send_host_buf;
+            }
+
+            int global_rank = r2r_comm->get_global_rank(peer_rank, true);
+            int r2r_start = global_rank - ccl::global_data::env().kernel_1s_lead;
+            size_t copy_count = get_distance(r2r_start, r2r_start + pair_comm_size);
+            LOG_DEBUG("bcast: peer_rank: ", global_rank, ", count ", copy_count);
+            ccl_coll_build_bcast(sched, buf, copy_count, dtype, peer_rank, r2r_comm);
+            sched->add_barrier();
+
+            size_t dst_offset = get_distance(0, r2r_start);
+            LOG_DEBUG("copy to device: offset: ", dst_offset, ", count: ", copy_count);
+            entry_factory::create<copy_entry>(sched,
+                                              buf,
+                                              recv_buf,
+                                              copy_count,
+                                              dtype,
+                                              copy_attr(copy_direction::h2d, 0, dst_offset));
+            sched->add_barrier();
+        }
+        ccl::add_comm_barrier(sched, even_comm);
+
+        /* 4. allgatherv in even_comm */
+        for (int node_idx = 0; node_idx < r2r_comm_size; ++node_idx) {
+            int from = (comm->rank() - ccl::global_data::env().kernel_1s_lead +
+                        node_idx * node_comm_size) %
+                       comm_size; // TODO: fix lead
+            int to = from + pair_comm_size;
+            size_t count = get_distance(from, to);
+            size_t offset = get_distance(0, from);
+            for (int i = 0; i < even_comm->size() - 1; ++i) {
+                int peer_rank = (even_comm->rank() + i + 1) % even_comm->size();
+                copy_attr attr(
+                    peer_rank, recv_buf_idx, copy_direction::d2d, even_comm, offset, offset);
+                entry_factory::create<copy_entry>(
+                    sched, recv_buf, ccl_buffer(), count, dtype, attr);
+            }
+        }
+        sched->add_barrier();
+        ccl::add_comm_barrier(sched, even_comm);
+
+        /* 5. copy to peer pair rank */
+        size_t copy_count = get_distance(0, comm_size);
+        int peer_rank = (pair_comm->rank() + 1) % pair_comm_size;
+        copy_attr attr(peer_rank, recv_buf_idx, copy_direction::d2d, pair_comm);
+        entry_factory::create<copy_entry>(sched, recv_buf, ccl_buffer(), copy_count, dtype, attr);
+        sched->add_barrier();
+    }
+    ccl::add_comm_barrier(sched, pair_comm);
+
+    return ccl::status::success;
+}
+
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
diff --git a/src/coll/algorithms/allreduce/allreduce.cpp b/src/coll/algorithms/allreduce/allreduce.cpp
index 89c6d36fb..9b02a8695 100644
--- a/src/coll/algorithms/allreduce/allreduce.cpp
+++ b/src/coll/algorithms/allreduce/allreduce.cpp
@@ -21,10 +21,15 @@
  */
 
 #include "coll/algorithms/algorithms.hpp"
-#include "common/comm/host_communicator/host_communicator.hpp"
+#include "coll/algorithms/algorithm_utils.hpp"
+#include "common/comm/comm.hpp"
 #include "sched/entry/coll/coll_entry_helper.hpp"
+#include "sched/entry/copy/copy_helper.hpp"
 #include "sched/entry/factory/chunked_entry_factory.hpp"
 #include "sched/entry/factory/entry_factory.hpp"
+#if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL)
+#include "coll/coll_util.hpp"
+#endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL
 
 ccl::status ccl_coll_build_direct_allreduce(ccl_sched* sched,
                                             ccl_buffer send_buf,
@@ -35,7 +40,7 @@ ccl::status ccl_coll_build_direct_allreduce(ccl_sched* sched,
                                             ccl_comm* comm) {
     LOG_DEBUG("build direct allreduce");
 
-    entry_factory::make_entry<allreduce_entry>(sched, send_buf, recv_buf, count, dtype, op, comm);
+    entry_factory::create<allreduce_entry>(sched, send_buf, recv_buf, count, dtype, op, comm);
     return ccl::status::success;
 }
 
@@ -57,12 +62,12 @@ ccl::status ccl_coll_build_rabenseifner_allreduce(ccl_sched* sched,
 
     comm_size = comm->size();
     rank = comm->rank();
-    ccl_buffer tmp_buf = sched->alloc_buffer(count * dtype_size);
+    ccl_buffer tmp_buf = sched->alloc_buffer({ count * dtype_size, send_buf });
 
     /* copy local data into recv_buf */
 
     if (send_buf != recv_buf) {
-        entry_factory::make_entry<copy_entry>(sched, send_buf, recv_buf, count, dtype);
+        entry_factory::create<copy_entry>(sched, send_buf, recv_buf, count, dtype);
         sched->add_barrier();
     }
 
@@ -82,7 +87,7 @@ ccl::status ccl_coll_build_rabenseifner_allreduce(ccl_sched* sched,
 
     if (rank < 2 * rem) {
         if (rank % 2 == 0) { /* even */
-            entry_factory::make_entry<send_entry>(sched, recv_buf, count, dtype, rank + 1, comm);
+            entry_factory::create<send_entry>(sched, recv_buf, count, dtype, rank + 1, comm);
             sched->add_barrier();
 
             /* temporarily set the rank to -1 so that this
@@ -91,13 +96,13 @@ ccl::status ccl_coll_build_rabenseifner_allreduce(ccl_sched* sched,
             newrank = CCL_INVALID_PROC_IDX;
         }
         else { /* odd */
-            entry_factory::make_entry<recv_entry>(sched, tmp_buf, count, dtype, rank - 1, comm);
+            entry_factory::create<recv_entry>(sched, tmp_buf, count, dtype, rank - 1, comm);
             sched->add_barrier();
 
             /* do the reduction on received data. since the
              * ordering is right, it doesn't matter whether
              * the operation is commutative or not. */
-            entry_factory::make_entry<reduce_local_entry>(
+            entry_factory::create<reduce_local_entry>(
                 sched, tmp_buf, count, recv_buf, nullptr, dtype, op);
             sched->add_barrier();
 
@@ -160,27 +165,24 @@ ccl::status ccl_coll_build_rabenseifner_allreduce(ccl_sched* sched,
             CCL_ASSERT(can_use_recv_reduce);
 
             if (can_use_recv_reduce) {
-                entry_factory::make_entry<recv_reduce_entry>(
-                    sched,
-                    (recv_buf + disps[recv_idx] * dtype_size),
-                    recv_cnt,
-                    nullptr,
-                    dtype,
-                    op,
-                    dst,
-                    ccl_buffer(),
-                    comm);
-                entry_factory::make_entry<send_entry>(
+                entry_factory::create<recv_reduce_entry>(sched,
+                                                         (recv_buf + disps[recv_idx] * dtype_size),
+                                                         recv_cnt,
+                                                         dtype,
+                                                         op,
+                                                         dst,
+                                                         comm);
+                entry_factory::create<send_entry>(
                     sched, (recv_buf + disps[send_idx] * dtype_size), send_cnt, dtype, dst, comm);
                 sched->add_barrier();
             }
 
             else {
                 /* Send data from recv_buf. Recv into tmp_buf */
-                entry_factory::make_entry<recv_entry>(
+                entry_factory::create<recv_entry>(
                     sched, (tmp_buf + disps[recv_idx] * dtype_size), recv_cnt, dtype, dst, comm);
                 /* sendrecv, no barrier here */
-                entry_factory::make_entry<send_entry>(
+                entry_factory::create<send_entry>(
                     sched, (recv_buf + disps[send_idx] * dtype_size), send_cnt, dtype, dst, comm);
                 sched->add_barrier();
 
@@ -189,14 +191,13 @@ ccl::status ccl_coll_build_rabenseifner_allreduce(ccl_sched* sched,
 
                 /* This algorithm is used only for predefined ops
                  * and predefined ops are always commutative. */
-                entry_factory::make_entry<reduce_local_entry>(
-                    sched,
-                    (tmp_buf + disps[recv_idx] * dtype_size),
-                    recv_cnt,
-                    (recv_buf + disps[recv_idx] * dtype_size),
-                    nullptr,
-                    dtype,
-                    op);
+                entry_factory::create<reduce_local_entry>(sched,
+                                                          (tmp_buf + disps[recv_idx] * dtype_size),
+                                                          recv_cnt,
+                                                          (recv_buf + disps[recv_idx] * dtype_size),
+                                                          nullptr,
+                                                          dtype,
+                                                          op);
                 sched->add_barrier();
             }
 
@@ -239,10 +240,10 @@ ccl::status ccl_coll_build_rabenseifner_allreduce(ccl_sched* sched,
                     recv_cnt += cnts[i];
             }
 
-            entry_factory::make_entry<recv_entry>(
+            entry_factory::create<recv_entry>(
                 sched, (recv_buf + disps[recv_idx] * dtype_size), recv_cnt, dtype, dst, comm);
             /* sendrecv, no barrier here */
-            entry_factory::make_entry<send_entry>(
+            entry_factory::create<send_entry>(
                 sched, (recv_buf + disps[send_idx] * dtype_size), send_cnt, dtype, dst, comm);
             sched->add_barrier();
 
@@ -258,10 +259,10 @@ ccl::status ccl_coll_build_rabenseifner_allreduce(ccl_sched* sched,
      * (rank-1), the ranks who didn't participate above. */
     if (rank < 2 * rem) {
         if (rank % 2) { /* odd */
-            entry_factory::make_entry<send_entry>(sched, recv_buf, count, dtype, rank - 1, comm);
+            entry_factory::create<send_entry>(sched, recv_buf, count, dtype, rank - 1, comm);
         }
         else { /* even */
-            entry_factory::make_entry<recv_entry>(sched, recv_buf, count, dtype, rank + 1, comm);
+            entry_factory::create<recv_entry>(sched, recv_buf, count, dtype, rank + 1, comm);
         }
     }
 
@@ -290,11 +291,11 @@ ccl::status ccl_coll_build_recursive_doubling_allreduce(ccl_sched* sched,
 
     size_t dtype_size = dtype.size();
 
-    ccl_buffer tmp_buf = sched->alloc_buffer(count * dtype_size);
+    ccl_buffer tmp_buf = sched->alloc_buffer({ count * dtype_size, send_buf });
 
     /* copy local data into recv_buf */
     if (send_buf != recv_buf) {
-        entry_factory::make_entry<copy_entry>(sched, send_buf, recv_buf, count, dtype);
+        entry_factory::create<copy_entry>(sched, send_buf, recv_buf, count, dtype);
         sched->add_barrier();
     }
 
@@ -313,7 +314,7 @@ ccl::status ccl_coll_build_recursive_doubling_allreduce(ccl_sched* sched,
 
     if (rank < 2 * rem) {
         if (rank % 2 == 0) { /* even */
-            entry_factory::make_entry<send_entry>(sched, recv_buf, count, dtype, rank + 1, comm);
+            entry_factory::create<send_entry>(sched, recv_buf, count, dtype, rank + 1, comm);
             sched->add_barrier();
 
             /* temporarily set the rank to -1 so that this
@@ -322,14 +323,14 @@ ccl::status ccl_coll_build_recursive_doubling_allreduce(ccl_sched* sched,
             newrank = -1;
         }
         else { /* odd */
-            entry_factory::make_entry<recv_entry>(sched, tmp_buf, count, dtype, rank - 1, comm);
+            entry_factory::create<recv_entry>(sched, tmp_buf, count, dtype, rank - 1, comm);
             sched->add_barrier();
 
             /* do the reduction on received data. since the
              * ordering is right, it doesn't matter whether
              * the operation is commutative or not. */
 
-            entry_factory::make_entry<reduce_local_entry>(
+            entry_factory::create<reduce_local_entry>(
                 sched, tmp_buf, count, recv_buf, nullptr, dtype, op);
             sched->add_barrier();
 
@@ -349,14 +350,14 @@ ccl::status ccl_coll_build_recursive_doubling_allreduce(ccl_sched* sched,
 
             /* Send the most current data, which is in recv_buf. Recv
              * into tmp_buf */
-            entry_factory::make_entry<recv_entry>(sched, tmp_buf, count, dtype, dst, comm);
+            entry_factory::create<recv_entry>(sched, tmp_buf, count, dtype, dst, comm);
             /* sendrecv, no barrier here */
-            entry_factory::make_entry<send_entry>(sched, recv_buf, count, dtype, dst, comm);
+            entry_factory::create<send_entry>(sched, recv_buf, count, dtype, dst, comm);
             sched->add_barrier();
 
             /* tmp_buf contains data received in this step.
              * recv_buf contains data accumulated so far */
-            entry_factory::make_entry<reduce_local_entry>(
+            entry_factory::create<reduce_local_entry>(
                 sched, tmp_buf, count, recv_buf, nullptr, dtype, op);
             sched->add_barrier();
 
@@ -369,10 +370,10 @@ ccl::status ccl_coll_build_recursive_doubling_allreduce(ccl_sched* sched,
      * (rank-1), the ranks who didn't participate above. */
     if (rank < 2 * rem) {
         if (rank % 2) { /* odd */
-            entry_factory::make_entry<send_entry>(sched, recv_buf, count, dtype, rank - 1, comm);
+            entry_factory::create<send_entry>(sched, recv_buf, count, dtype, rank - 1, comm);
         }
         else { /* even */
-            entry_factory::make_entry<recv_entry>(sched, recv_buf, count, dtype, rank + 1, comm);
+            entry_factory::create<recv_entry>(sched, recv_buf, count, dtype, rank + 1, comm);
         }
         sched->add_barrier();
     }
@@ -380,82 +381,145 @@ ccl::status ccl_coll_build_recursive_doubling_allreduce(ccl_sched* sched,
     return status;
 }
 
-ccl::status ccl_coll_build_starlike_allreduce(ccl_sched* sched,
-                                              ccl_buffer send_buf,
-                                              ccl_buffer recv_buf,
-                                              size_t count,
-                                              const ccl_datatype& dtype,
-                                              ccl::reduction op,
-                                              ccl_comm* comm) {
-    LOG_DEBUG("build starlike allreduce");
+ccl::status ccl_coll_build_nreduce_allreduce(ccl_sched* sched,
+                                             ccl_buffer send_buf,
+                                             ccl_buffer recv_buf,
+                                             size_t count,
+                                             const ccl_datatype& dtype,
+                                             ccl::reduction op,
+                                             ccl_comm* comm) {
+    LOG_DEBUG("build nreduce allreduce"); // per-segment reduce-scatter followed by allgatherv
 
     ccl::status status = ccl::status::success;
     int comm_size = comm->size();
-    int this_rank = comm->rank();
-    size_t* buffer_counts =
-        static_cast<size_t*>(CCL_MALLOC(comm_size * sizeof(size_t), "buffer_count"));
-    size_t* buffer_offsets =
-        static_cast<size_t*>(CCL_MALLOC(comm_size * sizeof(size_t), "buffer_offsets"));
+    int comm_rank = comm->rank();
+    std::vector<size_t> elem_counts(comm_size); // per-rank element counts within a segment
+    std::vector<size_t> elem_offsets(comm_size); // per-rank byte offsets within a segment
     size_t dtype_size = dtype.size();
+    bool is_inplace = (send_buf == recv_buf);
 
-    // copy local data into recv_buf
-    if (send_buf != recv_buf) {
-        entry_factory::make_entry<copy_entry>(sched, send_buf, recv_buf, count, dtype);
-        sched->add_barrier();
+    if (comm_size == 1) { // nothing to reduce: the result equals the input
+        if (!is_inplace) {
+            entry_factory::create<copy_entry>(sched, send_buf, recv_buf, count, dtype);
+        }
+        return status;
     }
 
-    if (comm_size == 1)
-        return status;
+    int use_buffering = ccl::global_data::env().allreduce_nreduce_buffering;
 
-    // calculate counts and offsets for each rank
-    size_t common_buffer_count = count / comm_size;
-    for (int rank_idx = 0; rank_idx < comm_size; ++rank_idx) {
-        buffer_counts[rank_idx] = common_buffer_count;
-        buffer_offsets[rank_idx] = rank_idx * buffer_counts[rank_idx] * dtype_size;
+    size_t segment_size = 2 * 1024 * 1024; // default, used when the env value is not specified
+    if (ccl::global_data::env().allreduce_nreduce_segment_size != CCL_ENV_SIZET_NOT_SPECIFIED) {
+        segment_size = ccl::global_data::env().allreduce_nreduce_segment_size;
     }
-    buffer_counts[comm_size - 1] += count % comm_size;
-
-    // recv_reduce buffer for current rank
-    size_t this_rank_buf_size = buffer_counts[this_rank] * dtype_size;
-
-    ccl_buffer tmp_buf;
-    if (this_rank_buf_size)
-        tmp_buf = sched->alloc_buffer(this_rank_buf_size * (comm_size - 1));
-
-    size_t tmp_buf_recv_idx = 0;
-    for (int rank_idx = 0; rank_idx < comm_size; ++rank_idx) {
-        if (rank_idx != this_rank) {
-            // send buffer to others
-            entry_factory::make_chunked_send_entry(sched,
-                                                   recv_buf + buffer_offsets[rank_idx],
-                                                   buffer_counts[rank_idx],
-                                                   dtype,
-                                                   rank_idx,
-                                                   comm);
-
-            // recv part of buffer from others and perform reduce
-            entry_factory::make_chunked_recv_reduce_entry(
-                sched,
-                recv_buf + buffer_offsets[this_rank],
-                buffer_counts[this_rank],
-                nullptr,
-                dtype,
-                op,
-                rank_idx,
-                tmp_buf + this_rank_buf_size * tmp_buf_recv_idx,
-                comm);
-            ++tmp_buf_recv_idx;
+    // split the message into segments; the sizes are used below as element counts
+    std::vector<size_t> segment_sizes;
+    ccl_get_segment_sizes(dtype_size, count, segment_size, segment_sizes);
+    // NOTE(review): assumes segment_sizes is non-empty and its last entry is the largest - confirm
+    size_t tmp_buf_size = *segment_sizes.rbegin() * comm_size * dtype_size * 2;
+    ccl_buffer tmp_buf = sched->alloc_buffer({ tmp_buf_size, send_buf });
+
+    size_t seg_offset = 0; // byte offset of the current segment in send_buf/recv_buf
+
+    for (size_t seg_idx = 0; seg_idx < segment_sizes.size(); seg_idx++) {
+        size_t seg_size = segment_sizes[seg_idx];
+        // per-segment views; consecutive segments alternate between the two halves of tmp_buf
+        ccl_buffer seg_send_buf = send_buf + seg_offset;
+        ccl_buffer seg_recv_buf = recv_buf + seg_offset;
+        ccl_buffer seg_tmp_buf = tmp_buf + (seg_idx % 2) * (tmp_buf_size / 2);
+
+        seg_offset += seg_size * dtype_size;
+
+        // split the segment evenly across ranks; the last rank also takes the remainder
+        size_t common_buffer_count = seg_size / comm_size;
+        for (int idx = 0; idx < comm_size; idx++) {
+            elem_counts[idx] = common_buffer_count;
+            elem_offsets[idx] = idx * elem_counts[idx] * dtype_size; // in bytes
         }
-    }
+        elem_counts[comm_size - 1] += seg_size % comm_size;
 
-    sched->add_barrier();
+        size_t elem_count = elem_counts[comm_rank];
+
+        ccl_buffer reduce_buf; // where this rank's chunk is accumulated
+        if (use_buffering) {
+            reduce_buf = seg_tmp_buf + elem_count * comm_rank * dtype_size;
+        }
+        else {
+            reduce_buf = seg_recv_buf + elem_offsets[comm_rank];
+        }
 
-    // allgatherv
-    CCL_CALL(ccl_coll_build_naive_allgatherv(
-        sched, recv_buf, buffer_counts[this_rank], recv_buf, buffer_counts, dtype, comm));
+        if (!is_inplace || use_buffering) { // seed reduce_buf with the local chunk
+            entry_factory::create<copy_entry>(sched,
+                                              seg_send_buf + elem_offsets[comm_rank],
+                                              reduce_buf,
+                                              elem_counts[comm_rank],
+                                              dtype);
+            sched->add_barrier();
+        }
+
+        // reduce-scatter: send each peer its chunk, then reduce the peers' chunks into reduce_buf
+        for (int idx = 1; idx < comm_size; idx++) {
+            int dst = (comm_rank - idx + comm_size) % comm_size;
 
-    CCL_FREE(buffer_counts);
-    CCL_FREE(buffer_offsets);
+            // send part of buffer to other rank
+            entry_factory::create<send_entry>(
+                sched, seg_send_buf + elem_offsets[dst], elem_counts[dst], dtype, dst, comm);
+        }
+
+        for (int idx = 1; idx < comm_size; idx++) {
+            int src = (comm_rank + idx) % comm_size;
+
+            // recv part of buffer from other rank and perform reduce
+            entry_factory::create<recv_reduce_entry>(sched,
+                                                     reduce_buf,
+                                                     elem_count,
+                                                     dtype,
+                                                     op,
+                                                     src,
+                                                     comm,
+                                                     seg_tmp_buf + elem_count * src * dtype_size);
+        }
+
+        sched->add_barrier();
+
+        // allgatherv: spread the reduced chunks so that every rank holds the whole segment
+        if (use_buffering) {
+            copy_attr attr;
+            attr.direction = copy_direction::h2h;
+            attr.use_nontemporal = true;
+
+            // copy own result from tmp to recv buffer
+            entry_factory::create<copy_entry>(
+                sched, reduce_buf, seg_recv_buf + elem_offsets[comm_rank], elem_count, dtype, attr);
+            sched->add_barrier();
+
+            for (int idx = 1; idx < comm_size; idx++) {
+                int dst = (comm_rank + idx) % comm_size;
+                int src = (comm_rank - idx + comm_size) % comm_size;
+
+                // send own result to other ranks
+                entry_factory::create<send_entry>(
+                    sched, reduce_buf, elem_counts[comm_rank], dtype, dst, comm);
+
+                // recv other's rank result into tmp buffer and copy to recv buffer
+                entry_factory::create<recv_copy_entry>(sched,
+                                                       seg_tmp_buf + elem_offsets[src],
+                                                       seg_recv_buf + elem_offsets[src],
+                                                       elem_counts[src] * dtype_size,
+                                                       src,
+                                                       comm,
+                                                       attr);
+            }
+        }
+        else {
+            CCL_CALL(ccl_coll_build_naive_allgatherv(sched,
+                                                     seg_recv_buf,
+                                                     elem_counts[comm_rank],
+                                                     seg_recv_buf,
+                                                     elem_counts.data(),
+                                                     dtype,
+                                                     comm));
+        }
+    }
 
     return status;
 }
@@ -500,163 +564,220 @@ ccl::status ccl_coll_build_ring_allreduce(ccl_sched* sched,
     return status;
 }
 
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
 
-ccl::status ccl_coll_build_gpu_allreduce(ccl_sched* sched,
-                                         ccl_buffer send_buf,
-                                         ccl_buffer recv_buf,
-                                         size_t count,
-                                         const ccl_datatype& dtype,
-                                         ccl::reduction op,
-                                         ccl_comm* comm) {
-    LOG_DEBUG("build gpu allreduce");
+ccl::status ccl_coll_build_topo_allreduce(ccl_sched* sched,
+                                          ccl_buffer send_buf,
+                                          ccl_buffer recv_buf,
+                                          size_t count,
+                                          const ccl_datatype& dtype,
+                                          ccl::reduction op,
+                                          ccl_comm* comm) {
+    LOG_DEBUG("build topo allreduce");
 
-    const std::vector<ze_handle_exchange_entry::mem_desc_t> in_buffers{
+    std::vector<ze_handle_exchange_entry::mem_desc_t> in_buffers{
         { send_buf.get_ptr(), ccl::ze::ipc_mem_type::memory }, // 0
         { recv_buf.get_ptr(), ccl::ze::ipc_mem_type::memory }, // 1
     };
 
-    ccl_coll_entry_param barrier_param{};
-    barrier_param.ctype = ccl_coll_barrier;
-    barrier_param.comm = comm;
-    barrier_param.hint_algo.barrier = ccl_coll_barrier_ring;
+    size_t ipc_event_count{};
+    size_t max_ipc_event_count{ 6 };
+    ze_event_pool_handle_t ipc_event_pool{};
+    if (ccl::global_data::env().enable_ze_barrier) {
+        ipc_event_pool = sched->get_memory().ipc_event_pool_manager.create(max_ipc_event_count);
+        in_buffers.push_back({ static_cast<void*>(ipc_event_pool), ccl::ze::ipc_mem_type::pool });
+    }
+
+    ccl_comm* pair_comm = comm->get_pair_comm().get();
+    ccl_comm* even_comm = comm->get_even_comm().get();
+    ccl_comm* node_comm = comm->get_node_comm().get();
+    ccl_comm* r2r_comm = comm->get_r2r_comm().get();
+
+    int comm_size = comm->size();
+    int even_comm_size = even_comm->size();
+    int node_comm_size = node_comm->size();
 
-    ccl_comm* pair_comm = comm->get_host_comm()->get_pair_comm().get()->get_ccl_comm().get();
-    ccl_comm* even_comm = comm->get_host_comm()->get_even_comm().get()->get_ccl_comm().get();
-    ccl_comm* node_comm = comm->get_host_comm()->get_node_comm().get()->get_ccl_comm().get();
-    ccl_comm* r2r_comm = comm->get_host_comm()->get_r2r_comm().get()->get_ccl_comm().get();
+    bool is_single_node = (comm_size == node_comm_size);
+    bool is_single_card = (comm_size == 2) && is_single_node;
+    bool is_multi_card = (even_comm_size > 1);
 
-    int skip_rank = -1;
-    if (ccl::global_data::env().enable_kernel_1s_ipc_wa) {
+    size_t recv_buf_idx = 1;
+
+    int skip_rank = ccl_comm::invalid_rank;
+    if (ccl::global_data::env().enable_kernel_1s_ipc_wa && is_single_card) {
         skip_rank = ccl::global_data::env().kernel_1s_lead;
     }
 
-    if (sched->coll_attr.to_cache) {
-        sched->set_entry_exec_mode(ccl_sched_entry_exec_once);
-        entry_factory::make_entry<ze_handle_exchange_entry>(
-            sched, node_comm, in_buffers, skip_rank);
-        sched->add_barrier();
-        sched->set_entry_exec_mode(ccl_sched_entry_exec_regular);
+    ccl::add_handle_exchange(
+        sched, node_comm, in_buffers, skip_rank, ipc_event_pool, ipc_event_count++);
 
-        // TODO: no need barrier for the first iteration where ze_handle_exchange_entry exists
-        // TODO: think about the right way
-        coll_entry_helper::add_coll_entry<ccl_coll_barrier>(sched, barrier_param);
-    }
-    else {
-        entry_factory::make_entry<ze_handle_exchange_entry>(
-            sched, node_comm, in_buffers, skip_rank);
-    }
+    CCL_THROW_IF_NOT(comm_size % 2 == 0, "unexpected comm_size ", comm_size);
+    CCL_THROW_IF_NOT(node_comm_size % 2 == 0, "unexpected node_comm_size ", node_comm_size);
 
-    sched->add_barrier();
+    bool use_single_list = sched->enable_ze_single_list();
 
-    if (comm->size() == 4) {
-        LOG_DEBUG("node_comm: id: ",
-                  node_comm->id(),
-                  ", size: ",
-                  node_comm->size(),
-                  ", rank: ",
-                  node_comm->rank());
-
-        if (node_comm->size() == 2) {
-            LOG_DEBUG("r2r_comm: id: ",
-                      r2r_comm->id(),
-                      ", size: ",
-                      r2r_comm->size(),
-                      ", rank: ",
-                      r2r_comm->rank());
-
-            if (node_comm->rank() == ccl::global_data::env().kernel_1s_lead) {
-                entry_factory::make_entry<ze_reduce_entry>(
-                    sched, send_buf, recv_buf, count, dtype, op, node_comm->rank(), node_comm);
-                sched->add_barrier();
-                ccl_buffer host_buf = sched->alloc_buffer(count * dtype.size());
-                entry_factory::make_entry<copy_entry>(
-                    sched, recv_buf, host_buf, count, dtype, copy_attr(copy_direction::d2h));
-                sched->add_barrier();
-                ccl_coll_build_allreduce(sched, host_buf, host_buf, count, dtype, op, r2r_comm);
-                sched->add_barrier();
-                entry_factory::make_entry<copy_entry>(
-                    sched, host_buf, recv_buf, count, dtype, copy_attr(copy_direction::h2d));
+    if (pair_comm->rank() == ccl::global_data::env().kernel_1s_lead) {
+        std::vector<ze_event_handle_t> wait_events;
+        if (is_single_card) {
+            LOG_DEBUG("topo/scale_up/intra: use ze_onesided_allreduce");
+            auto entry = entry_factory::create<ze_onesided_allreduce_entry>(
+                sched, send_buf, recv_buf, count, dtype, op, pair_comm, wait_events);
+            wait_events.push_back(entry->entry_event);
+        }
+        else {
+            LOG_DEBUG("topo/scale_up/intra: use ze_onesided_reduce");
+            auto entry = entry_factory::create<ze_onesided_reduce_entry>(sched,
+                                                                         send_buf,
+                                                                         recv_buf,
+                                                                         count,
+                                                                         dtype,
+                                                                         op,
+                                                                         pair_comm->rank(),
+                                                                         pair_comm,
+                                                                         wait_events);
+            wait_events.push_back(entry->entry_event);
+        }
+        sched->add_barrier();
+
+        size_t main_block_count = count / even_comm_size;
+        size_t block_count = main_block_count;
+        if (even_comm->rank() == even_comm_size - 1) {
+            block_count += count % even_comm_size;
+        }
+
+        if (is_multi_card) {
+            auto barrier_event = ccl::add_comm_barrier(
+                sched, even_comm, wait_events, ipc_event_pool, ipc_event_count++);
+            wait_events.push_back(barrier_event);
+
+            if (is_single_node) {
+                LOG_DEBUG("topo/scale_up/inter: use ze_a2a_allreduce");
+                auto entry = entry_factory::create<ze_a2a_allreduce_entry>(sched,
+                                                                           recv_buf,
+                                                                           recv_buf,
+                                                                           count,
+                                                                           dtype,
+                                                                           op,
+                                                                           even_comm,
+                                                                           wait_events,
+                                                                           recv_buf_idx);
+                wait_events.push_back(entry->entry_event);
                 sched->add_barrier();
-                entry_factory::make_entry<copy_entry>(
-                    sched,
-                    recv_buf,
-                    ccl_buffer(),
-                    count,
-                    dtype,
-                    copy_attr((node_comm->rank() + 1) % node_comm->size(), 1, copy_direction::d2d));
+
+                auto barrier_event = ccl::add_comm_barrier(
+                    sched, even_comm, wait_events, ipc_event_pool, ipc_event_count++);
+                wait_events.push_back(barrier_event);
+            }
+            else {
+                size_t offset_bytes = main_block_count * even_comm->rank() * dtype.size();
+                ccl_buffer partial_recv_buf = recv_buf + offset_bytes;
+                LOG_DEBUG("topo/scale_up/inter: use ze_a2a_reduce_scatter_entry");
+                std::vector<size_t> block_counts(even_comm->size(), main_block_count);
+                block_counts.back() = block_count;
+                auto entry = entry_factory::create<ze_a2a_reduce_scatter_entry>(sched,
+                                                                                recv_buf,
+                                                                                partial_recv_buf,
+                                                                                block_counts.data(),
+                                                                                dtype,
+                                                                                op,
+                                                                                even_comm,
+                                                                                wait_events,
+                                                                                recv_buf_idx);
+                wait_events.push_back(entry->entry_event);
                 sched->add_barrier();
+
+                auto barrier_event = ccl::add_comm_barrier(
+                    sched, even_comm, wait_events, ipc_event_pool, ipc_event_count++);
+                wait_events.push_back(barrier_event);
             }
-            barrier_param.comm = comm;
-            coll_entry_helper::add_coll_entry<ccl_coll_barrier>(sched, barrier_param);
         }
-        else if (node_comm->size() == 4) {
-            LOG_DEBUG("pair_comm: id: ",
-                      pair_comm->id(),
-                      ", size: ",
-                      pair_comm->size(),
-                      ", rank: ",
-                      pair_comm->rank());
-
-            LOG_DEBUG("even_comm: id: ",
-                      even_comm->id(),
-                      ", size: ",
-                      even_comm->size(),
-                      ", rank: ",
-                      even_comm->rank());
-
-            if (pair_comm->rank() == ccl::global_data::env().kernel_1s_lead) {
-                entry_factory::make_entry<ze_reduce_entry>(
-                    sched, send_buf, recv_buf, count, dtype, op, pair_comm->rank(), pair_comm);
-                sched->add_barrier();
 
-                barrier_param.comm = even_comm;
-                coll_entry_helper::add_coll_entry<ccl_coll_barrier>(sched, barrier_param);
-                sched->add_barrier();
+        if (!is_single_node && block_count) {
+            LOG_DEBUG("topo/scale_out: use host_allreduce");
+            ccl::alloc_param alloc_param(
+                block_count * dtype.size(), ccl::buffer_type::regular, ccl::buffer_place::host);
+            ccl_buffer host_buf = sched->alloc_buffer(alloc_param);
+            size_t offset_bytes = main_block_count * even_comm->rank() * dtype.size();
+            ccl_buffer partial_recv_buf = recv_buf + offset_bytes;
+            auto entry = entry_factory::create<ze_copy_entry>(sched,
+                                                              partial_recv_buf,
+                                                              host_buf,
+                                                              block_count,
+                                                              dtype,
+                                                              copy_attr(copy_direction::d2h),
+                                                              wait_events);
+            wait_events.push_back(entry->entry_event);
+            sched->add_barrier();
 
-                if (even_comm->rank() == ccl::global_data::env().kernel_1s_lead) {
-                    entry_factory::make_entry<ze_allreduce_entry>(
-                        sched, recv_buf, recv_buf, count, dtype, op, even_comm);
-                    sched->add_barrier();
-                }
+            if (use_single_list) {
+                ccl::add_wait_events(sched, wait_events);
             }
 
-            barrier_param.comm = comm;
-            coll_entry_helper::add_coll_entry<ccl_coll_barrier>(sched, barrier_param);
+            ccl_coll_build_allreduce(sched, host_buf, host_buf, block_count, dtype, op, r2r_comm);
             sched->add_barrier();
 
-            if (pair_comm->rank() != ccl::global_data::env().kernel_1s_lead) {
-                entry_factory::make_entry<copy_entry>(
-                    sched,
-                    ccl_buffer(),
-                    recv_buf,
-                    count,
-                    dtype,
-                    copy_attr((pair_comm->rank() + 1) % pair_comm->size(),
-                              1,
-                              copy_direction::d2d,
-                              pair_comm));
-                sched->add_barrier();
+            if (use_single_list) {
+                auto signal_event = ccl::add_signal_event(sched);
+                wait_events.push_back(signal_event);
             }
+
+            entry = entry_factory::create<ze_copy_entry>(sched,
+                                                         host_buf,
+                                                         partial_recv_buf,
+                                                         block_count,
+                                                         dtype,
+                                                         copy_attr(copy_direction::h2d),
+                                                         wait_events);
+            wait_events.push_back(entry->entry_event);
+            sched->add_barrier();
         }
-        else {
-            CCL_THROW("unexpected node_comm size: ", node_comm->size());
+
+        if (is_multi_card && !is_single_node) {
+            LOG_DEBUG("topo/scale_up/inter: use ze_a2a_allgatherv");
+            std::vector<size_t> recv_counts(even_comm_size, main_block_count);
+            recv_counts.at(even_comm->rank()) = block_count;
+            auto entry = entry_factory::create<ze_a2a_allgatherv_entry>(sched,
+                                                                        recv_buf,
+                                                                        block_count,
+                                                                        recv_buf,
+                                                                        recv_counts.data(),
+                                                                        dtype,
+                                                                        even_comm,
+                                                                        wait_events,
+                                                                        recv_buf_idx);
+            wait_events.push_back(entry->entry_event);
+            sched->add_barrier();
+
+            auto barrier_event = ccl::add_comm_barrier(
+                sched, even_comm, wait_events, ipc_event_pool, ipc_event_count++);
+            wait_events.push_back(barrier_event);
         }
-    }
-    else if (comm->size() == 2) {
-        if (comm->rank() == ccl::global_data::env().kernel_1s_lead) {
-            entry_factory::make_entry<ze_allreduce_entry>(
-                sched, send_buf, recv_buf, count, dtype, op, comm);
+
+        if (!is_single_card) {
+            LOG_DEBUG("topo/scale_up/intra: use ze_onesided_bcast");
+            int peer_rank = (pair_comm->rank() + 1) % pair_comm->size();
+            auto entry = entry_factory::create<ze_copy_entry>(
+                sched,
+                recv_buf,
+                ccl_buffer(),
+                count,
+                dtype,
+                copy_attr(peer_rank, recv_buf_idx, copy_direction::d2d, pair_comm),
+                wait_events);
+            wait_events.push_back(entry->entry_event);
             sched->add_barrier();
         }
-        barrier_param.comm = comm;
-        coll_entry_helper::add_coll_entry<ccl_coll_barrier>(sched, barrier_param);
-    }
-    else {
-        CCL_THROW("unexpected comm size: ", comm->size());
     }
 
+    ccl::add_comm_barrier(sched, pair_comm, ipc_event_pool, ipc_event_count++);
+
+    CCL_THROW_IF_NOT(ipc_event_count <= max_ipc_event_count,
+                     "unexpected ipc_event_count ",
+                     ipc_event_count,
+                     ", expected max ",
+                     max_ipc_event_count);
+
     return ccl::status::success;
 }
 
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
diff --git a/src/coll/algorithms/allreduce/allreduce_2d.cpp b/src/coll/algorithms/allreduce/allreduce_2d.cpp
index 54e5b2719..2457bcee9 100644
--- a/src/coll/algorithms/allreduce/allreduce_2d.cpp
+++ b/src/coll/algorithms/allreduce/allreduce_2d.cpp
@@ -23,35 +23,30 @@ ccl_allreduce_2d_builder::ccl_allreduce_2d_builder(size_t base_size,
                                                    ccl_comm* comm) {
     parent_comm = comm;
 
-    size_t vector_size = comm->size();
-    std::vector<int> first_dim_colors(vector_size), second_dim_colors(vector_size);
+    int first_dim_color, second_dim_color;
 
-    for (size_t idx = 0; idx < vector_size; idx++) {
-        if (switch_dims) {
-            first_dim_colors[idx] = idx / base_size;
-            second_dim_colors[idx] = idx % base_size;
-        }
-        else {
-            first_dim_colors[idx] = idx % base_size;
-            second_dim_colors[idx] = idx / base_size;
-        }
+    if (switch_dims) {
+        first_dim_color = comm->rank() / base_size;
+        second_dim_color = comm->rank() % base_size;
+    }
+    else {
+        first_dim_color = comm->rank() % base_size;
+        second_dim_color = comm->rank() / base_size;
     }
 
-    first_dim_comm = std::shared_ptr<ccl_comm>(ccl_comm::create_with_colors(
-        first_dim_colors, ccl::global_data::get().comm_ids.get(), comm, true /*share_resources*/));
+    first_dim_comm = std::shared_ptr<ccl_comm>(comm->create_with_color(
+        first_dim_color, ccl::global_data::get().comm_ids.get(), true /*share_resources*/));
 
-    second_dim_comm = std::shared_ptr<ccl_comm>(ccl_comm::create_with_colors(
-        second_dim_colors, ccl::global_data::get().comm_ids.get(), comm, true /*share_resources*/));
+    second_dim_comm = std::shared_ptr<ccl_comm>(comm->create_with_color(
+        second_dim_color, ccl::global_data::get().comm_ids.get(), true /*share_resources*/));
 
     if (comm->rank() == 0) {
         std::string first_dim_ranks, second_dim_ranks;
         for (int idx = 0; idx < first_dim_comm->size(); idx++) {
-            first_dim_ranks +=
-                ((idx) ? " " : "") + std::to_string(first_dim_comm->get_global_rank(idx));
+            first_dim_ranks += ((idx) ? " " : "") + std::to_string(idx);
         }
         for (int idx = 0; idx < second_dim_comm->size(); idx++) {
-            second_dim_ranks +=
-                ((idx) ? " " : "") + std::to_string(second_dim_comm->get_global_rank(idx));
+            second_dim_ranks += ((idx) ? " " : "") + std::to_string(idx);
         }
 
         std::stringstream ss;
@@ -79,8 +74,8 @@ static void ccl_allreduce_2d_add_allreduce_allgather(ccl_sched* sched,
                                                      ccl_comm* comm,
                                                      size_t chunk_idx,
                                                      size_t chunk_count) {
-    ccl_comm* first_dim_comm = comm->allreduce_2d_builder->get_first_dim_comm();
-    ccl_comm* second_dim_comm = comm->allreduce_2d_builder->get_second_dim_comm();
+    ccl_comm* first_dim_comm = comm->get_allreduce_2d_builder()->get_first_dim_comm();
+    ccl_comm* second_dim_comm = comm->get_allreduce_2d_builder()->get_second_dim_comm();
 
     size_t dtype_size = dtype.size();
     size_t main_chunk_size = count / chunk_count;
@@ -96,7 +91,7 @@ static void ccl_allreduce_2d_add_allreduce_allgather(ccl_sched* sched,
     if (ar_count) {
         /* TODO: add second level selection to distinguish high and low level algorithms */
         ccl_buffer ar_buf = rbuf + first_dim_comm->rank() * main_block_count * dtype_size;
-        ccl_coll_build_starlike_allreduce(
+        ccl_coll_build_nreduce_allreduce(
             sched, ar_buf, ar_buf, ar_count, dtype, op, second_dim_comm);
         sched->add_barrier();
     }
@@ -116,7 +111,7 @@ static void ccl_allreduce_2d_add_reduce_scatter_allreduce_allgather(ccl_sched* s
                                                                     ccl_comm* comm,
                                                                     size_t chunk_idx,
                                                                     size_t chunk_count) {
-    ccl_comm* first_dim_comm = comm->allreduce_2d_builder->get_first_dim_comm();
+    ccl_comm* first_dim_comm = comm->get_allreduce_2d_builder()->get_first_dim_comm();
 
     size_t dtype_size = dtype.size();
     size_t main_chunk_size = count / chunk_count;
@@ -133,7 +128,7 @@ static void ccl_allreduce_2d_add_reduce_scatter_allreduce_allgather(ccl_sched* s
             sched, send_buf, recv_buf, count, dtype, op, comm, chunk_idx, chunk_count);
     }
     else {
-        entry_factory::make_entry<subsched_entry>(
+        entry_factory::create<subsched_entry>(
             sched,
             chunk_idx,
             [send_buf, recv_buf, count, &dtype, op, comm, chunk_idx, chunk_count](ccl_sched* s) {
@@ -142,7 +137,7 @@ static void ccl_allreduce_2d_add_reduce_scatter_allreduce_allgather(ccl_sched* s
             },
             "AR_AG");
 
-        entry_factory::make_entry<subsched_entry>(
+        entry_factory::create<subsched_entry>(
             sched,
             chunk_idx + 1,
             [send_buf, recv_buf, count, &dtype, op, comm, chunk_idx, chunk_count](ccl_sched* s) {
diff --git a/src/coll/algorithms/allreduce/allreduce_rma.cpp b/src/coll/algorithms/allreduce/allreduce_rma.cpp
index a91357f6d..da418f555 100644
--- a/src/coll/algorithms/allreduce/allreduce_rma.cpp
+++ b/src/coll/algorithms/allreduce/allreduce_rma.cpp
@@ -146,7 +146,7 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
 
     if (comm_size == 1) {
         if (!inplace) {
-            entry_factory::make_entry<copy_entry>(sched, send_buf, recv_buf, count, dtype);
+            entry_factory::create<copy_entry>(sched, send_buf, recv_buf, count, dtype);
             sched->add_barrier();
         }
         return ccl::status::success;
@@ -160,25 +160,25 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
 
     sched->set_entry_exec_mode(ccl_sched_entry_exec_once);
 
-    entry_factory::make_entry<register_entry>(
+    entry_factory::create<register_entry>(
         sched,
         2 * comm_size * sizeof(uint64_t),
         ccl_buffer(ar_handler->sync_flags, 2 * comm_size * sizeof(uint64_t)),
         &ar_handler->sync_flags_mr,
         comm);
-    entry_factory::make_entry<register_entry>(
+    entry_factory::create<register_entry>(
         sched,
         sizeof(uint64_t),
         ccl_buffer((void*)&ar_handler->sync_flag, sizeof(uint64_t)),
         &ar_handler->sync_flag_mr,
         comm);
-    entry_factory::make_entry<register_entry>(
+    entry_factory::create<register_entry>(
         sched,
         sizeof(uint64_t),
         ccl_buffer((void*)&ar_handler->dst_ready_flag, sizeof(uint64_t)),
         &ar_handler->dst_ready_flag_mr,
         comm);
-    entry_factory::make_entry<register_entry>(
+    entry_factory::create<register_entry>(
         sched,
         sizeof(uint64_t),
         ccl_buffer(&ar_handler->dst_ready_value, sizeof(uint64_t)),
@@ -187,13 +187,13 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
 
     if (inplace) {
         tmp_buf = sched->alloc_buffer(count * dtype_size);
-        entry_factory::make_entry<register_entry>(
+        entry_factory::create<register_entry>(
             sched, count * dtype_size, tmp_buf, &ar_handler->tmp_buf_mr, comm);
     }
     else
-        entry_factory::make_entry<register_entry>(
+        entry_factory::create<register_entry>(
             sched, count * dtype_size, send_buf, &ar_handler->send_buf_mr, comm);
-    entry_factory::make_entry<register_entry>(
+    entry_factory::create<register_entry>(
         sched, count * dtype_size, recv_buf, &ar_handler->recv_buf_mr, comm);
 
     sched->set_entry_exec_mode(ccl_sched_entry_exec_regular);
@@ -205,24 +205,23 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
     ar_handler->src_peer = (comm_size + rank - 1) % comm_size;
     ar_handler->dst_peer = (comm_size + rank + 1) % comm_size;
 
-    entry_factory::make_entry<function_entry>(
-        sched, rma_ring_allreduce_reset_sync_flag, ar_handler);
+    entry_factory::create<function_entry>(sched, rma_ring_allreduce_reset_sync_flag, ar_handler);
     sched->add_barrier();
 
     sched->set_entry_exec_mode(ccl_sched_entry_exec_once);
 
     if (inplace) {
-        send_entry* e = entry_factory::make_entry<send_entry>(
-            sched,
-            ccl_buffer(&ar_handler->tmp_buf_mr, sizeof(atl_mr_t)),
-            sizeof(atl_mr_t),
-            ccl_datatype_int8,
-            ar_handler->src_peer,
-            comm);
+        send_entry* e =
+            entry_factory::create<send_entry>(sched,
+                                              ccl_buffer(&ar_handler->tmp_buf_mr, sizeof(atl_mr_t)),
+                                              sizeof(atl_mr_t),
+                                              ccl_datatype_int8,
+                                              ar_handler->src_peer,
+                                              comm);
         e->set_field_fn<ccl_sched_entry_field_buf>(rma_ring_allreduce_get_tmp_buf_mr, ar_handler);
     }
     else {
-        send_entry* e = entry_factory::make_entry<send_entry>(
+        send_entry* e = entry_factory::create<send_entry>(
             sched,
             ccl_buffer(&ar_handler->recv_buf_mr, sizeof(atl_mr_t)),
             sizeof(atl_mr_t),
@@ -231,39 +230,37 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
             comm);
         e->set_field_fn<ccl_sched_entry_field_buf>(rma_ring_allreduce_get_recv_buf_mr, ar_handler);
     }
-    send_entry* e = entry_factory::make_entry<send_entry>(
-        sched,
-        ccl_buffer(&ar_handler->recv_buf_mr, sizeof(atl_mr_t)),
-        sizeof(atl_mr_t),
-        ccl_datatype_int8,
-        ar_handler->src_peer,
-        comm);
+    send_entry* e =
+        entry_factory::create<send_entry>(sched,
+                                          ccl_buffer(&ar_handler->recv_buf_mr, sizeof(atl_mr_t)),
+                                          sizeof(atl_mr_t),
+                                          ccl_datatype_int8,
+                                          ar_handler->src_peer,
+                                          comm);
     e->set_field_fn<ccl_sched_entry_field_buf>(rma_ring_allreduce_get_recv_buf_mr, ar_handler);
 
-    e = entry_factory::make_entry<send_entry>(
-        sched,
-        ccl_buffer(&ar_handler->sync_flag_mr, sizeof(atl_mr_t)),
-        sizeof(atl_mr_t),
-        ccl_datatype_int8,
-        ar_handler->src_peer,
-        comm);
+    e = entry_factory::create<send_entry>(sched,
+                                          ccl_buffer(&ar_handler->sync_flag_mr, sizeof(atl_mr_t)),
+                                          sizeof(atl_mr_t),
+                                          ccl_datatype_int8,
+                                          ar_handler->src_peer,
+                                          comm);
     e->set_field_fn<ccl_sched_entry_field_buf>(rma_ring_allreduce_get_sync_flag_mr, ar_handler);
 
-    entry_factory::make_entry<recv_entry>(
+    entry_factory::create<recv_entry>(
         sched,
         ccl_buffer(&ar_handler->remote_rs_dst_buf_mr, sizeof(atl_mr_t)),
         sizeof(atl_mr_t),
         ccl_datatype_int8,
         ar_handler->dst_peer,
         comm);
-    entry_factory::make_entry<recv_entry>(
-        sched,
-        ccl_buffer(&ar_handler->remote_recv_buf_mr, sizeof(atl_mr_t)),
-        sizeof(atl_mr_t),
-        ccl_datatype_int8,
-        ar_handler->dst_peer,
-        comm);
-    entry_factory::make_entry<recv_entry>(
+    entry_factory::create<recv_entry>(sched,
+                                      ccl_buffer(&ar_handler->remote_recv_buf_mr, sizeof(atl_mr_t)),
+                                      sizeof(atl_mr_t),
+                                      ccl_datatype_int8,
+                                      ar_handler->dst_peer,
+                                      comm);
+    entry_factory::create<recv_entry>(
         sched,
         ccl_buffer(&ar_handler->remote_sync_flag_mr, sizeof(atl_mr_t)),
         sizeof(atl_mr_t),
@@ -272,7 +269,7 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
         comm);
 
     if (ar_handler->wait_dst) {
-        send_entry* e = entry_factory::make_entry<send_entry>(
+        send_entry* e = entry_factory::create<send_entry>(
             sched,
             ccl_buffer(ar_handler->dst_ready_flag_mr, sizeof(atl_mr_t)),
             sizeof(atl_mr_t),
@@ -281,7 +278,7 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
             comm);
         e->set_field_fn<ccl_sched_entry_field_buf>(rma_ring_allreduce_get_dst_ready_flag_mr,
                                                    ar_handler);
-        entry_factory::make_entry<recv_entry>(
+        entry_factory::create<recv_entry>(
             sched,
             ccl_buffer(&ar_handler->remote_dst_ready_flag_mr, sizeof(atl_mr_t)),
             sizeof(atl_mr_t),
@@ -296,7 +293,7 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
     if (ar_handler->wait_dst) {
         /* let src side to know that this rank (i.e. dst for src rank) is ready for write ops */
         ar_handler->dst_ready_value = 1;
-        write_entry* entry = entry_factory::make_entry<write_entry>(
+        write_entry* entry = entry_factory::create<write_entry>(
             sched,
             ccl_buffer(&ar_handler->dst_ready_value, sizeof(uint64_t)),
             (atl_mr_t*)nullptr, /* src_mr */
@@ -312,11 +309,11 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
             rma_ring_allreduce_get_remote_dst_ready_flag_mr, ar_handler);
 
         /* wait when dst side will be ready for write ops */
-        entry_factory::make_entry<wait_value_entry>(
+        entry_factory::create<wait_value_entry>(
             sched, &(ar_handler->dst_ready_flag), 1, ccl_condition_equal);
 
         /* reset dst_ready_flag for next allreduce call */
-        entry_factory::make_entry<function_entry>(
+        entry_factory::create<function_entry>(
             sched, rma_ring_allreduce_reset_dst_ready_flag, ar_handler);
     }
 
@@ -337,15 +334,15 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
         else
             src_buf = (idx == 0) ? send_buf : recv_buf;
 
-        write_entry* entry = entry_factory::make_entry<write_entry>(sched,
-                                                                    src_buf + buf_offset,
-                                                                    (atl_mr_t*)nullptr, /* src_mr */
-                                                                    block_count,
-                                                                    dtype,
-                                                                    ar_handler->dst_peer,
-                                                                    (atl_mr_t*)nullptr, /* dst_mr */
-                                                                    buf_offset,
-                                                                    comm);
+        write_entry* entry = entry_factory::create<write_entry>(sched,
+                                                                src_buf + buf_offset,
+                                                                (atl_mr_t*)nullptr, /* src_mr */
+                                                                block_count,
+                                                                dtype,
+                                                                ar_handler->dst_peer,
+                                                                (atl_mr_t*)nullptr, /* dst_mr */
+                                                                buf_offset,
+                                                                comm);
         entry->set_field_fn<ccl_sched_entry_field_src_mr>(
             (inplace) ? rma_ring_allreduce_get_recv_buf_mr
                       : ((idx == 0) ? rma_ring_allreduce_get_send_buf_mr
@@ -354,10 +351,10 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
         entry->set_field_fn<ccl_sched_entry_field_dst_mr>(
             rma_ring_allreduce_get_remote_rs_dst_buf_mr, ar_handler);
 
-        if (block_count * dtype.size() > atl_wrapper::attr.out.max_order_waw_size)
+        if (block_count * dtype.size() > atl_base_comm::attr.out.max_order_waw_size)
             sched->add_barrier();
 
-        entry = entry_factory::make_entry<write_entry>(
+        entry = entry_factory::create<write_entry>(
             sched,
             ccl_buffer(&ar_handler->sync_flags[idx], sizeof(uint64_t)),
             (atl_mr_t*)nullptr, /* src_mr */
@@ -378,18 +375,18 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
             block_count += count % comm_size;
         buf_offset = main_block_count * dtype_size * block_idx;
 
-        entry_factory::make_entry<wait_value_entry>(
+        entry_factory::create<wait_value_entry>(
             sched, &(ar_handler->sync_flag), (idx + 1), ccl_condition_greater_or_equal);
 
         ccl_buffer reduce_in_buf = (inplace) ? tmp_buf : send_buf;
         ccl_buffer reduce_inout_buf = recv_buf;
-        entry_factory::make_entry<reduce_local_entry>(sched,
-                                                      reduce_in_buf + buf_offset,
-                                                      block_count,
-                                                      reduce_inout_buf + buf_offset,
-                                                      nullptr,
-                                                      dtype,
-                                                      op);
+        entry_factory::create<reduce_local_entry>(sched,
+                                                  reduce_in_buf + buf_offset,
+                                                  block_count,
+                                                  reduce_inout_buf + buf_offset,
+                                                  nullptr,
+                                                  dtype,
+                                                  op);
     }
 
     /* allgather */
@@ -401,24 +398,24 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
         buf_offset = main_block_count * dtype_size * block_idx;
 
         ccl_buffer src_buf = recv_buf;
-        write_entry* entry = entry_factory::make_entry<write_entry>(sched,
-                                                                    src_buf + buf_offset,
-                                                                    (atl_mr_t*)nullptr, /* src_mr */
-                                                                    block_count,
-                                                                    dtype,
-                                                                    ar_handler->dst_peer,
-                                                                    (atl_mr_t*)nullptr, /* dst_mr */
-                                                                    buf_offset,
-                                                                    comm);
+        write_entry* entry = entry_factory::create<write_entry>(sched,
+                                                                src_buf + buf_offset,
+                                                                (atl_mr_t*)nullptr, /* src_mr */
+                                                                block_count,
+                                                                dtype,
+                                                                ar_handler->dst_peer,
+                                                                (atl_mr_t*)nullptr, /* dst_mr */
+                                                                buf_offset,
+                                                                comm);
         entry->set_field_fn<ccl_sched_entry_field_src_mr>(rma_ring_allreduce_get_recv_buf_mr,
                                                           ar_handler);
         entry->set_field_fn<ccl_sched_entry_field_dst_mr>(rma_ring_allreduce_get_remote_recv_buf_mr,
                                                           ar_handler);
 
-        if (block_count * dtype.size() > atl_wrapper::attr.out.max_order_waw_size)
+        if (block_count * dtype.size() > atl_base_comm::attr.out.max_order_waw_size)
             sched->add_barrier();
 
-        entry = entry_factory::make_entry<write_entry>(
+        entry = entry_factory::create<write_entry>(
             sched,
             ccl_buffer(&ar_handler->sync_flags[flag_idx_offset + idx], sizeof(uint64_t)),
             (atl_mr_t*)nullptr, /* src_mr */
@@ -435,10 +432,10 @@ ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
 
         block_idx = (block_idx + comm_size - 1) % comm_size;
 
-        entry_factory::make_entry<wait_value_entry>(sched,
-                                                    &(ar_handler->sync_flag),
-                                                    (flag_idx_offset + idx + 1),
-                                                    ccl_condition_greater_or_equal);
+        entry_factory::create<wait_value_entry>(sched,
+                                                &(ar_handler->sync_flag),
+                                                (flag_idx_offset + idx + 1),
+                                                ccl_condition_greater_or_equal);
     }
 
     return status;
diff --git a/src/coll/algorithms/allreduce/allreduce_rma.hpp b/src/coll/algorithms/allreduce/allreduce_rma.hpp
index 76e2075c8..80613a4c9 100644
--- a/src/coll/algorithms/allreduce/allreduce_rma.hpp
+++ b/src/coll/algorithms/allreduce/allreduce_rma.hpp
@@ -15,8 +15,6 @@
 */
 #pragma once
 
-#include "atl/atl.h"
-
 typedef struct {
     int wait_dst;
 
diff --git a/src/coll/algorithms/alltoall.cpp b/src/coll/algorithms/alltoall.cpp
index 2bd43f5cf..acfa2e4b7 100644
--- a/src/coll/algorithms/alltoall.cpp
+++ b/src/coll/algorithms/alltoall.cpp
@@ -24,6 +24,6 @@ ccl::status ccl_coll_build_direct_alltoall(ccl_sched* sched,
                                            ccl_comm* comm) {
     LOG_DEBUG("build direct alltoall");
 
-    entry_factory::make_entry<alltoall_entry>(sched, send_buf, recv_buf, count, dtype, comm);
+    entry_factory::create<alltoall_entry>(sched, send_buf, recv_buf, count, dtype, comm);
     return ccl::status::success;
 }
diff --git a/src/coll/algorithms/alltoallv.cpp b/src/coll/algorithms/alltoallv.cpp
index caf063b4c..32eda791c 100644
--- a/src/coll/algorithms/alltoallv.cpp
+++ b/src/coll/algorithms/alltoallv.cpp
@@ -35,7 +35,7 @@ ccl::status ccl_coll_build_direct_alltoallv(ccl_sched* sched,
                                             ccl_comm* comm) {
     LOG_DEBUG("build direct alltoallv");
 
-    entry_factory::make_entry<alltoallv_entry>(
+    entry_factory::create<alltoallv_entry>(
         sched, send_buf, send_counts, recv_buf, recv_counts, dtype, comm);
     return ccl::status::success;
 }
@@ -157,17 +157,17 @@ ccl::status ccl_coll_build_naive_alltoallv(ccl_master_sched* main_sched,
 
     if (!inplace && send_counts[comm_rank] && recv_counts[comm_rank]) {
         size_t sched_idx = (2 * comm_rank) % sched_count;
-        entry_factory::make_entry<copy_entry>(scheds[sched_idx],
-                                              ccl_buffer(coll_param.get_send_buf_ptr(),
-                                                         total_send_bytes,
-                                                         send_offsets[comm_rank],
-                                                         ccl_buffer_type::INDIRECT),
-                                              ccl_buffer(coll_param.get_recv_buf_ptr(),
-                                                         total_recv_bytes,
-                                                         recv_offsets[comm_rank],
-                                                         ccl_buffer_type::INDIRECT),
-                                              send_counts[comm_rank],
-                                              dtype);
+        entry_factory::create<copy_entry>(scheds[sched_idx],
+                                          ccl_buffer(coll_param.get_send_buf_ptr(),
+                                                     total_send_bytes,
+                                                     send_offsets[comm_rank],
+                                                     ccl_buffer_type::INDIRECT),
+                                          ccl_buffer(coll_param.get_recv_buf_ptr(),
+                                                     total_recv_bytes,
+                                                     recv_offsets[comm_rank],
+                                                     ccl_buffer_type::INDIRECT),
+                                          send_counts[comm_rank],
+                                          dtype);
     }
 
     for (int idx = 0; idx < comm_size; idx++) {
@@ -179,7 +179,8 @@ ccl::status ccl_coll_build_naive_alltoallv(ccl_master_sched* main_sched,
         ccl_buffer recv_buf;
 
         if (inplace)
-            recv_buf = scheds[sched_idx]->alloc_buffer(recv_counts[idx] * dtype_size);
+            recv_buf = scheds[sched_idx]->alloc_buffer(
+                { recv_counts[idx] * dtype_size, coll_param.get_recv_buf() });
         else
             recv_buf = ccl_buffer(coll_param.get_recv_buf_ptr(),
                                   total_recv_bytes,
@@ -202,14 +203,14 @@ ccl::status ccl_coll_build_naive_alltoallv(ccl_master_sched* main_sched,
 
         if (inplace) {
             scheds[sched_idx]->add_barrier();
-            entry_factory::make_entry<copy_entry>(scheds[sched_idx],
-                                                  recv_buf,
-                                                  ccl_buffer(coll_param.get_recv_buf_ptr(),
-                                                             total_recv_bytes,
-                                                             recv_offsets[idx],
-                                                             ccl_buffer_type::INDIRECT),
-                                                  recv_counts[idx],
-                                                  dtype);
+            entry_factory::create<copy_entry>(scheds[sched_idx],
+                                              recv_buf,
+                                              ccl_buffer(coll_param.get_recv_buf_ptr(),
+                                                         total_recv_bytes,
+                                                         recv_offsets[idx],
+                                                         ccl_buffer_type::INDIRECT),
+                                              recv_counts[idx],
+                                              dtype);
             scheds[sched_idx]->add_barrier();
         }
     }
@@ -252,17 +253,17 @@ ccl::status ccl_coll_build_scatter_alltoallv(ccl_master_sched* main_sched,
 
     if (!inplace && send_counts[comm_rank] && recv_counts[comm_rank]) {
         size_t sched_idx = (2 * comm_rank) % sched_count;
-        entry_factory::make_entry<copy_entry>(scheds[sched_idx],
-                                              ccl_buffer(coll_param.get_send_buf_ptr(),
-                                                         total_send_bytes,
-                                                         send_offsets[comm_rank],
-                                                         ccl_buffer_type::INDIRECT),
-                                              ccl_buffer(coll_param.get_recv_buf_ptr(),
-                                                         total_recv_bytes,
-                                                         recv_offsets[comm_rank],
-                                                         ccl_buffer_type::INDIRECT),
-                                              send_counts[comm_rank],
-                                              dtype);
+        entry_factory::create<copy_entry>(scheds[sched_idx],
+                                          ccl_buffer(coll_param.get_send_buf_ptr(),
+                                                     total_send_bytes,
+                                                     send_offsets[comm_rank],
+                                                     ccl_buffer_type::INDIRECT),
+                                          ccl_buffer(coll_param.get_recv_buf_ptr(),
+                                                     total_recv_bytes,
+                                                     recv_offsets[comm_rank],
+                                                     ccl_buffer_type::INDIRECT),
+                                          send_counts[comm_rank],
+                                          dtype);
     }
 
     for (int idx = 0; idx < comm_size; idx++) {
@@ -276,7 +277,8 @@ ccl::status ccl_coll_build_scatter_alltoallv(ccl_master_sched* main_sched,
         ccl_buffer recv_buf;
 
         if (inplace) {
-            recv_buf = scheds[sched_idx]->alloc_buffer(recv_counts[src] * dtype_size);
+            recv_buf = scheds[sched_idx]->alloc_buffer(
+                { recv_counts[src] * dtype_size, coll_param.get_recv_buf() });
             recv_bufs[src] = recv_buf;
         }
         else
@@ -321,14 +323,14 @@ ccl::status ccl_coll_build_scatter_alltoallv(ccl_master_sched* main_sched,
 
         size_t sched_idx = (comm_rank + idx) % sched_count;
 
-        entry_factory::make_entry<copy_entry>(scheds[sched_idx],
-                                              recv_bufs[idx],
-                                              ccl_buffer(coll_param.get_recv_buf_ptr(),
-                                                         total_recv_bytes,
-                                                         recv_offsets[idx],
-                                                         ccl_buffer_type::INDIRECT),
-                                              recv_counts[idx],
-                                              dtype);
+        entry_factory::create<copy_entry>(scheds[sched_idx],
+                                          recv_bufs[idx],
+                                          ccl_buffer(coll_param.get_recv_buf_ptr(),
+                                                     total_recv_bytes,
+                                                     recv_offsets[idx],
+                                                     ccl_buffer_type::INDIRECT),
+                                          recv_counts[idx],
+                                          dtype);
     }
 
     return ccl::status::success;
@@ -378,13 +380,13 @@ ccl::status ccl_coll_build_scatter_barrier_alltoallv(ccl_master_sched* main_sche
     std::vector<ccl_sched*> send_scheds(sched_count);
 
     for (size_t idx = 0; idx < sched_count; idx++) {
-        auto recv_sched = entry_factory::make_entry<subsched_entry>(
+        auto recv_sched = entry_factory::create<subsched_entry>(
                               scheds[idx], 0, [](ccl_sched* s) {}, "A2AV_RECV")
                               ->get_subsched();
 
         recv_scheds[idx] = recv_sched;
 
-        auto send_sched = entry_factory::make_entry<subsched_entry>(
+        auto send_sched = entry_factory::create<subsched_entry>(
                               scheds[idx], 0, [](ccl_sched* s) {}, "A2AV_SEND")
                               ->get_subsched();
 
@@ -393,17 +395,17 @@ ccl::status ccl_coll_build_scatter_barrier_alltoallv(ccl_master_sched* main_sche
 
     if (!inplace && send_counts[comm_rank] && recv_counts[comm_rank]) {
         size_t sched_idx = (2 * comm_rank) % sched_count;
-        entry_factory::make_entry<copy_entry>(recv_scheds[sched_idx],
-                                              ccl_buffer(coll_param.get_send_buf_ptr(),
-                                                         total_send_bytes,
-                                                         send_offsets[comm_rank],
-                                                         ccl_buffer_type::INDIRECT),
-                                              ccl_buffer(coll_param.get_recv_buf_ptr(),
-                                                         total_recv_bytes,
-                                                         recv_offsets[comm_rank],
-                                                         ccl_buffer_type::INDIRECT),
-                                              send_counts[comm_rank],
-                                              dtype);
+        entry_factory::create<copy_entry>(recv_scheds[sched_idx],
+                                          ccl_buffer(coll_param.get_send_buf_ptr(),
+                                                     total_send_bytes,
+                                                     send_offsets[comm_rank],
+                                                     ccl_buffer_type::INDIRECT),
+                                          ccl_buffer(coll_param.get_recv_buf_ptr(),
+                                                     total_recv_bytes,
+                                                     recv_offsets[comm_rank],
+                                                     ccl_buffer_type::INDIRECT),
+                                          send_counts[comm_rank],
+                                          dtype);
     }
 
     for (int idx = 0; idx < comm_size; idx++) {
@@ -420,7 +422,8 @@ ccl::status ccl_coll_build_scatter_barrier_alltoallv(ccl_master_sched* main_sche
         ccl_buffer recv_buf;
 
         if (inplace) {
-            recv_buf = sched->alloc_buffer(recv_counts[src] * dtype_size);
+            recv_buf =
+                sched->alloc_buffer({ recv_counts[src] * dtype_size, coll_param.get_recv_buf() });
             recv_bufs[src] = recv_buf;
         }
         else
@@ -465,14 +468,14 @@ ccl::status ccl_coll_build_scatter_barrier_alltoallv(ccl_master_sched* main_sche
 
         size_t sched_idx = (comm_rank + idx) % sched_count;
 
-        entry_factory::make_entry<copy_entry>(scheds[sched_idx],
-                                              recv_bufs[idx],
-                                              ccl_buffer(coll_param.get_recv_buf_ptr(),
-                                                         total_recv_bytes,
-                                                         recv_offsets[idx],
-                                                         ccl_buffer_type::INDIRECT),
-                                              recv_counts[idx],
-                                              dtype);
+        entry_factory::create<copy_entry>(scheds[sched_idx],
+                                          recv_bufs[idx],
+                                          ccl_buffer(coll_param.get_recv_buf_ptr(),
+                                                     total_recv_bytes,
+                                                     recv_offsets[idx],
+                                                     ccl_buffer_type::INDIRECT),
+                                          recv_counts[idx],
+                                          dtype);
     }
 
     return ccl::status::success;
diff --git a/src/coll/algorithms/barrier.cpp b/src/coll/algorithms/barrier.cpp
index 5aa05e094..d04290a45 100644
--- a/src/coll/algorithms/barrier.cpp
+++ b/src/coll/algorithms/barrier.cpp
@@ -25,7 +25,7 @@
 ccl::status ccl_coll_build_direct_barrier(ccl_sched* sched, ccl_comm* comm) {
     LOG_DEBUG("build direct barrier");
 
-    entry_factory::make_entry<barrier_entry>(sched, comm);
+    entry_factory::create<barrier_entry>(sched, comm);
     return ccl::status::success;
 }
 
@@ -44,8 +44,8 @@ ccl::status ccl_coll_build_dissemination_barrier(ccl_sched* sched, ccl_comm* com
     while (mask < size) {
         dst = (rank + mask) % size;
         src = (rank - mask + size) % size;
-        entry_factory::make_entry<send_entry>(sched, ccl_buffer(), 0, ccl_datatype_int8, dst, comm);
-        entry_factory::make_entry<recv_entry>(sched, ccl_buffer(), 0, ccl_datatype_int8, src, comm);
+        entry_factory::create<send_entry>(sched, ccl_buffer(), 0, ccl_datatype_int8, dst, comm);
+        entry_factory::create<recv_entry>(sched, ccl_buffer(), 0, ccl_datatype_int8, src, comm);
         sched->add_barrier();
         mask <<= 1;
     }
diff --git a/src/coll/algorithms/bcast.cpp b/src/coll/algorithms/bcast.cpp
index c4ba99976..2183bf71c 100644
--- a/src/coll/algorithms/bcast.cpp
+++ b/src/coll/algorithms/bcast.cpp
@@ -34,7 +34,7 @@ ccl::status ccl_coll_build_direct_bcast(ccl_sched* sched,
                                         ccl_comm* comm) {
     LOG_DEBUG("build direct bcast");
 
-    entry_factory::make_entry<bcast_entry>(sched, buf, count, dtype, root, comm);
+    entry_factory::create<bcast_entry>(sched, buf, count, dtype, root, comm);
     return ccl::status::success;
 }
 
@@ -58,12 +58,12 @@ ccl::status ccl_coll_build_naive_bcast(ccl_sched* sched,
     if (rank == root) {
         for (idx = 0; idx < comm_size; idx++) {
             if (idx != rank) {
-                entry_factory::make_entry<send_entry>(sched, buf, count, dtype, idx, comm);
+                entry_factory::create<send_entry>(sched, buf, count, dtype, idx, comm);
             }
         }
     }
     else {
-        entry_factory::make_entry<recv_entry>(sched, buf, count, dtype, root, comm);
+        entry_factory::create<recv_entry>(sched, buf, count, dtype, root, comm);
     }
 
 fn_exit:
@@ -116,12 +116,12 @@ ccl::status ccl_coll_build_scatter_for_bcast(ccl_sched* sched,
             curr_size = recv_size;
 
             if (recv_size > 0) {
-                entry_factory::make_entry<recv_entry>(sched,
-                                                      tmp_buf + relative_rank * scatter_size,
-                                                      recv_size,
-                                                      ccl_datatype_int8,
-                                                      src,
-                                                      comm);
+                entry_factory::create<recv_entry>(sched,
+                                                  tmp_buf + relative_rank * scatter_size,
+                                                  recv_size,
+                                                  ccl_datatype_int8,
+                                                  src,
+                                                  comm);
                 sched->add_barrier();
             }
             break;
@@ -145,13 +145,12 @@ ccl::status ccl_coll_build_scatter_for_bcast(ccl_sched* sched,
                 if (dst >= comm_size)
                     dst -= comm_size;
 
-                entry_factory::make_entry<send_entry>(
-                    sched,
-                    tmp_buf + scatter_size * (relative_rank + mask),
-                    send_size,
-                    ccl_datatype_int8,
-                    dst,
-                    comm);
+                entry_factory::create<send_entry>(sched,
+                                                  tmp_buf + scatter_size * (relative_rank + mask),
+                                                  send_size,
+                                                  ccl_datatype_int8,
+                                                  dst,
+                                                  comm);
                 sched->add_barrier();
                 curr_size -= send_size;
             }
@@ -219,10 +218,10 @@ ccl::status ccl_coll_build_scatter_ring_allgather_bcast(ccl_sched* sched,
         if (right_count < 0)
             right_count = 0;
         right_disp = rel_j * scatter_size;
-        entry_factory::make_entry<send_entry>(
+        entry_factory::create<send_entry>(
             sched, tmp_buf + right_disp, right_count, ccl_datatype_int8, right, comm);
         /* sendrecv, no barrier here */
-        entry_factory::make_entry<recv_entry>(
+        entry_factory::create<recv_entry>(
             sched, tmp_buf + left_disp, left_count, ccl_datatype_int8, left, comm);
         sched->add_barrier();
 
@@ -234,7 +233,7 @@ ccl::status ccl_coll_build_scatter_ring_allgather_bcast(ccl_sched* sched,
     return status;
 }
 
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
 
 ccl::status ccl_coll_build_gpu_bcast(ccl_sched* sched,
                                      ccl_buffer buf,
@@ -256,20 +255,20 @@ ccl::status ccl_coll_build_gpu_bcast(ccl_sched* sched,
 
     if (sched->coll_attr.to_cache) {
         sched->set_entry_exec_mode(ccl_sched_entry_exec_once);
-        entry_factory::make_entry<ze_handle_exchange_entry>(sched, comm, buffers);
+        entry_factory::create<ze_handle_exchange_entry>(sched, comm, buffers);
         sched->add_barrier();
         sched->set_entry_exec_mode(ccl_sched_entry_exec_regular);
 
         coll_entry_helper::add_coll_entry<ccl_coll_barrier>(sched, barrier_param);
     }
     else {
-        entry_factory::make_entry<ze_handle_exchange_entry>(sched, comm, buffers);
+        entry_factory::create<ze_handle_exchange_entry>(sched, comm, buffers);
     }
 
     sched->add_barrier();
 
     if (comm->rank() != root) {
-        entry_factory::make_entry<copy_entry>(
+        entry_factory::create<copy_entry>(
             sched, ccl_buffer(), buf, count, dtype, copy_attr(root, 0, copy_direction::d2d));
         sched->add_barrier();
     }
@@ -279,4 +278,4 @@ ccl::status ccl_coll_build_gpu_bcast(ccl_sched* sched,
     return ccl::status::success;
 }
 
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
diff --git a/src/coll/algorithms/double_tree_ops.cpp b/src/coll/algorithms/double_tree_ops.cpp
index e124673f2..f9139033b 100644
--- a/src/coll/algorithms/double_tree_ops.cpp
+++ b/src/coll/algorithms/double_tree_ops.cpp
@@ -25,18 +25,18 @@ static void bcast_tree(const ccl_bin_tree& tree,
                        ccl_comm* comm) {
     if (tree.parent() != -1) {
         LOG_DEBUG("recv from parent ", tree.parent());
-        entry_factory::make_entry<recv_entry>(
+        entry_factory::create<recv_entry>(
             sched, buffer, count, dtype, static_cast<size_t>(tree.parent()), comm);
         sched->add_barrier();
     }
     if (tree.left() != -1) {
         LOG_DEBUG("send to left ", tree.left());
-        entry_factory::make_entry<send_entry>(
+        entry_factory::create<send_entry>(
             sched, buffer, count, dtype, static_cast<size_t>(tree.left()), comm);
     }
     if (tree.right() != -1) {
         LOG_DEBUG("send to right ", tree.right());
-        entry_factory::make_entry<send_entry>(
+        entry_factory::create<send_entry>(
             sched, buffer, count, dtype, static_cast<size_t>(tree.right()), comm);
     }
 }
@@ -50,34 +50,20 @@ static void reduce_tree(const ccl_bin_tree& tree,
                         ccl_comm* comm) {
     if (tree.left() != -1) {
         LOG_DEBUG("recv_reduce left ", tree.left());
-        entry_factory::make_entry<recv_reduce_entry>(sched,
-                                                     buffer,
-                                                     count,
-                                                     nullptr,
-                                                     dtype,
-                                                     reduction,
-                                                     static_cast<size_t>(tree.left()),
-                                                     ccl_buffer(),
-                                                     comm);
+        entry_factory::create<recv_reduce_entry>(
+            sched, buffer, count, dtype, reduction, static_cast<size_t>(tree.left()), comm);
     }
     if (tree.right() != -1) {
         LOG_DEBUG("recv_reduce right ", tree.right());
-        entry_factory::make_entry<recv_reduce_entry>(sched,
-                                                     buffer,
-                                                     count,
-                                                     nullptr,
-                                                     dtype,
-                                                     reduction,
-                                                     static_cast<size_t>(tree.right()),
-                                                     ccl_buffer(),
-                                                     comm);
+        entry_factory::create<recv_reduce_entry>(
+            sched, buffer, count, dtype, reduction, static_cast<size_t>(tree.right()), comm);
     }
     if (tree.parent() != -1) {
         if (tree.left() != -1 || tree.right() != -1) {
             sched->add_barrier();
         }
         LOG_DEBUG("send to parent ", tree.parent());
-        entry_factory::make_entry<send_entry>(
+        entry_factory::create<send_entry>(
             sched, buffer, count, dtype, static_cast<size_t>(tree.parent()), comm);
     }
 }
@@ -91,27 +77,13 @@ static void reduce_bcast_tree(const ccl_bin_tree& tree,
                               ccl_comm* comm) {
     if (tree.left() != -1) {
         LOG_DEBUG("recv_reduce left ", tree.left());
-        entry_factory::make_entry<recv_reduce_entry>(sched,
-                                                     buffer,
-                                                     count,
-                                                     nullptr,
-                                                     dtype,
-                                                     reduction,
-                                                     static_cast<size_t>(tree.left()),
-                                                     ccl_buffer(),
-                                                     comm);
+        entry_factory::create<recv_reduce_entry>(
+            sched, buffer, count, dtype, reduction, static_cast<size_t>(tree.left()), comm);
     }
     if (tree.right() != -1) {
         LOG_DEBUG("recv_reduce right ", tree.right());
-        entry_factory::make_entry<recv_reduce_entry>(sched,
-                                                     buffer,
-                                                     count,
-                                                     nullptr,
-                                                     dtype,
-                                                     reduction,
-                                                     static_cast<size_t>(tree.right()),
-                                                     ccl_buffer(),
-                                                     comm);
+        entry_factory::create<recv_reduce_entry>(
+            sched, buffer, count, dtype, reduction, static_cast<size_t>(tree.right()), comm);
     }
     if (tree.parent() != -1) {
         if (tree.left() != -1 || tree.right() != -1) {
@@ -119,11 +91,11 @@ static void reduce_bcast_tree(const ccl_bin_tree& tree,
         }
 
         LOG_DEBUG("send to parent ", tree.parent());
-        entry_factory::make_entry<send_entry>(
+        entry_factory::create<send_entry>(
             sched, buffer, count, dtype, static_cast<size_t>(tree.parent()), comm);
 
         LOG_DEBUG("recv from parent ", tree.parent());
-        entry_factory::make_entry<recv_entry>(
+        entry_factory::create<recv_entry>(
             sched, buffer, count, dtype, static_cast<size_t>(tree.parent()), comm);
     }
 
@@ -133,12 +105,12 @@ static void reduce_bcast_tree(const ccl_bin_tree& tree,
 
     if (tree.left() != -1) {
         LOG_DEBUG("send to left ", tree.left());
-        entry_factory::make_entry<send_entry>(
+        entry_factory::create<send_entry>(
             sched, buffer, count, dtype, static_cast<size_t>(tree.left()), comm);
     }
     if (tree.right() != -1) {
         LOG_DEBUG("send to right ", tree.right());
-        entry_factory::make_entry<send_entry>(
+        entry_factory::create<send_entry>(
             sched, buffer, count, dtype, static_cast<size_t>(tree.right()), comm);
     }
 }
@@ -158,7 +130,7 @@ ccl::status ccl_coll_build_double_tree_op(ccl_sched* sched,
 
     if (coll_type != ccl_coll_bcast && send_buf != recv_buf) {
         LOG_DEBUG("out of place op");
-        entry_factory::make_entry<copy_entry>(sched, send_buf, recv_buf, count, dtype);
+        entry_factory::create<copy_entry>(sched, send_buf, recv_buf, count, dtype);
         sched->add_barrier();
     }
 
@@ -237,7 +209,7 @@ ccl::status ccl_coll_build_double_tree_op(ccl_sched* sched,
 
         switch (coll_type) {
             case ccl_coll_bcast:
-                entry_factory::make_entry<subsched_entry>(
+                entry_factory::create<subsched_entry>(
                     sched,
                     t1_op_id,
                     [t1_work_buf, t1_work_count, &dtype, t1, comm](ccl_sched* s) {
@@ -245,7 +217,7 @@ ccl::status ccl_coll_build_double_tree_op(ccl_sched* sched,
                     },
                     "bcast_t1");
 
-                entry_factory::make_entry<subsched_entry>(
+                entry_factory::create<subsched_entry>(
                     sched,
                     t2_op_id,
                     [t2_work_buf, t2_work_count, &dtype, t2, comm](ccl_sched* s) {
@@ -257,7 +229,7 @@ ccl::status ccl_coll_build_double_tree_op(ccl_sched* sched,
             case ccl_coll_reduce: {
                 if (comm->rank() % 2 == 0) {
                     //even ranks are leaves in T2, start schedule with T2
-                    entry_factory::make_entry<subsched_entry>(
+                    entry_factory::create<subsched_entry>(
                         sched,
                         t2_op_id,
                         [t2_work_buf, t2_work_count, &dtype, op, t2, comm](ccl_sched* s) {
@@ -265,7 +237,7 @@ ccl::status ccl_coll_build_double_tree_op(ccl_sched* sched,
                         },
                         "reduce_t2");
 
-                    entry_factory::make_entry<subsched_entry>(
+                    entry_factory::create<subsched_entry>(
                         sched,
                         t1_op_id,
                         [t1_work_buf, t1_work_count, &dtype, op, t1, comm](ccl_sched* s) {
@@ -274,7 +246,7 @@ ccl::status ccl_coll_build_double_tree_op(ccl_sched* sched,
                         "reduce_t1");
                 }
                 else {
-                    entry_factory::make_entry<subsched_entry>(
+                    entry_factory::create<subsched_entry>(
                         sched,
                         t2_op_id,
                         [t2_work_buf, t2_work_count, &dtype, op, t2, comm](ccl_sched* s) {
@@ -282,7 +254,7 @@ ccl::status ccl_coll_build_double_tree_op(ccl_sched* sched,
                         },
                         "reduce_t2");
 
-                    entry_factory::make_entry<subsched_entry>(
+                    entry_factory::create<subsched_entry>(
                         sched,
                         t1_op_id,
                         [t1_work_buf, t1_work_count, &dtype, op, t1, comm](ccl_sched* s) {
@@ -296,7 +268,7 @@ ccl::status ccl_coll_build_double_tree_op(ccl_sched* sched,
             case ccl_coll_allreduce: {
                 if (comm->rank() % 2 == 0) {
                     //even ranks are leaves in T2, start schedule with T2
-                    entry_factory::make_entry<subsched_entry>(
+                    entry_factory::create<subsched_entry>(
                         sched,
                         t2_op_id,
                         [t2_work_buf, t2_work_count, &dtype, op, t2, comm](ccl_sched* s) {
@@ -304,7 +276,7 @@ ccl::status ccl_coll_build_double_tree_op(ccl_sched* sched,
                         },
                         "reduce_bcast_t2");
 
-                    entry_factory::make_entry<subsched_entry>(
+                    entry_factory::create<subsched_entry>(
                         sched,
                         t1_op_id,
                         [t1_work_buf, t1_work_count, &dtype, op, t1, comm](ccl_sched* s) {
@@ -313,7 +285,7 @@ ccl::status ccl_coll_build_double_tree_op(ccl_sched* sched,
                         "reduce_bcast_t1");
                 }
                 else {
-                    entry_factory::make_entry<subsched_entry>(
+                    entry_factory::create<subsched_entry>(
                         sched,
                         t1_op_id,
                         [t1_work_buf, t1_work_count, &dtype, op, t1, comm](ccl_sched* s) {
@@ -321,7 +293,7 @@ ccl::status ccl_coll_build_double_tree_op(ccl_sched* sched,
                         },
                         "reduce_bcast_t1");
 
-                    entry_factory::make_entry<subsched_entry>(
+                    entry_factory::create<subsched_entry>(
                         sched,
                         t2_op_id,
                         [t2_work_buf, t2_work_count, &dtype, op, t2, comm](ccl_sched* s) {
diff --git a/src/coll/algorithms/reduce.cpp b/src/coll/algorithms/reduce.cpp
index 54a9a55d0..14eb503d9 100644
--- a/src/coll/algorithms/reduce.cpp
+++ b/src/coll/algorithms/reduce.cpp
@@ -21,8 +21,12 @@
  */
 
 #include "coll/algorithms/algorithms.hpp"
+#include "common/comm/comm.hpp"
 #include "sched/entry/coll/coll_entry_helper.hpp"
 #include "sched/entry/factory/entry_factory.hpp"
+#if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL)
+#include "coll/coll_util.hpp"
+#endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL
 
 /* An implementation of Rabenseifner's reduce algorithm (see
    http://www.hlrs.de/mpi/myreduce.html).
@@ -62,7 +66,7 @@ ccl::status ccl_coll_build_direct_reduce(ccl_sched* sched,
                                          ccl_comm* comm) {
     LOG_DEBUG("build direct reduce");
 
-    entry_factory::make_entry<reduce_entry>(
+    entry_factory::create<reduce_entry>(
         sched, send_buf, recv_buf, count, dtype, reduction, root, comm);
     return ccl::status::success;
 }
@@ -89,7 +93,7 @@ ccl::status ccl_coll_build_rabenseifner_reduce(ccl_sched* sched,
     comm_size = comm->size();
     rank = comm->rank();
 
-    ccl_buffer tmp_buf = sched->alloc_buffer(count * dtype_size);
+    ccl_buffer tmp_buf = sched->alloc_buffer({ count * dtype_size, send_buf });
 
     /* get nearest power-of-two less than or equal to comm_size */
     pof2 = comm->pof2();
@@ -99,11 +103,11 @@ ccl::status ccl_coll_build_rabenseifner_reduce(ccl_sched* sched,
     /* If I'm not the root, then my recv_buf may not be valid, therefore
      * I have to allocate a temporary one */
     if (rank != local_root) {
-        recv_buf = sched->alloc_buffer(count * dtype_size);
+        recv_buf = sched->alloc_buffer({ count * dtype_size, send_buf });
     }
 
     if ((rank != local_root) || (send_buf != recv_buf))
-        entry_factory::make_entry<copy_entry>(sched, send_buf, recv_buf, count, dtype);
+        entry_factory::create<copy_entry>(sched, send_buf, recv_buf, count, dtype);
 
     /* In the non-power-of-two case, all odd-numbered
      * processes of rank < 2*rem send their data to
@@ -123,7 +127,7 @@ ccl::status ccl_coll_build_rabenseifner_reduce(ccl_sched* sched,
 
     if (rank < 2 * rem) {
         if (rank % 2 != 0) { /* odd */
-            entry_factory::make_entry<send_entry>(sched, recv_buf, count, dtype, rank - 1, comm);
+            entry_factory::create<send_entry>(sched, recv_buf, count, dtype, rank - 1, comm);
             sched->add_barrier();
 
             /* temporarily set the rank to -1 so that this
@@ -132,13 +136,13 @@ ccl::status ccl_coll_build_rabenseifner_reduce(ccl_sched* sched,
             new_rank = CCL_INVALID_PROC_IDX;
         }
         else { /* even */
-            entry_factory::make_entry<recv_entry>(sched, tmp_buf, count, dtype, rank + 1, comm);
+            entry_factory::create<recv_entry>(sched, tmp_buf, count, dtype, rank + 1, comm);
             sched->add_barrier();
 
             /* do the reduction on received data. */
             /* This algorithm is used only for predefined ops
              * and predefined ops are always commutative. */
-            entry_factory::make_entry<reduce_local_entry>(
+            entry_factory::create<reduce_local_entry>(
                 sched, tmp_buf, count, recv_buf, nullptr, dtype, reduction);
             sched->add_barrier();
 
@@ -196,10 +200,10 @@ ccl::status ccl_coll_build_rabenseifner_reduce(ccl_sched* sched,
             }
 
             /* Send data from recv_buf. Recv into tmp_buf */
-            entry_factory::make_entry<send_entry>(
+            entry_factory::create<send_entry>(
                 sched, (recv_buf + disps[send_idx] * dtype_size), send_cnt, dtype, dst, comm);
             /* sendrecv, no barrier here */
-            entry_factory::make_entry<recv_entry>(
+            entry_factory::create<recv_entry>(
                 sched, (tmp_buf + disps[recv_idx] * dtype_size), recv_cnt, dtype, dst, comm);
             sched->add_barrier();
 
@@ -208,13 +212,13 @@ ccl::status ccl_coll_build_rabenseifner_reduce(ccl_sched* sched,
 
             /* This algorithm is used only for predefined ops
              * and predefined ops are always commutative. */
-            entry_factory::make_entry<reduce_local_entry>(sched,
-                                                          (tmp_buf + disps[recv_idx] * dtype_size),
-                                                          recv_cnt,
-                                                          (recv_buf + disps[recv_idx] * dtype_size),
-                                                          nullptr,
-                                                          dtype,
-                                                          reduction);
+            entry_factory::create<reduce_local_entry>(sched,
+                                                      (tmp_buf + disps[recv_idx] * dtype_size),
+                                                      recv_cnt,
+                                                      (recv_buf + disps[recv_idx] * dtype_size),
+                                                      nullptr,
+                                                      dtype,
+                                                      reduction);
             sched->add_barrier();
 
             /* update send_idx for next iteration */
@@ -247,7 +251,7 @@ ccl::status ccl_coll_build_rabenseifner_reduce(ccl_sched* sched,
                 for (i = 1; i < pof2; i++)
                     disps[i] = disps[i - 1] + cnts[i - 1];
 
-                entry_factory::make_entry<recv_entry>(sched, recv_buf, cnts[0], dtype, 0, comm);
+                entry_factory::create<recv_entry>(sched, recv_buf, cnts[0], dtype, 0, comm);
                 sched->add_barrier();
 
                 new_rank = 0;
@@ -255,7 +259,7 @@ ccl::status ccl_coll_build_rabenseifner_reduce(ccl_sched* sched,
                 last_idx = 2;
             }
             else if (new_rank == 0) { /* send */
-                entry_factory::make_entry<send_entry>(
+                entry_factory::create<send_entry>(
                     sched, recv_buf, cnts[0], dtype, local_root, comm);
                 sched->add_barrier();
 
@@ -322,14 +326,14 @@ ccl::status ccl_coll_build_rabenseifner_reduce(ccl_sched* sched,
             if (newdst_tree_root == newroot_tree_root) {
                 /* send and exit */
                 /* Send data from recv_buf. Recv into tmp_buf */
-                entry_factory::make_entry<send_entry>(
+                entry_factory::create<send_entry>(
                     sched, (recv_buf + disps[send_idx] * dtype_size), send_cnt, dtype, dst, comm);
                 sched->add_barrier();
                 break;
             }
             else {
                 /* recv and continue */
-                entry_factory::make_entry<recv_entry>(
+                entry_factory::create<recv_entry>(
                     sched, (recv_buf + disps[recv_idx] * dtype_size), recv_cnt, dtype, dst, comm);
                 sched->add_barrier();
             }
@@ -372,16 +376,16 @@ ccl::status ccl_coll_build_binomial_reduce(ccl_sched* sched,
 
     /* Create a temporary buffer */
     size_t dtype_size = dtype.size();
-    ccl_buffer tmp_buf = sched->alloc_buffer(count * dtype_size);
+    ccl_buffer tmp_buf = sched->alloc_buffer({ count * dtype_size, send_buf });
 
     /* If I'm not the root, then my recv_buf may not be valid, therefore
      * I have to allocate a temporary one */
     if (rank != local_root) {
-        recv_buf = sched->alloc_buffer(count * dtype_size);
+        recv_buf = sched->alloc_buffer({ count * dtype_size, send_buf });
     }
 
     if ((rank != local_root) || (send_buf != recv_buf)) {
-        entry_factory::make_entry<copy_entry>(sched, send_buf, recv_buf, count, dtype);
+        entry_factory::create<copy_entry>(sched, send_buf, recv_buf, count, dtype);
         sched->add_barrier();
     }
 
@@ -427,10 +431,10 @@ ccl::status ccl_coll_build_binomial_reduce(ccl_sched* sched,
             if (source < comm_size) {
                 source = (source + lroot) % comm_size;
 
-                entry_factory::make_entry<recv_entry>(sched, tmp_buf, count, dtype, source, comm);
+                entry_factory::create<recv_entry>(sched, tmp_buf, count, dtype, source, comm);
                 sched->add_barrier();
 
-                entry_factory::make_entry<reduce_local_entry>(
+                entry_factory::create<reduce_local_entry>(
                     sched, tmp_buf, count, recv_buf, nullptr, dtype, reduction);
                 sched->add_barrier();
             }
@@ -439,7 +443,7 @@ ccl::status ccl_coll_build_binomial_reduce(ccl_sched* sched,
             /* I've received all that I'm going to.  Send my result to
              * my parent */
             source = ((relrank & (~mask)) + lroot) % comm_size;
-            entry_factory::make_entry<send_entry>(sched, recv_buf, count, dtype, source, comm);
+            entry_factory::create<send_entry>(sched, recv_buf, count, dtype, source, comm);
             sched->add_barrier();
             break;
         }
@@ -449,55 +453,171 @@ ccl::status ccl_coll_build_binomial_reduce(ccl_sched* sched,
     return status;
 }
 
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
 
+/* Builds the Level Zero reduce schedule used when CCL_ENABLE_SYCL and CCL_ENABLE_ZE are set.
+ *
+ * send_buf and recv_buf (plus a device tmp_buf, unless comm is exactly two ranks on
+ * one node) are handed to ccl::add_handle_exchange over node_comm. After that:
+ *  - two ranks on one node: only root adds ze_onesided_reduce_entry on pair_comm;
+ *  - otherwise: ranks whose pair_comm rank equals env().kernel_1s_lead reduce the
+ *    pair into tmp_buf, reduce-scatter tmp_buf over even_comm, reduce their block
+ *    through a host buffer over r2r_comm when comm spans several nodes, and, on
+ *    root's node, copy the block into root's recv_buf at the block's offset.
+ *
+ * Returns ccl::status::success. CCL_THROW_IF_NOT fires on lead ranks when
+ * comm->size() is not a multiple of node_comm->size(). */
 ccl::status ccl_coll_build_gpu_reduce(ccl_sched* sched,
                                       ccl_buffer send_buf,
                                       ccl_buffer recv_buf,
                                       size_t count,
                                       const ccl_datatype& dtype,
-                                      ccl::reduction reduction,
+                                      ccl::reduction op,
                                       int root,
                                       ccl_comm* comm) {
     LOG_DEBUG("build gpu reduce");
 
-    int skip_rank = -1;
+    ccl_comm* pair_comm = comm->get_pair_comm().get();
+    ccl_comm* even_comm = comm->get_even_comm().get();
+    ccl_comm* node_comm = comm->get_node_comm().get();
+    ccl_comm* r2r_comm = comm->get_r2r_comm().get();
 
-    const std::vector<ze_handle_exchange_entry::mem_desc_t> in_buffers{
+    int comm_size = comm->size();
+    int even_comm_size = even_comm->size();
+    int node_comm_size = node_comm->size();
+
+    bool is_single_node = (comm_size == node_comm_size);
+    bool is_single_card = (comm_size == 2) && is_single_node;
+    bool use_tmp_buf = !is_single_card;
+
+    ccl_buffer tmp_buf{};
+    ccl::alloc_param alloc_param(
+        count * dtype.size(), ccl::buffer_type::ze, ccl::buffer_place::device);
+    if (use_tmp_buf) {
+        tmp_buf = sched->alloc_buffer(alloc_param);
+    }
+
+    std::vector<ze_handle_exchange_entry::mem_desc_t> in_buffers{
         { send_buf.get_ptr(), ccl::ze::ipc_mem_type::memory }, // 0
+        { recv_buf.get_ptr(), ccl::ze::ipc_mem_type::memory }, // 1
     };
 
-    ccl_coll_entry_param barrier_param{};
-    barrier_param.ctype = ccl_coll_barrier;
-    barrier_param.comm = comm;
-    barrier_param.hint_algo.barrier = ccl_coll_barrier_ring;
+    size_t recv_buf_idx = 1;
+    // max() is the "tmp_buf not exchanged" sentinel; replaced below when tmp_buf is in use
+    size_t tmp_buf_idx = std::numeric_limits<size_t>::max();
+    if (use_tmp_buf) {
+        tmp_buf_idx = in_buffers.size();
+        in_buffers.push_back({ tmp_buf.get_ptr(), ccl::ze::ipc_mem_type::memory });
+    }
 
-    if (sched->coll_attr.to_cache) {
-        sched->set_entry_exec_mode(ccl_sched_entry_exec_once);
-        entry_factory::make_entry<ze_handle_exchange_entry>(sched, comm, in_buffers, skip_rank);
-        sched->add_barrier();
-        sched->set_entry_exec_mode(ccl_sched_entry_exec_regular);
+    ccl::add_handle_exchange(sched, node_comm, in_buffers);
 
-        // TODO: no need barrier for the first iteration where ze_handle_exchange_entry exists
-        // TODO: think about the right way
-        coll_entry_helper::add_coll_entry<ccl_coll_barrier>(sched, barrier_param);
+    if (is_single_card) {
+        LOG_DEBUG("topo/scale_up/intra: use ze_onesided_reduce");
+        if (comm->rank() == root) {
+            entry_factory::create<ze_onesided_reduce_entry>(
+                sched, send_buf, recv_buf, count, dtype, op, root, pair_comm);
+            sched->add_barrier();
+        }
+
+        ccl::add_comm_barrier(sched, pair_comm);
     }
     else {
-        entry_factory::make_entry<ze_handle_exchange_entry>(sched, comm, in_buffers, skip_rank);
-    }
+        if (pair_comm->rank() == ccl::global_data::env().kernel_1s_lead) {
+            LOG_DEBUG("topo/scale_up/intra: use ze_onesided_reduce");
+            entry_factory::create<ze_onesided_reduce_entry>(
+                sched, send_buf, tmp_buf, count, dtype, op, pair_comm->rank(), pair_comm);
+            sched->add_barrier();
 
-    sched->add_barrier();
+            size_t main_block_count = count / even_comm_size;
+            size_t block_count = main_block_count;
+            // last even_comm rank additionally owns the remainder elements
+            if (even_comm->rank() == even_comm_size - 1) {
+                block_count += count % even_comm_size;
+            }
 
-    if (comm->rank() == root) {
-        entry_factory::make_entry<ze_reduce_entry>(
-            sched, send_buf, recv_buf, count, dtype, reduction, root, comm);
-        sched->add_barrier();
-    }
+            ccl::add_comm_barrier(sched, even_comm);
+            size_t offset_bytes = main_block_count * even_comm->rank() * dtype.size();
+            ccl_buffer partial_tmp_buf = tmp_buf + offset_bytes;
+            LOG_DEBUG("topo/scale_up/inter: use ze_a2a_reduce_scatter_entry");
+            std::vector<ze_event_handle_t> wait_events;
+            std::vector<size_t> block_counts(even_comm->size(), main_block_count);
+            block_counts[even_comm->size() - 1] = block_count;
+            entry_factory::create<ze_a2a_reduce_scatter_entry>(sched,
+                                                               tmp_buf,
+                                                               partial_tmp_buf,
+                                                               block_counts.data(),
+                                                               dtype,
+                                                               op,
+                                                               even_comm,
+                                                               wait_events,
+                                                               tmp_buf_idx);
+            sched->add_barrier();
+            ccl::add_comm_barrier(sched, even_comm);
+
+            CCL_THROW_IF_NOT(comm->size() % node_comm_size == 0);
+            int root_node_idx = root / node_comm_size;
+            ccl_buffer host_buf{};
+            if (!is_single_node && block_count) {
+                LOG_DEBUG("topo/scale_out: use host_reduce");
+                // named distinctly so the function-level alloc_param is not shadowed
+                ccl::alloc_param host_alloc_param(
+                    block_count * dtype.size(), ccl::buffer_type::regular, ccl::buffer_place::host);
+                host_buf = sched->alloc_buffer(host_alloc_param);
+                entry_factory::create<copy_entry>(sched,
+                                                  partial_tmp_buf,
+                                                  host_buf,
+                                                  block_count,
+                                                  dtype,
+                                                  copy_attr(copy_direction::d2h));
+                sched->add_barrier();
 
-    // TODO: think about the right way
-    coll_entry_helper::add_coll_entry<ccl_coll_barrier>(sched, barrier_param);
+                LOG_DEBUG("rank: ",
+                          comm->rank(),
+                          ", reduce to rank on r2r_comm: ",
+                          root_node_idx,
+                          ", count: ",
+                          block_count);
+                ccl_coll_build_reduce(
+                    sched, host_buf, host_buf, block_count, dtype, op, root_node_idx, r2r_comm);
+                sched->add_barrier();
+            }
+
+            if (root_node_idx == r2r_comm->rank()) {
+                LOG_DEBUG("topo/scale_up/intra: use ze_onesided_bcast");
+                int root_in_node_comm = node_comm->get_rank_from_global(root);
+                size_t offset_count = offset_bytes / dtype.size();
+                ccl_buffer src = (!is_single_node && block_count) ? host_buf : partial_tmp_buf;
+                ccl_buffer dst{};
+                copy_attr attr(root_in_node_comm,
+                               recv_buf_idx,
+                               copy_direction::h2d,
+                               node_comm,
+                               0,
+                               offset_count);
+                // root copies straight into its local recv_buf; the default attr above
+                // presumably addresses root's exchanged recv_buf (recv_buf_idx) - TODO confirm
+                if (comm->rank() == root) {
+                    dst = recv_buf;
+                    attr = copy_attr(copy_direction::h2d, 0, offset_count);
+                }
+
+                LOG_DEBUG("rank: ",
+                          comm->rank(),
+                          ", copy to rank on node_comm: ",
+                          root_in_node_comm,
+                          ", offset count: ",
+                          offset_count,
+                          ", count: ",
+                          block_count);
+                entry_factory::create<copy_entry>(sched, src, dst, block_count, dtype, attr);
+                sched->add_barrier();
+            }
+        }
+        ccl::add_comm_barrier(sched, node_comm);
+    }
 
     return ccl::status::success;
 }
 
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
diff --git a/src/coll/algorithms/reduce_scatter.cpp b/src/coll/algorithms/reduce_scatter.cpp
index 4a6774ff7..08e84be57 100644
--- a/src/coll/algorithms/reduce_scatter.cpp
+++ b/src/coll/algorithms/reduce_scatter.cpp
@@ -21,7 +21,11 @@
  */
 
 #include "coll/algorithms/algorithms.hpp"
+#include "sched/entry/coll/coll_entry_helper.hpp"
 #include "sched/entry/factory/entry_factory.hpp"
+#if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL)
+#include "coll/coll_util.hpp"
+#endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL
 
 ccl::status ccl_coll_build_direct_reduce_scatter(ccl_sched* sched,
                                                  ccl_buffer send_buf,
@@ -32,7 +36,7 @@ ccl::status ccl_coll_build_direct_reduce_scatter(ccl_sched* sched,
                                                  ccl_comm* comm) {
     LOG_DEBUG("build direct reduce_scatter");
 
-    entry_factory::make_entry<reduce_scatter_entry>(
+    entry_factory::create<reduce_scatter_entry>(
         sched, send_buf, recv_buf, recv_count, dtype, reduction, comm);
     return ccl::status::success;
 }
@@ -70,12 +74,12 @@ ccl::status ccl_coll_build_ring_reduce_scatter_block(ccl_sched* sched,
 
     if (!inplace) {
         /* copy local data into recv_buf */
-        entry_factory::make_entry<copy_entry>(
+        entry_factory::create<copy_entry>(
             sched, send_buf + rank * recv_count * dtype_size, recv_buf, recv_count, dtype);
     }
 
     /* allocate temporary buffer to store incoming data */
-    ccl_buffer tmp_buf = sched->alloc_buffer(recv_count * dtype_size);
+    ccl_buffer tmp_buf = sched->alloc_buffer({ recv_count * dtype_size, recv_buf });
 
     for (idx = 1; idx < comm_size; idx++) {
         src = (comm_size + rank - idx) % comm_size;
@@ -84,39 +88,39 @@ ccl::status ccl_coll_build_ring_reduce_scatter_block(ccl_sched* sched,
         /* send the data that dst needs. recv data that this process
          * needs from src into tmp_recvbuf */
         if (!inplace) {
-            entry_factory::make_entry<send_entry>(
+            entry_factory::create<send_entry>(
                 sched, send_buf + dst * recv_count * dtype_size, recv_count, dtype, dst, comm);
 
-            entry_factory::make_entry<recv_entry>(sched, tmp_buf, recv_count, dtype, src, comm);
+            entry_factory::create<recv_entry>(sched, tmp_buf, recv_count, dtype, src, comm);
         }
         else {
-            entry_factory::make_entry<send_entry>(
+            entry_factory::create<send_entry>(
                 sched, recv_buf + dst * recv_count * dtype_size, recv_count, dtype, dst, comm);
 
-            entry_factory::make_entry<recv_entry>(sched, tmp_buf, recv_count, dtype, src, comm);
+            entry_factory::create<recv_entry>(sched, tmp_buf, recv_count, dtype, src, comm);
         }
 
         sched->add_barrier();
 
         if (!inplace) {
-            entry_factory::make_entry<reduce_local_entry>(
+            entry_factory::create<reduce_local_entry>(
                 sched, tmp_buf, recv_count, recv_buf, nullptr, dtype, op);
         }
         else {
-            entry_factory::make_entry<reduce_local_entry>(sched,
-                                                          tmp_buf,
-                                                          recv_count,
-                                                          recv_buf + rank * recv_count * dtype_size,
-                                                          nullptr,
-                                                          dtype,
-                                                          op);
+            entry_factory::create<reduce_local_entry>(sched,
+                                                      tmp_buf,
+                                                      recv_count,
+                                                      recv_buf + rank * recv_count * dtype_size,
+                                                      nullptr,
+                                                      dtype,
+                                                      op);
         }
     }
 
     /* if inplace, move output data to the beginning of
      * recv_buf. already done for rank 0 */
     if (inplace && (rank != 0)) {
-        entry_factory::make_entry<copy_entry>(
+        entry_factory::create<copy_entry>(
             sched, recv_buf + rank * recv_count * dtype_size, recv_buf, recv_count, dtype);
     }
 
@@ -127,7 +131,7 @@ ccl::status ccl_coll_build_ring_reduce_scatter_block(ccl_sched* sched,
 ccl::status ccl_coll_build_ring_reduce_scatter(ccl_sched* sched,
                                                ccl_buffer send_buf,
                                                ccl_buffer recv_buf,
-                                               size_t send_count,
+                                               size_t recv_count,
                                                const ccl_datatype& dtype,
                                                ccl::reduction op,
                                                ccl_comm* comm) {
@@ -151,7 +155,7 @@ ccl::status ccl_coll_build_ring_reduce_scatter(ccl_sched* sched,
     int src = (comm_size + rank - 1) % comm_size;
     int dst = (comm_size + rank + 1) % comm_size;
 
-    size_t count = send_count;
+    size_t count = recv_count;
     size_t bytes = count * dtype_size;
 
     size_t chunk_count =
@@ -178,7 +182,7 @@ ccl::status ccl_coll_build_ring_reduce_scatter(ccl_sched* sched,
 
     if (comm_size == 1) {
         if (!inplace) {
-            entry_factory::make_entry<copy_entry>(sched, send_buf, recv_buf, count, dtype);
+            entry_factory::create<copy_entry>(sched, send_buf, recv_buf, count, dtype);
             sched->add_barrier();
         }
         return ccl::status::success;
@@ -187,7 +191,7 @@ ccl::status ccl_coll_build_ring_reduce_scatter(ccl_sched* sched,
     ccl_buffer tmp_buf;
 
     if (inplace) {
-        tmp_buf = sched->alloc_buffer(count * dtype_size);
+        tmp_buf = sched->alloc_buffer({ count * dtype_size, recv_buf });
     }
 
     ccl_buffer sbuf, rbuf;
@@ -208,7 +212,7 @@ ccl::status ccl_coll_build_ring_reduce_scatter(ccl_sched* sched,
     size_t send_main_chunk_size, send_last_chunk_size;
     size_t recv_main_chunk_size, recv_last_chunk_size;
 
-    size_t send_chunk_size, recv_chunk_size, reduce_chunk_size;
+    size_t send_chunk_size, recv_chunk_size = 0, reduce_chunk_size;
     size_t send_chunk_offset, recv_chunk_offset = 0, reduce_chunk_offset;
 
     /* if chunk_count > 1 then make reduction with 1 chunk delay to get comp/comp overlapping */
@@ -278,33 +282,31 @@ ccl::status ccl_coll_build_ring_reduce_scatter(ccl_sched* sched,
             recv_reduce_local_buf += reduce_chunk_offset;
             recv_reduce_comm_buf += reduce_chunk_offset;
 
-            entry_factory::make_entry<send_entry>(sched, sbuf, send_chunk_size, dtype, dst, comm);
+            entry_factory::create<send_entry>(sched, sbuf, send_chunk_size, dtype, dst, comm);
 
             if (!use_prev) {
                 CCL_ASSERT(recv_chunk_size == reduce_chunk_size);
-                entry_factory::make_entry<recv_reduce_entry>(sched,
-                                                             recv_reduce_local_buf,
-                                                             recv_chunk_size,
-                                                             nullptr, /* out_cnt */
-                                                             dtype,
-                                                             op,
-                                                             src,
-                                                             recv_reduce_comm_buf,
-                                                             comm,
-                                                             recv_reduce_result_type);
+                entry_factory::create<recv_reduce_entry>(sched,
+                                                         recv_reduce_local_buf,
+                                                         recv_chunk_size,
+                                                         dtype,
+                                                         op,
+                                                         src,
+                                                         comm,
+                                                         recv_reduce_comm_buf,
+                                                         recv_reduce_result_type);
             }
             else {
-                entry_factory::make_entry<recv_entry>(
-                    sched, rbuf, recv_chunk_size, dtype, src, comm);
+                entry_factory::create<recv_entry>(sched, rbuf, recv_chunk_size, dtype, src, comm);
 
                 if (idx + chunk_idx > 0) {
-                    entry_factory::make_entry<reduce_local_entry>(sched,
-                                                                  reduce_in_buf,
-                                                                  reduce_chunk_size,
-                                                                  reduce_inout_buf,
-                                                                  nullptr,
-                                                                  dtype,
-                                                                  op);
+                    entry_factory::create<reduce_local_entry>(sched,
+                                                              reduce_in_buf,
+                                                              reduce_chunk_size,
+                                                              reduce_inout_buf,
+                                                              nullptr,
+                                                              dtype,
+                                                              op);
                     sched->add_barrier();
                 }
 
@@ -324,13 +326,13 @@ ccl::status ccl_coll_build_ring_reduce_scatter(ccl_sched* sched,
                     reduce_in_buf += recv_chunk_offset;
                     reduce_inout_buf += recv_chunk_offset;
 
-                    entry_factory::make_entry<reduce_local_entry>(sched,
-                                                                  reduce_in_buf,
-                                                                  recv_chunk_size,
-                                                                  reduce_inout_buf,
-                                                                  nullptr,
-                                                                  dtype,
-                                                                  op);
+                    entry_factory::create<reduce_local_entry>(sched,
+                                                              reduce_in_buf,
+                                                              recv_chunk_size,
+                                                              reduce_inout_buf,
+                                                              nullptr,
+                                                              dtype,
+                                                              op);
                 }
             }
 
@@ -343,3 +345,42 @@ ccl::status ccl_coll_build_ring_reduce_scatter(ccl_sched* sched,
 
     return status;
 }
+
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+// Appends the topo reduce_scatter steps to sched; every per-rank block count is recv_count.
+ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched,
+                                               ccl_buffer send_buf,
+                                               ccl_buffer recv_buf,
+                                               size_t recv_count,
+                                               const ccl_datatype& dtype,
+                                               ccl::reduction reduction,
+                                               ccl_comm* comm) {
+    LOG_DEBUG("build topo reduce_scatter, recv_count ", recv_count);
+    // order: handle exchange, a2a reduce_scatter entry, sched barrier, comm barrier
+    const std::vector<ze_handle_exchange_entry::mem_desc_t> in_buffers{
+        { send_buf.get_ptr(), ccl::ze::ipc_mem_type::memory }, // 0
+    };
+    // presumably the position of send_buf in in_buffers (the "// 0" slot) - TODO confirm
+    size_t send_buf_idx = 0;
+    // only send_buf is listed for the handle exchange; recv_buf is not part of in_buffers
+    ccl::add_handle_exchange(sched, comm, in_buffers);
+    // NOTE(review): blocks_count is a local - confirm the entry copies the counts from .data()
+    std::vector<ze_event_handle_t> wait_events; // left empty
+    std::vector<size_t> blocks_count(comm->size(), recv_count); // one recv_count per rank
+    entry_factory::create<ze_a2a_reduce_scatter_entry>(sched,
+                                                       send_buf,
+                                                       recv_buf,
+                                                       blocks_count.data(),
+                                                       dtype,
+                                                       reduction,
+                                                       comm,
+                                                       wait_events,
+                                                       send_buf_idx);
+    sched->add_barrier();
+
+    ccl::add_comm_barrier(sched, comm);
+
+    return ccl::status::success;
+}
+
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
diff --git a/src/coll/algorithms/sparse_allreduce/sparse_allreduce.hpp b/src/coll/algorithms/sparse_allreduce/sparse_allreduce.hpp
index 32c341a7a..699c01f82 100644
--- a/src/coll/algorithms/sparse_allreduce/sparse_allreduce.hpp
+++ b/src/coll/algorithms/sparse_allreduce/sparse_allreduce.hpp
@@ -15,6 +15,7 @@
 */
 #include "oneapi/ccl/type_traits.hpp"
 #include "coll/algorithms/sparse_allreduce/sparse_handler.hpp"
+#include "common/utils/memcpy.hpp"
 #include "sched/entry/factory/entry_factory.hpp"
 
 #define CCL_COALESCE_RESERVE_SIZE 16
@@ -163,7 +164,7 @@
         param_nnz.dtype = ccl_datatype_int8; \
         param_nnz.comm = comm; \
 \
-        entry_factory::make_entry<coll_entry>(sched, param_nnz); \
+        entry_factory::create<coll_entry>(sched, param_nnz); \
         sched->add_barrier(); \
     } while (0)
 
@@ -357,10 +358,8 @@ ccl::status sparse_reduce_ring(const void* ctx) {
         std::vector<v_type> buf_v(merge_idx_len * sa_hndl->val_dim_cnt);
 
         /* copy what we already have reduced*/
-        ccl_comp_copy(
-            snd_i, buf_i.data(), sa_hndl->itype_size * sa_hndl->dst_count[0], ccl_datatype_int8);
-        ccl_comp_copy(
-            snd_v, buf_v.data(), sa_hndl->vtype_size * sa_hndl->dst_count[1], ccl_datatype_int8);
+        ccl_comp_copy(snd_i, buf_i.data(), sa_hndl->itype_size * sa_hndl->dst_count[0]);
+        ccl_comp_copy(snd_v, buf_v.data(), sa_hndl->vtype_size * sa_hndl->dst_count[1]);
 
         size_t idx_offset = 0;
         for (auto id : unique_indices_ids) {
@@ -389,15 +388,12 @@ ccl::status sparse_reduce_ring(const void* ctx) {
                                 new_dst_size))
                                .get_ptr();
 
-        ccl_comp_copy(buf_i.data(),
-                      (i_type*)(sa_hndl->dst_buf),
-                      sa_hndl->itype_size * merge_idx_len,
-                      ccl_datatype_int8);
+        ccl_comp_copy(
+            buf_i.data(), (i_type*)(sa_hndl->dst_buf), sa_hndl->itype_size * merge_idx_len);
 
         ccl_comp_copy(buf_v.data(),
                       (v_type*)((char*)(sa_hndl->dst_buf) + sa_hndl->itype_size * merge_idx_len),
-                      sa_hndl->vtype_size * merge_idx_len * sa_hndl->val_dim_cnt,
-                      ccl_datatype_int8);
+                      sa_hndl->vtype_size * merge_idx_len * sa_hndl->val_dim_cnt);
 
         sa_hndl->dst_count[0] = merge_idx_len;
         sa_hndl->dst_count[1] = merge_idx_len * sa_hndl->val_dim_cnt;
@@ -406,8 +402,7 @@ ccl::status sparse_reduce_ring(const void* ctx) {
 
     ccl_comp_copy(sa_hndl->recv_buf,
                   sa_hndl->send_tmp_buf,
-                  idx_size + sa_hndl->send_count[1] * sa_hndl->vtype_size,
-                  ccl_datatype_int8);
+                  idx_size + sa_hndl->send_count[1] * sa_hndl->vtype_size);
 
     sa_hndl->iter++;
 
@@ -489,7 +484,7 @@ ccl::status sparse_set_max_buf_size_ring(const void* ctx) {
     size_t max_size = max_nnz * common_size_part;
 
     sa_hndl->send_tmp_buf = sa_hndl->sched->alloc_buffer(max_size).get_ptr();
-    CCL_MEMCPY(sa_hndl->send_tmp_buf, sa_hndl->dst_buf, sa_hndl->dst_count[0] * common_size_part);
+    ccl::memcpy(sa_hndl->send_tmp_buf, sa_hndl->dst_buf, sa_hndl->dst_count[0] * common_size_part);
     sa_hndl->recv_buf = sa_hndl->sched->alloc_buffer(max_size).get_ptr();
 
     return ccl::status::success;
@@ -505,7 +500,7 @@ ccl::status sparse_coalesce_ring(const void* ctx) {
 
     sa_hndl->send_count[0] = iv_map_cnt; /* index count */
     sa_hndl->send_count[1] = iv_map_cnt * sa_hndl->val_dim_cnt; /* value count */
-    CCL_MEMCPY(&sa_hndl->dst_count, &sa_hndl->send_count, sizeof(size_t) * 2);
+    ccl::memcpy(&sa_hndl->dst_count, &sa_hndl->send_count, sizeof(size_t) * 2);
 
     CCL_SPARSE_ALLREDUCE_IF_SINGLE_RANK();
     return ccl::status::success;
@@ -559,37 +554,37 @@ ccl::status ccl_coll_build_sparse_allreduce_ring(ccl_sched* sched,
     sa_hndl->recv_counts =
         static_cast<size_t*>(sched->alloc_buffer(sizeof(size_t) * comm_size).get_ptr());
 
-    entry_factory::make_entry<function_entry>(sched, sparse_coalesce_ring<i_type, v_type>, sa_hndl);
+    entry_factory::create<function_entry>(sched, sparse_coalesce_ring<i_type, v_type>, sa_hndl);
     sched->add_barrier();
 
     if (comm_size > 1) {
         CCL_SPARSE_ALLREDUCE_ADD_NNZ_ENTRY();
 
-        entry_factory::make_entry<function_entry>(sched, sparse_set_max_buf_size_ring, sa_hndl);
+        entry_factory::create<function_entry>(sched, sparse_set_max_buf_size_ring, sa_hndl);
         sched->add_barrier();
 
         for (int i = 0; i < comm_size - 1; i++) {
             /* send local data to the right neighbour */
-            send_entry* se = entry_factory::make_entry<send_entry>(
+            send_entry* se = entry_factory::create<send_entry>(
                 sched, ccl_buffer(), 0, ccl_datatype_int8, send_to, comm);
             se->set_field_fn<ccl_sched_entry_field_buf>(sparse_get_send_buf_ring, sa_hndl);
             se->set_field_fn<ccl_sched_entry_field_cnt>(sparse_get_send_count_ring, sa_hndl);
 
             /* receive data from the left neighbour */
-            recv_entry* re = entry_factory::make_entry<recv_entry>(
+            recv_entry* re = entry_factory::create<recv_entry>(
                 sched, ccl_buffer(), 0, ccl_datatype_int8, recv_from, comm);
             re->set_field_fn<ccl_sched_entry_field_buf>(sparse_get_recv_buf_ring, sa_hndl);
             re->set_field_fn<ccl_sched_entry_field_cnt>(sparse_get_recv_count_ring, sa_hndl);
             sched->add_barrier();
 
             /* reduce data */
-            entry_factory::make_entry<function_entry>(
+            entry_factory::create<function_entry>(
                 sched, sparse_reduce_ring<i_type, v_type>, sa_hndl);
             sched->add_barrier();
         }
 
         /* copy all reduced data to recv_buf */
-        entry_factory::make_entry<function_entry>(
+        entry_factory::create<function_entry>(
             sched, sparse_prepare_result_ring<i_type, v_type>, sa_hndl);
         sched->add_barrier();
     }
@@ -626,9 +621,9 @@ ccl::status sparse_create_matrix_mask(const void* ctx) {
         auto elem = sa_hndl->iv_map->find(*it);
         if (elem != sa_hndl->iv_map->end()) {
             /* copy values from dst_buf to matrix */
-            CCL_MEMCPY(matrix + idx_offset * sa_hndl->val_dim_cnt,
-                       values + elem->second[0],
-                       value_line_size);
+            ccl::memcpy(matrix + idx_offset * sa_hndl->val_dim_cnt,
+                        values + elem->second[0],
+                        value_line_size);
         }
         else {
             /* no index was found locally, fill the line with mask */
@@ -647,10 +642,7 @@ ccl::status sparse_create_matrix_mask(const void* ctx) {
                                           sa_hndl->vtype_size * sa_hndl->dst_count[1])
             .get_ptr();
 
-    ccl_comp_copy(matrix,
-                  (char*)sa_hndl->dst_buf + idx_cnt * sa_hndl->itype_size,
-                  matrix_size,
-                  ccl_datatype_int8);
+    ccl_comp_copy(matrix, (char*)sa_hndl->dst_buf + idx_cnt * sa_hndl->itype_size, matrix_size);
 
     CCL_FREE(matrix);
     sa_hndl->iv_map->clear();
@@ -765,13 +757,13 @@ ccl::status ccl_coll_build_sparse_allreduce_mask(ccl_sched* sched,
     sa_hndl->recv_counts =
         static_cast<size_t*>(sched->alloc_buffer(sizeof(size_t) * comm_size).get_ptr());
 
-    entry_factory::make_entry<function_entry>(sched, sparse_coalesce_mask<i_type, v_type>, sa_hndl);
+    entry_factory::create<function_entry>(sched, sparse_coalesce_mask<i_type, v_type>, sa_hndl);
     sched->add_barrier();
 
     if (comm_size > 1) {
         CCL_SPARSE_ALLREDUCE_ADD_NNZ_ENTRY();
 
-        entry_factory::make_entry<function_entry>(sched, sparse_nnz_per_rank_mask, sa_hndl);
+        entry_factory::create<function_entry>(sched, sparse_nnz_per_rank_mask, sa_hndl);
         sched->add_barrier();
 
         ccl_coll_entry_param param_allgatherv{};
@@ -784,13 +776,13 @@ ccl::status ccl_coll_build_sparse_allreduce_mask(ccl_sched* sched,
         param_allgatherv.comm = comm;
 
         /* gather indices from all the processes */
-        coll_entry* e = entry_factory::make_entry<coll_entry>(sched, param_allgatherv);
+        coll_entry* e = entry_factory::create<coll_entry>(sched, param_allgatherv);
         e->set_field_fn<ccl_sched_entry_field_send_buf>(sparse_get_send_buf_mask, sa_hndl);
         e->set_field_fn<ccl_sched_entry_field_recv_buf>(sparse_get_allgatherv_buf_mask, sa_hndl);
         e->set_field_fn<ccl_sched_entry_field_send_count>(sparse_get_send_count_mask, sa_hndl);
         sched->add_barrier();
 
-        entry_factory::make_entry<function_entry>(
+        entry_factory::create<function_entry>(
             sched, sparse_create_matrix_mask<i_type, v_type>, sa_hndl);
         sched->add_barrier();
 
@@ -804,7 +796,7 @@ ccl::status ccl_coll_build_sparse_allreduce_mask(ccl_sched* sched,
         param_allreduce.comm = comm;
 
         /* coll allreduce on matrix data */
-        coll_entry* ce = entry_factory::make_entry<coll_entry>(sched, param_allreduce);
+        coll_entry* ce = entry_factory::create<coll_entry>(sched, param_allreduce);
         ce->set_field_fn<ccl_sched_entry_field_send_buf>(sparse_get_allreduce_buf_mask, sa_hndl);
         ce->set_field_fn<ccl_sched_entry_field_recv_buf>(sparse_get_allreduce_buf_mask, sa_hndl);
         ce->set_field_fn<ccl_sched_entry_field_cnt>(sparse_get_allreduce_count_mask, sa_hndl);
@@ -1085,7 +1077,7 @@ ccl::status ccl_coll_build_sparse_allreduce_3_allgatherv(ccl_sched* sched,
               sa_hndl->recv_counts);
 
     if (sched->coll_attr.sparse_coalesce_mode != ccl::sparse_coalesce_mode::disable) {
-        entry_factory::make_entry<function_entry>(
+        entry_factory::create<function_entry>(
             sched, sparse_coalesce_allgatherv<i_type, v_type>, sa_hndl);
         sched->add_barrier();
 
@@ -1099,7 +1091,7 @@ ccl::status ccl_coll_build_sparse_allreduce_3_allgatherv(ccl_sched* sched,
 
     CCL_SPARSE_ALLREDUCE_ADD_NNZ_ENTRY();
 
-    entry_factory::make_entry<function_entry>(sched, sparse_alloc_result_buf_allgatherv, sa_hndl);
+    entry_factory::create<function_entry>(sched, sparse_alloc_result_buf_allgatherv, sa_hndl);
     sched->add_barrier();
 
     // allgather indices
@@ -1113,12 +1105,12 @@ ccl::status ccl_coll_build_sparse_allreduce_3_allgatherv(ccl_sched* sched,
     param_i.dtype = index_dtype;
     param_i.comm = comm;
 
-    coll_entry* ce = entry_factory::make_entry<coll_entry>(sched, param_i, parallel_request_index);
+    coll_entry* ce = entry_factory::create<coll_entry>(sched, param_i, parallel_request_index);
     ce->set_field_fn<ccl_sched_entry_field_send_buf>(sparse_get_i_send_allgatherv, sa_hndl);
     ce->set_field_fn<ccl_sched_entry_field_recv_buf>(sparse_get_i_recv_allgatherv, sa_hndl);
     ce->set_field_fn<ccl_sched_entry_field_send_count>(sparse_get_send_count_allgatherv<0>,
                                                        sa_hndl);
-    entry_factory::make_entry<function_entry>(sched, sparse_set_v_counts_allgatherv<1>, sa_hndl);
+    entry_factory::create<function_entry>(sched, sparse_set_v_counts_allgatherv<1>, sa_hndl);
 
     // allgather values
     parallel_request_index++;
@@ -1131,7 +1123,7 @@ ccl::status ccl_coll_build_sparse_allreduce_3_allgatherv(ccl_sched* sched,
     param_v.dtype = value_dtype;
     param_v.comm = comm;
 
-    ce = entry_factory::make_entry<coll_entry>(sched, param_v, parallel_request_index);
+    ce = entry_factory::create<coll_entry>(sched, param_v, parallel_request_index);
     ce->set_field_fn<ccl_sched_entry_field_send_buf>(sparse_get_v_send_allgatherv, sa_hndl);
     ce->set_field_fn<ccl_sched_entry_field_recv_buf>(sparse_get_v_recv_allgatherv, sa_hndl);
     ce->set_field_fn<ccl_sched_entry_field_send_count>(sparse_get_send_count_allgatherv<1>,
@@ -1139,11 +1131,10 @@ ccl::status ccl_coll_build_sparse_allreduce_3_allgatherv(ccl_sched* sched,
     sched->add_barrier();
 
     if (sched->coll_attr.sparse_coalesce_mode == ccl::sparse_coalesce_mode::disable) {
-        entry_factory::make_entry<function_entry>(
-            sched, sparse_return_gathered_allgatherv, sa_hndl);
+        entry_factory::create<function_entry>(sched, sparse_return_gathered_allgatherv, sa_hndl);
     }
     else {
-        entry_factory::make_entry<function_entry>(
+        entry_factory::create<function_entry>(
             sched, sparse_reduce_gathered_allgatherv<i_type, v_type>, sa_hndl);
     }
     sched->add_barrier();
diff --git a/src/coll/coll.cpp b/src/coll/coll.cpp
index 0dca06c03..8af985d79 100644
--- a/src/coll/coll.cpp
+++ b/src/coll/coll.cpp
@@ -51,7 +51,7 @@
 #include "common/global/global.hpp"
 
 #include "coll/algorithms/algorithms.hpp"
-#include "coll/algorithms/algorithms_enum.hpp"
+#include "coll/algorithms/algorithm_utils.hpp"
 #include "coll/algorithms/allreduce/allreduce_2d.hpp"
 #include "coll/algorithms/sparse_allreduce/sparse_allreduce.hpp"
 #include "coll/selection/selection.hpp"
@@ -63,6 +63,13 @@
 static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr& in_attr) {
     ccl_coll_attr& attr = const_cast<ccl_coll_attr&>(in_attr);
 
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+    uint64_t operation_create_time = 0;
+    if (ccl::global_data::env().enable_kernel_profile && param.stream) {
+        operation_create_time = ccl::ze::calculate_global_time(param.stream->get_ze_device());
+    }
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+
 #ifdef CCL_ENABLE_SYCL
     if (ccl::global_data::env().enable_op_sync)
         attr.synchronous = 1;
@@ -85,8 +92,9 @@ static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr&
     bool postpone_schedule = false;
     if (ccl::global_data::env().enable_unordered_coll) {
         if (!attr.match_id.empty()) {
-            auto comm =
-                param.comm->unordered_coll_manager->get_comm(std::string(attr.match_id)).get();
+            auto comm = param.comm->get_unordered_coll_manager()
+                            ->get_comm(std::string(attr.match_id))
+                            .get();
             if (!comm) {
                 if (attr.synchronous) {
                     CCL_THROW("unsupported collective (synchronous && unordered && !communicator)");
@@ -107,6 +115,12 @@ static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr&
     /* 2. create or get schedule */
     ccl_master_sched* sched = ccl_master_sched::create(param, attr);
 
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+    if (ccl::global_data::env().enable_kernel_profile && param.stream) {
+        sched->get_kernel_timer().set_operation_create_time(operation_create_time);
+    }
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+
     /* 3. fuse schedule */
     if (!postpone_schedule && ccl::global_data::env().enable_fusion) {
         if (data.fusion_manager->add(sched)) {
@@ -128,7 +142,7 @@ static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr&
             user has provided match_id that has not been resolved yet.
             schedule will be postponed until comm resolution
         */
-        return param.comm->unordered_coll_manager->postpone(sched);
+        return param.comm->get_unordered_coll_manager()->postpone(sched);
     }
 
     /* 6. regular schedule execution */
@@ -138,6 +152,13 @@ static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr&
         request = nullptr;
     }
 
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+    if (ccl::global_data::env().enable_kernel_profile && sched->coll_param.stream) {
+        sched->get_kernel_timer().set_operation_start_time(
+            ccl::ze::calculate_global_time(sched->coll_param.stream->get_ze_device()));
+    }
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+
     return request;
 }
 
@@ -155,7 +176,12 @@ ccl::status ccl_coll_build_allgatherv(ccl_sched* sched,
     param.recv_counts = recv_counts;
     param.dtype = dtype;
     param.comm = comm;
+    param.stream = sched->coll_param.stream;
+    param.buf = send_buf.get_ptr();
     param.is_vector_buf = sched->coll_attr.is_vector_buf;
+#ifdef CCL_ENABLE_SYCL
+    param.is_sycl_buf = sched->coll_attr.is_sycl_buf;
+#endif // CCL_ENABLE_SYCL
     param.hint_algo = sched->hint_algo;
 
     auto algo = ccl::global_data::get().algorithm_selector->get<ccl_coll_allgatherv>(param);
@@ -173,6 +199,12 @@ ccl::status ccl_coll_build_allgatherv(ccl_sched* sched,
             CCL_CALL(ccl_coll_build_ring_allgatherv(
                 sched, send_buf, send_count, recv_buf, recv_counts, dtype, comm));
             break;
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+        case ccl_coll_allgatherv_topo:
+            CCL_CALL(ccl_coll_build_topo_allgatherv(
+                sched, send_buf, send_count, recv_buf, recv_counts, dtype, comm));
+            break;
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
         default:
             CCL_FATAL("unexpected allgatherv_algo ", ccl_coll_algorithm_to_str(algo));
             return ccl::status::invalid_arguments;
@@ -214,8 +246,8 @@ ccl::status ccl_coll_build_allreduce(ccl_sched* sched,
             CCL_CALL(ccl_coll_build_rabenseifner_allreduce(
                 sched, send_buf, recv_buf, count, dtype, reduction, comm));
             break;
-        case ccl_coll_allreduce_starlike:
-            CCL_CALL(ccl_coll_build_starlike_allreduce(
+        case ccl_coll_allreduce_nreduce:
+            CCL_CALL(ccl_coll_build_nreduce_allreduce(
                 sched, send_buf, recv_buf, count, dtype, reduction, comm));
             break;
         case ccl_coll_allreduce_ring:
@@ -242,15 +274,15 @@ ccl::status ccl_coll_build_allreduce(ccl_sched* sched,
                 sched, send_buf, recv_buf, count, dtype, reduction, comm));
             break;
         case ccl_coll_allreduce_2d:
-            CCL_CALL(comm->allreduce_2d_builder->build(
+            CCL_CALL(comm->get_allreduce_2d_builder()->build(
                 sched, send_buf, recv_buf, count, dtype, reduction));
             break;
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
-        case ccl_coll_allreduce_topo_ring:
-            CCL_CALL(ccl_coll_build_gpu_allreduce(
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+        case ccl_coll_allreduce_topo:
+            CCL_CALL(ccl_coll_build_topo_allreduce(
                 sched, send_buf, recv_buf, count, dtype, reduction, comm));
             break;
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
         default:
             CCL_FATAL("unexpected allreduce_algo ", ccl_coll_algorithm_to_str(algo));
             return ccl::status::invalid_arguments;
@@ -272,6 +304,10 @@ ccl::status ccl_coll_build_alltoall(ccl_sched* sched,
     param.count = count;
     param.dtype = dtype;
     param.comm = comm;
+    param.stream = sched->coll_param.stream;
+#ifdef CCL_ENABLE_SYCL
+    param.is_sycl_buf = sched->coll_attr.is_sycl_buf;
+#endif // CCL_ENABLE_SYCL
     param.hint_algo = sched->hint_algo;
 
     auto algo = ccl::global_data::get().algorithm_selector->get<ccl_coll_alltoall>(param);
@@ -301,6 +337,10 @@ ccl::status ccl_coll_build_alltoallv(ccl_sched* sched,
     param.ctype = ccl_coll_alltoallv;
     param.dtype = dtype;
     param.comm = comm;
+    param.stream = sched->coll_param.stream;
+#ifdef CCL_ENABLE_SYCL
+    param.is_sycl_buf = sched->coll_attr.is_sycl_buf;
+#endif // CCL_ENABLE_SYCL
     param.hint_algo = sched->hint_algo;
 
     auto algo = ccl::global_data::get().algorithm_selector->get<ccl_coll_alltoallv>(param);
@@ -357,6 +397,7 @@ ccl::status ccl_coll_build_bcast(ccl_sched* sched,
     param.dtype = dtype;
     param.comm = comm;
     param.stream = sched->coll_param.stream;
+    param.buf = buf.get_ptr();
 #ifdef CCL_ENABLE_SYCL
     param.is_sycl_buf = sched->coll_attr.is_sycl_buf;
 #endif // CCL_ENABLE_SYCL
@@ -387,11 +428,11 @@ ccl::status ccl_coll_build_bcast(ccl_sched* sched,
         case ccl_coll_bcast_naive:
             CCL_CALL(ccl_coll_build_naive_bcast(sched, buf, count, dtype, root, comm));
             break;
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
-        case ccl_coll_bcast_topo_ring:
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+        case ccl_coll_bcast_topo:
             CCL_CALL(ccl_coll_build_gpu_bcast(sched, buf, count, dtype, root, comm));
             break;
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
         default:
             CCL_FATAL("unexpected bcast_algo ", ccl_coll_algorithm_to_str(algo));
             return ccl::status::invalid_arguments;
@@ -408,6 +449,7 @@ ccl::status ccl_coll_build_reduce(ccl_sched* sched,
                                   int root,
                                   ccl_comm* comm) {
     ccl::status status = ccl::status::success;
+    CCL_THROW_IF_NOT(root >= 0 && root < comm->size(), "wrong root");
 
     ccl_selector_param param;
     param.ctype = ccl_coll_reduce;
@@ -415,6 +457,10 @@ ccl::status ccl_coll_build_reduce(ccl_sched* sched,
     param.dtype = dtype;
     param.comm = comm;
     param.stream = sched->coll_param.stream;
+    param.buf = send_buf.get_ptr();
+#ifdef CCL_ENABLE_SYCL
+    param.is_sycl_buf = sched->coll_attr.is_sycl_buf;
+#endif // CCL_ENABLE_SYCL
     param.hint_algo = sched->hint_algo;
 
     auto algo = ccl::global_data::get().algorithm_selector->get<ccl_coll_reduce>(param);
@@ -444,12 +490,12 @@ ccl::status ccl_coll_build_reduce(ccl_sched* sched,
                 root == 0 ? comm->dtree() : comm->dtree().copy_with_new_root(root),
                 comm));
             break;
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
-        case ccl_coll_reduce_topo_ring:
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+        case ccl_coll_reduce_topo:
             CCL_CALL(ccl_coll_build_gpu_reduce(
                 sched, send_buf, recv_buf, count, dtype, reduction, root, comm));
             break;
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
         default:
             CCL_FATAL("unexpected reduce_algo ", ccl_coll_algorithm_to_str(algo));
             return ccl::status::invalid_arguments;
@@ -473,6 +519,11 @@ ccl::status ccl_coll_build_reduce_scatter(ccl_sched* sched,
     param.count = count;
     param.dtype = dtype;
     param.comm = comm;
+    param.stream = sched->coll_param.stream;
+    param.buf = send_buf.get_ptr();
+#ifdef CCL_ENABLE_SYCL
+    param.is_sycl_buf = sched->coll_attr.is_sycl_buf;
+#endif // CCL_ENABLE_SYCL
     param.hint_algo = sched->hint_algo;
 
     auto algo = ccl::global_data::get().algorithm_selector->get<ccl_coll_reduce_scatter>(param);
@@ -494,6 +545,12 @@ ccl::status ccl_coll_build_reduce_scatter(ccl_sched* sched,
                     sched, send_buf, recv_buf, count, dtype, reduction, comm));
             }
             break;
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+        case ccl_coll_reduce_scatter_topo:
+            CCL_CALL(ccl_coll_build_topo_reduce_scatter(
+                sched, send_buf, recv_buf, count, dtype, reduction, comm));
+            break;
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
         default:
             CCL_FATAL("unexpected reduce_scatter_algo ", ccl_coll_algorithm_to_str(algo));
             return ccl::status::invalid_arguments;
diff --git a/src/coll/coll.hpp b/src/coll/coll.hpp
index b455eefe7..e38d9941a 100644
--- a/src/coll/coll.hpp
+++ b/src/coll/coll.hpp
@@ -15,11 +15,11 @@
 */
 #pragma once
 
-#include "coll/algorithms/algorithms_enum.hpp"
-#include "common/comm/comm.hpp"
+#include "coll/algorithms/algorithm_utils.hpp"
 #include "coll/coll_param.hpp"
-#include "common/stream/stream.hpp"
+#include "common/comm/comm.hpp"
 #include "common/datatype/datatype.hpp"
+#include "common/stream/stream.hpp"
 #include "common/utils/buffer.hpp"
 
 #include "coll/coll_common_attributes.hpp"
diff --git a/src/coll/coll_param.cpp b/src/coll/coll_param.cpp
index d3cb9b54b..6483b0625 100644
--- a/src/coll/coll_param.cpp
+++ b/src/coll/coll_param.cpp
@@ -17,6 +17,7 @@
 
 #include "coll/coll_param.hpp"
 #include "common/global/global.hpp"
+#include "common/utils/sycl_utils.hpp"
 
 #define COPY_COMMON_OP_ATTRS(from, to) \
     to->prologue_fn = nullptr; /*from.get<ccl::operation_attr_id::prologue_fn>().get();*/ \
@@ -74,20 +75,6 @@ ccl_coll_attr::ccl_coll_attr(const ccl::sparse_allreduce_attr& attr) {
     sparse_coalesce_mode = attr.get<ccl::sparse_allreduce_attr_id::coalesce_mode>();
 }
 
-bool operator==(const coll_param_gpu& lhs, const coll_param_gpu& rhs) {
-    CCL_ASSERT((lhs.is_reduction() && rhs.is_reduction()) ||
-               (!lhs.is_reduction() && !rhs.is_reduction()));
-
-    bool res =
-        lhs.get_coll_type() == rhs.get_coll_type() && lhs.get_datatype() == rhs.get_datatype();
-
-    if (lhs.is_reduction()) {
-        res = res && (lhs.get_reduction() == rhs.get_reduction());
-    }
-
-    return res;
-}
-
 std::string ccl_coll_attr::to_string() const {
     std::stringstream ss;
 
@@ -144,14 +131,19 @@ std::string ccl_coll_param::to_string() const {
     ss << "{ ";
     ss << "coll: " << ccl_coll_type_to_str(ctype);
 
-    if (!send_bufs.empty())
-        ss << ", sb: " << get_send_buf() << ", sc: " << get_send_count();
+    if (!send_bufs.empty()) { // counts are summed as size_t so the total is not narrowed to int
+        ss << ", sb: " << get_send_buf()
+           << ", sc: " << std::accumulate(send_counts.begin(), send_counts.end(), size_t(0));
+    }
 
-    if (!recv_bufs.empty())
-        ss << ", rb: " << get_recv_buf() << ", rc: " << get_recv_count();
+    if (!recv_bufs.empty()) { // counts are summed as size_t so the total is not narrowed to int
+        ss << ", rb: " << get_recv_buf()
+           << ", rc: " << std::accumulate(recv_counts.begin(), recv_counts.end(), size_t(0));
+    }
 
-    if (ctype != ccl_coll_barrier)
+    if (ctype != ccl_coll_barrier) {
         ss << ", dt: " << ccl::global_data::get().dtypes->name(dtype);
+    }
 
     if (ctype == ccl_coll_allreduce || ctype == ccl_coll_reduce ||
         ctype == ccl_coll_reduce_scatter) {
@@ -223,7 +215,15 @@ bool ccl_coll_param::is_inplace(buf_type type) const {
     }
 
     void* send_buf = get_send_buf(0, type);
-    void* recv_buf = get_recv_buf(0, type);
+    void* recv_buf = nullptr;
+
+    if ((ctype == ccl_coll_allgatherv) && (recv_bufs.size() > 1)) {
+        recv_buf = get_recv_buf(comm->rank(), type);
+    }
+    else {
+        recv_buf = get_recv_buf(0, type);
+    }
+
     return (send_buf && (send_buf == recv_buf)) ? true : false;
 }
 
@@ -468,7 +468,7 @@ void ccl_coll_param::sync_deps(const ccl_stream* s, const std::vector<ccl::event
         // do anything and just return an empty event as opposed to submit_barrier without paramers
         // which submits a full queue barrier. And there is a bug which leads to a crash if
         // empty sycl event is passed to the function.
-        auto sycl_ev = s->get_native_stream().submit_barrier();
+        auto sycl_ev = ccl::utils::submit_barrier(s->get_native_stream());
         auto e = ccl::create_event(sycl_ev);
         copy_deps(ds, &e);
         return;
diff --git a/src/coll/coll_param.hpp b/src/coll/coll_param.hpp
index 049f5c56a..31e8b4731 100644
--- a/src/coll/coll_param.hpp
+++ b/src/coll/coll_param.hpp
@@ -17,7 +17,7 @@
 
 #include <vector>
 
-#include "coll/algorithms/algorithms_enum.hpp"
+#include "coll/algorithms/algorithm_utils.hpp"
 #include "common/datatype/datatype.hpp"
 #include "oneapi/ccl.hpp"
 
@@ -228,44 +228,3 @@ struct ccl_coll_param {
                                                       const ccl_stream* stream,
                                                       const std::vector<ccl::event>& deps = {});
 };
-
-class coll_param_gpu {
-    ccl_coll_type ctype;
-    ccl::datatype dtype;
-    ccl::reduction red;
-
-public:
-    coll_param_gpu(ccl_coll_type ctype, ccl::datatype dtype, ccl::reduction red)
-            : ctype{ ctype },
-              dtype{ dtype },
-              red{ red } {}
-
-    coll_param_gpu(ccl_coll_type ctype, ccl::datatype dtype)
-            : ctype{ ctype },
-              dtype{ dtype },
-              red{ (ccl::reduction)-1 } {
-        assert(!is_reduction() && "This constructor is invalid for reduction types");
-    }
-
-    ccl_coll_type get_coll_type() const {
-        return ctype;
-    }
-
-    ccl::datatype get_datatype() const {
-        return dtype;
-    }
-
-    bool is_reduction() const {
-        return ccl_coll_type_is_reduction(get_coll_type());
-    }
-
-    ccl::reduction get_reduction() const {
-        if (!is_reduction()) {
-            throw ccl::exception(
-                "get_ruduction(): is not supported for non-reduction collective type, i.e. bcast");
-        }
-        return red;
-    }
-};
-
-bool operator==(const coll_param_gpu& lhs, const coll_param_gpu& rhs);
diff --git a/src/coll/coll_util.cpp b/src/coll/coll_util.cpp
new file mode 100644
index 000000000..679260dfa
--- /dev/null
+++ b/src/coll/coll_util.cpp
@@ -0,0 +1,104 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll_util.hpp"
+
+#include "sched/entry/coll/coll_entry_helper.hpp"
+#include "sched/entry/factory/entry_factory.hpp"
+#include "sched/entry/ze/ze_event_signal_entry.hpp"
+#include "sched/entry/ze/ze_event_wait_entry.hpp"
+
+namespace ccl {
+
+void add_wait_events(ccl_sched* sched, const std::vector<ze_event_handle_t>& wait_events) {
+    if (wait_events.size() > 0) {
+        entry_factory::create<ze_event_wait_entry>(sched, wait_events);
+        sched->add_barrier();
+    }
+}
+
+void add_signal_event(ccl_sched* sched, ze_event_handle_t signal_event) {
+    if (signal_event) {
+        entry_factory::create<ze_event_signal_entry>(sched, signal_event);
+        sched->add_barrier();
+    }
+}
+
+ze_event_handle_t add_signal_event(ccl_sched* sched) {
+    auto signal_event = sched->get_memory().event_manager->create();
+    add_signal_event(sched, signal_event);
+    return signal_event;
+}
+
+void add_comm_barrier(ccl_sched* sched,
+                      ccl_comm* comm,
+                      ze_event_pool_handle_t ipc_pool,
+                      size_t ipc_event_idx) {
+    if (ipc_pool && global_data::env().enable_ze_barrier) {
+        entry_factory::create<ze_barrier_entry>(sched, comm, ipc_pool, ipc_event_idx);
+    }
+    else {
+        ccl_coll_entry_param barrier_param{};
+        barrier_param.ctype = ccl_coll_barrier;
+        barrier_param.comm = comm;
+
+        /* TODO: optimize p2p based barrier */
+        //barrier_param.hint_algo.barrier = ccl_coll_barrier_ring;
+
+        coll_entry_helper::add_coll_entry<ccl_coll_barrier>(sched, barrier_param);
+    }
+    sched->add_barrier();
+}
+
+// Schedules a communicator barrier and returns a newly created event whose signal
+// entry is added after it. wait_events are waited on first only in single-list mode.
+ze_event_handle_t add_comm_barrier(ccl_sched* sched,
+                                   ccl_comm* comm,
+                                   const std::vector<ze_event_handle_t>& wait_events,
+                                   ze_event_pool_handle_t ipc_pool,
+                                   size_t ipc_event_idx) {
+    auto barrier_done_event = sched->get_memory().event_manager->create();
+
+    // the wait entry is the only step that differs between the two list modes
+    if (sched->get_memory().use_single_list) {
+        add_wait_events(sched, wait_events);
+    }
+    add_comm_barrier(sched, comm, ipc_pool, ipc_event_idx);
+    add_signal_event(sched, barrier_done_event);
+    return barrier_done_event;
+}
+
+// Adds a ze_handle_exchange_entry followed by a barrier. For cached schedules the
+// exchange entry is created in exec-once mode and a comm barrier is appended.
+void add_handle_exchange(ccl_sched* sched,
+                         ccl_comm* comm,
+                         const std::vector<ze_handle_exchange_entry::mem_desc_t>& in_buffers,
+                         int skip_rank,
+                         ze_event_pool_handle_t pool,
+                         size_t event_idx) {
+    const bool is_cached = sched->coll_attr.to_cache;
+    if (is_cached) {
+        sched->set_entry_exec_mode(ccl_sched_entry_exec_once);
+    }
+    entry_factory::create<ze_handle_exchange_entry>(sched, comm, in_buffers, skip_rank);
+    sched->add_barrier();
+    if (is_cached) {
+        sched->set_entry_exec_mode(ccl_sched_entry_exec_regular);
+        // TODO: no need barrier for the first iteration where ze_handle_exchange_entry exists
+        add_comm_barrier(sched, comm, pool, event_idx);
+    }
+}
+
+} // namespace ccl
diff --git a/src/coll/coll_util.hpp b/src/coll/coll_util.hpp
new file mode 100644
index 000000000..52f52bab6
--- /dev/null
+++ b/src/coll/coll_util.hpp
@@ -0,0 +1,45 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "common/global/global.hpp"
+#include "sched/entry/ze/ze_handle_exchange_entry.hpp"
+
+namespace ccl {
+
+void add_wait_events(ccl_sched* sched, const std::vector<ze_event_handle_t>& wait_events);
+void add_signal_event(ccl_sched* sched, ze_event_handle_t signal_event);
+ze_event_handle_t add_signal_event(ccl_sched* sched);
+
+void add_comm_barrier(ccl_sched* sched,
+                      ccl_comm* comm,
+                      ze_event_pool_handle_t ipc_pool = {},
+                      size_t ipc_event_idx = 0);
+
+ze_event_handle_t add_comm_barrier(ccl_sched* sched,
+                                   ccl_comm* comm,
+                                   const std::vector<ze_event_handle_t>& wait_events,
+                                   ze_event_pool_handle_t ipc_pool = {},
+                                   size_t ipc_event_idx = 0);
+
+void add_handle_exchange(ccl_sched* sched,
+                         ccl_comm* comm,
+                         const std::vector<ze_handle_exchange_entry::mem_desc_t>& in_buffers,
+                         int skip_rank = ccl_comm::invalid_rank,
+                         ze_event_pool_handle_t pool = nullptr,
+                         size_t event_idx = 0);
+
+} // namespace ccl
diff --git a/src/coll/selection/selection.cpp b/src/coll/selection/selection.cpp
index a90c83600..48ca3d811 100644
--- a/src/coll/selection/selection.cpp
+++ b/src/coll/selection/selection.cpp
@@ -14,8 +14,53 @@
  limitations under the License.
 */
 #include "coll/selection/selection.hpp"
+#include "common/comm/comm.hpp"
 #include "common/global/global.hpp"
 
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+#include <CL/sycl/backend_types.hpp>
+#include "common/utils/sycl_utils.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+
+std::string to_string(const ccl_selector_param& param) {
+    std::stringstream ss;
+
+    ss << "{ "
+       << "coll: " << ccl_coll_type_to_str(param.ctype) << ", count: " << param.count
+       << ", dt: " << ccl::global_data::get().dtypes->name(param.dtype);
+
+    if (param.comm) {
+        ss << ", comm: { rank: " << param.comm->rank() << ", size: " << param.comm->size() << " }";
+    }
+
+    if (param.stream) {
+        ss << ", stream: " << param.stream->to_string();
+    }
+
+    if (param.buf) {
+        ss << ", buf: " << param.buf;
+    }
+
+    if (param.is_vector_buf) {
+        ss << ", vector_buf";
+    }
+
+#ifdef CCL_ENABLE_SYCL
+    if (param.is_sycl_buf) {
+        ss << ", sycl_buf";
+    }
+#endif // CCL_ENABLE_SYCL
+
+    if (param.hint_algo.has_value()) {
+        ss << ", hint_algo: " << param.hint_algo.value;
+    }
+
+    ss << " }";
+
+    return ss.str();
+}
+
 bool ccl_is_direct_algo(const ccl_selector_param& param) {
     bool res = false;
 
@@ -49,22 +94,195 @@ bool ccl_is_direct_algo(const ccl_selector_param& param) {
     return res;
 }
 
+namespace checkers {
+
+bool is_family1_card(const ccl_selector_param& param) {
+    if (param.stream) {
+        return param.stream->get_device_family() == ccl::device_family::family1;
+    }
+    return false;
+}
+
+bool is_coll_supported(std::initializer_list<ccl_coll_type> colls, ccl_coll_type value) {
+    return std::find(colls.begin(), colls.end(), value) != colls.end();
+}
+
+bool is_sycl_buf(const ccl_selector_param& param) {
+#ifdef CCL_ENABLE_SYCL
+    return param.is_sycl_buf;
+#endif // CCL_ENABLE_SYCL
+    return false;
+}
+
+bool is_device_buf(const ccl_selector_param& param) {
+#ifdef CCL_ENABLE_SYCL
+    if (param.buf && param.stream) {
+        auto ctx = param.stream->get_native_stream().get_context();
+        return sycl::get_pointer_type(param.buf, ctx) == sycl::usm::alloc::device;
+    }
+#endif // CCL_ENABLE_SYCL
+    return true;
+}
+
+bool is_l0_backend(const ccl_selector_param& param) {
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+    if (param.stream) {
+        return param.stream->get_backend() == ccl::utils::get_level_zero_backend();
+    }
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+    return false;
+}
+
+bool is_gpu_stream(const ccl_selector_param& param) {
+    if (param.stream) {
+        return param.stream->is_gpu();
+    }
+    return false;
+}
+
+bool is_single_node(const ccl_selector_param& param) {
+    size_t local_proc_count = ccl::global_data::get().executor->get_local_proc_count();
+    return static_cast<size_t>(param.comm->size()) == local_proc_count;
+}
+
+bool is_single_card(const ccl_selector_param& param) {
+    return (param.comm->size() == 2) && is_single_node(param);
+}
+
+} // namespace checkers
+
+#define RETURN_FALSE_IF(cond, ...) \
+    do { \
+        if (cond) { \
+            LOG_DEBUG("selection checker: ", ##__VA_ARGS__); \
+            return false; \
+        } \
+    } while (0)
+
 static bool ccl_is_device_side_algo(ccl_coll_algo algo, const ccl_selector_param& param) {
-    if (param.ctype == ccl_coll_allreduce) {
-        return algo.allreduce == ccl_coll_allreduce_topo_ring;
+    CCL_THROW_IF_NOT(algo.has_value(), "empty algo value");
+
+    if (param.ctype == ccl_coll_allgatherv) {
+        return algo.allgatherv == ccl_coll_allgatherv_topo;
     }
-    else if (param.ctype == ccl_coll_reduce) {
-        return algo.reduce == ccl_coll_reduce_topo_ring;
+    else if (param.ctype == ccl_coll_allreduce) {
+        return algo.allreduce == ccl_coll_allreduce_topo;
     }
     else if (param.ctype == ccl_coll_bcast) {
-        return algo.bcast == ccl_coll_bcast_topo_ring;
+        return algo.bcast == ccl_coll_bcast_topo;
+    }
+    else if (param.ctype == ccl_coll_reduce) {
+        return algo.reduce == ccl_coll_reduce_topo;
+    }
+    else if (param.ctype == ccl_coll_reduce_scatter) {
+        return algo.reduce_scatter == ccl_coll_reduce_scatter_topo;
     }
 
     return false;
 }
 
+bool ccl_is_device_side_algo(const ccl_selector_param& param) {
+#ifndef CCL_ENABLE_SYCL
+    return false;
+#endif // CCL_ENABLE_SYCL
+
+    auto supported_colls = { ccl_coll_allgatherv,
+                             ccl_coll_allreduce,
+                             ccl_coll_bcast,
+                             ccl_coll_reduce,
+                             ccl_coll_reduce_scatter };
+    RETURN_FALSE_IF(!checkers::is_coll_supported(supported_colls, param.ctype),
+                    "coll ",
+                    ccl_coll_type_to_str(param.ctype),
+                    " is not supported");
+
+    ccl_coll_algo algo{};
+    auto& selector = ccl::global_data::get().algorithm_selector;
+
+    if (param.ctype == ccl_coll_allgatherv) {
+        algo.allgatherv = selector->get<ccl_coll_allgatherv>(param);
+    }
+    else if (param.ctype == ccl_coll_allreduce) {
+        algo.allreduce = selector->get<ccl_coll_allreduce>(param);
+    }
+    else if (param.ctype == ccl_coll_bcast) {
+        algo.bcast = selector->get<ccl_coll_bcast>(param);
+    }
+    else if (param.ctype == ccl_coll_reduce) {
+        algo.reduce = selector->get<ccl_coll_reduce>(param);
+    }
+    else if (param.ctype == ccl_coll_reduce_scatter) {
+        algo.reduce_scatter = selector->get<ccl_coll_reduce_scatter>(param);
+    }
+
+    return ccl_is_device_side_algo(algo, param);
+}
+
+bool ccl_can_use_topo_algo(const ccl_selector_param& param) { // true if no check below fails
+    auto supported_colls = { ccl_coll_allgatherv,
+                             ccl_coll_allreduce,
+                             ccl_coll_bcast,
+                             ccl_coll_reduce,
+                             ccl_coll_reduce_scatter };
+    RETURN_FALSE_IF(!checkers::is_coll_supported(supported_colls, param.ctype),
+                    "coll is not supported");
+    // values shared by the communicator-size checks further down
+    size_t local_proc_count = ccl::global_data::get().executor->get_local_proc_count();
+    int comm_size = param.comm->size();
+    // stream, buffer and backend requirements
+    RETURN_FALSE_IF(!checkers::is_gpu_stream(param), "non-gpu stream is not supported");
+    RETURN_FALSE_IF(checkers::is_sycl_buf(param), "sycl buffer is not supported");
+    RETURN_FALSE_IF(!checkers::is_device_buf(param), "non-device buffers is not supported");
+    RETURN_FALSE_IF(!checkers::is_l0_backend(param), "non-l0 backend is not supported");
+    // environment settings that rule out topo algorithms
+    RETURN_FALSE_IF(ccl::global_data::env().enable_fusion, "fusion is not supported");
+    RETURN_FALSE_IF(ccl::global_data::env().enable_unordered_coll,
+                    "unordered coll is not supported");
+    RETURN_FALSE_IF(ccl::global_data::env().priority_mode != ccl_priority_none, "wrong priority");
+    RETURN_FALSE_IF(ccl::global_data::env().worker_count != 1, "unsupported count of workers");
+    // family1 device restrictions, applied unless disable_ze_family_check is set
+#ifdef CCL_ENABLE_SYCL
+    if (!ccl::global_data::env().disable_ze_family_check) {
+        RETURN_FALSE_IF(
+            checkers::is_family1_card(param) &&
+                (((!checkers::is_single_card(param) &&
+                   ((param.ctype == ccl_coll_allreduce || param.ctype == ccl_coll_reduce ||
+                     param.ctype == ccl_coll_allgatherv)))) ||
+                 (param.ctype == ccl_coll_reduce_scatter)),
+            "family1 multi-card for ",
+            ccl_coll_type_to_str(param.ctype),
+            " is not supported");
+    }
+#endif // CCL_ENABLE_SYCL
+    // communicator size / local process count combinations that are rejected
+    RETURN_FALSE_IF((((param.ctype == ccl_coll_bcast) || (param.ctype == ccl_coll_reduce)) &&
+                     ((comm_size < 2) || (local_proc_count == 1))) ||
+                        ((param.ctype == ccl_coll_allreduce || param.ctype == ccl_coll_reduce) &&
+                         (comm_size <= 2) && (local_proc_count == 1)),
+                    "unsupported comm size for ",
+                    ccl_coll_type_to_str(param.ctype));
+    // bcast and reduce_scatter are accepted on a single node only
+    RETURN_FALSE_IF((param.ctype == ccl_coll_bcast || param.ctype == ccl_coll_reduce_scatter) &&
+                        !checkers::is_single_node(param),
+                    "multi-node for ",
+                    ccl_coll_type_to_str(param.ctype),
+                    " is not supported");
+    // reduce: comm size must be a multiple of the local process count
+    RETURN_FALSE_IF(((param.ctype == ccl_coll_reduce) && (comm_size % local_proc_count != 0)),
+                    "ppn must be equal");
+    // allgatherv beyond a single card has the same divisibility requirement
+    RETURN_FALSE_IF(param.ctype == ccl_coll_allgatherv && !checkers::is_single_card(param) &&
+                        comm_size % local_proc_count != 0,
+                    "ppn must be equal");
+    // multi-node case: an odd local process count is rejected
+    RETURN_FALSE_IF(!checkers::is_single_card(param) && !checkers::is_single_node(param) &&
+                        (local_proc_count % 2 != 0),
+                    "odd proc count per node is not supported");
+    return true;
+}
+
 bool ccl_can_use_datatype(ccl_coll_algo algo, const ccl_selector_param& param) {
-    // A regular type, so we don't need to check for an additional support
+    // regular datatype, don't need to check for an additional support
     if (param.dtype.idx() != ccl::datatype::bfloat16 &&
         param.dtype.idx() != ccl::datatype::float16) {
         return true;
@@ -74,10 +292,10 @@ bool ccl_can_use_datatype(ccl_coll_algo algo, const ccl_selector_param& param) {
 
     bool device_side_algo = ccl_is_device_side_algo(algo, param);
 
-    // Algorithms running on GPU device support both fp16 and bf16, so we don't need to require their
-    // support on the host.
+    // algorithms running on device side support fp16 and bf16 both
+    // so we don't need to require their support on the host
     if (!device_side_algo) {
-        if (param.dtype.idx() == ccl::datatype::bfloat16) {
+        if (param.dtype == ccl::datatype::bfloat16) {
             bool bf16_hw_support =
                 ccl::global_data::env().bf16_impl_type != ccl_bf16_no_hardware_support;
             bool bf16_compiler_support =
@@ -94,7 +312,7 @@ bool ccl_can_use_datatype(ccl_coll_algo algo, const ccl_selector_param& param) {
                           bf16_compiler_support);
             }
         }
-        else if (param.dtype.idx() == ccl::datatype::float16) {
+        else if (param.dtype == ccl::datatype::float16) {
             bool fp16_hw_support =
                 ccl::global_data::env().fp16_impl_type != ccl_fp16_no_hardware_support;
             bool fp16_compiler_support =
@@ -115,71 +333,3 @@ bool ccl_can_use_datatype(ccl_coll_algo algo, const ccl_selector_param& param) {
 
     return can_use;
 }
-
-bool ccl_is_topo_ring_algo(const ccl_selector_param& param) {
-#ifndef CCL_ENABLE_SYCL
-    return false;
-#endif // CCL_ENABLE_SYCL
-
-    if ((param.ctype != ccl_coll_allreduce) && (param.ctype != ccl_coll_bcast) &&
-        (param.ctype != ccl_coll_reduce)) {
-        return false;
-    }
-
-    bool res = false;
-
-    auto& selector = ccl::global_data::get().algorithm_selector;
-
-    if (param.ctype == ccl_coll_allreduce) {
-        res = (selector->get<ccl_coll_allreduce>(param) == ccl_coll_allreduce_topo_ring);
-    }
-    else if (param.ctype == ccl_coll_bcast) {
-        res = (selector->get<ccl_coll_bcast>(param) == ccl_coll_bcast_topo_ring);
-    }
-    else if (param.ctype == ccl_coll_reduce) {
-        res = (selector->get<ccl_coll_reduce>(param) == ccl_coll_reduce_topo_ring);
-    }
-
-    return res;
-}
-
-bool ccl_can_use_topo_ring_algo(const ccl_selector_param& param) {
-    if ((param.ctype != ccl_coll_allreduce) && (param.ctype != ccl_coll_bcast) &&
-        (param.ctype != ccl_coll_reduce)) {
-        return false;
-    }
-
-    bool is_sycl_buf = false;
-    bool is_device_buf = true;
-    bool is_l0_backend = false;
-
-    size_t local_proc_count = ccl::global_data::get().executor->get_local_proc_count();
-
-#ifdef CCL_ENABLE_SYCL
-    is_sycl_buf = param.is_sycl_buf;
-    if (param.buf && param.stream) {
-        auto ctx = param.stream->get_native_stream().get_context();
-        is_device_buf =
-            (sycl::get_pointer_type(param.buf, ctx) == sycl::usm::alloc::device) ? true : false;
-    }
-#ifdef MULTI_GPU_SUPPORT
-    if (param.stream && param.stream->get_backend() == sycl::backend::level_zero) {
-        is_l0_backend = true;
-    }
-#endif // MULTI_GPU_SUPPORT
-#endif // CCL_ENABLE_SYCL
-
-    if ((param.comm->size() != 2 && param.comm->size() != 4) ||
-        (param.comm->size() == 2 && param.comm->size() != static_cast<int>(local_proc_count)) ||
-        (param.comm->size() == 4 && local_proc_count != 2 && local_proc_count != 4) ||
-        (param.comm->size() != 2 && (ccl::global_data::env().atl_transport == ccl_atl_mpi)) ||
-        !param.stream || (param.stream->get_type() != stream_type::gpu) || is_sycl_buf ||
-        !is_device_buf || !is_l0_backend || ccl::global_data::env().enable_fusion ||
-        ccl::global_data::env().enable_unordered_coll ||
-        (ccl::global_data::env().priority_mode != ccl_priority_none) ||
-        (ccl::global_data::env().worker_count != 1)) {
-        return false;
-    }
-
-    return true;
-}
diff --git a/src/coll/selection/selection.hpp b/src/coll/selection/selection.hpp
index 2a8fe2a28..9f3fc34dc 100644
--- a/src/coll/selection/selection.hpp
+++ b/src/coll/selection/selection.hpp
@@ -17,8 +17,9 @@
 
 #include "coll/selection/selector_wrapper.hpp"
 
-bool ccl_can_use_datatype(ccl_coll_algo algo, const ccl_selector_param& param);
-
 bool ccl_is_direct_algo(const ccl_selector_param& param);
-bool ccl_is_topo_ring_algo(const ccl_selector_param& param);
-bool ccl_can_use_topo_ring_algo(const ccl_selector_param& param);
+bool ccl_is_device_side_algo(const ccl_selector_param& param);
+
+bool ccl_can_use_topo_algo(const ccl_selector_param& param);
+
+bool ccl_can_use_datatype(ccl_coll_algo algo, const ccl_selector_param& param);
diff --git a/src/coll/selection/selector_allgatherv.cpp b/src/coll/selection/selector_allgatherv.cpp
index 28e5ebaa6..92311364d 100644
--- a/src/coll/selection/selector_allgatherv.cpp
+++ b/src/coll/selection/selector_allgatherv.cpp
@@ -15,6 +15,8 @@
 */
 #include "coll/selection/selection.hpp"
 
+#include <numeric>
+
 template <>
 std::map<ccl_coll_allgatherv_algo, std::string>
     ccl_algorithm_selector_helper<ccl_coll_allgatherv_algo>::algo_names = {
@@ -22,7 +24,8 @@ std::map<ccl_coll_allgatherv_algo, std::string>
         std::make_pair(ccl_coll_allgatherv_naive, "naive"),
         std::make_pair(ccl_coll_allgatherv_ring, "ring"),
         std::make_pair(ccl_coll_allgatherv_flat, "flat"),
-        std::make_pair(ccl_coll_allgatherv_multi_bcast, "multi_bcast")
+        std::make_pair(ccl_coll_allgatherv_multi_bcast, "multi_bcast"),
+        std::make_pair(ccl_coll_allgatherv_topo, "topo")
     };
 
 ccl_algorithm_selector<ccl_coll_allgatherv>::ccl_algorithm_selector() {
@@ -33,8 +36,9 @@ ccl_algorithm_selector<ccl_coll_allgatherv>::ccl_algorithm_selector() {
                CCL_SELECTION_MAX_COLL_SIZE,
                ccl_coll_allgatherv_ring);
     }
-    else if (ccl::global_data::env().atl_transport == ccl_atl_mpi)
+    else if (ccl::global_data::env().atl_transport == ccl_atl_mpi) {
         insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_allgatherv_direct);
+    }
 
     insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_allgatherv_flat);
 }
@@ -46,15 +50,21 @@ bool ccl_algorithm_selector_helper<ccl_coll_allgatherv_algo>::can_use(
     const ccl_selection_table_t<ccl_coll_allgatherv_algo>& table) {
     bool can_use = true;
 
-    if (param.is_vector_buf && algo != ccl_coll_allgatherv_flat &&
-        algo != ccl_coll_allgatherv_multi_bcast)
+    if (algo == ccl_coll_allgatherv_topo && !ccl_can_use_topo_algo(param)) {
         can_use = false;
-    else if (ccl::global_data::env().atl_transport == ccl_atl_mpi &&
-             algo == ccl_coll_allgatherv_multi_bcast)
+    }
+    else if (param.is_vector_buf && algo != ccl_coll_allgatherv_flat &&
+             algo != ccl_coll_allgatherv_multi_bcast) {
         can_use = false;
+    }
+    else if (algo == ccl_coll_allgatherv_multi_bcast &&
+             ccl::global_data::env().atl_transport == ccl_atl_mpi) {
+        can_use = false;
+    }
     else if (algo == ccl_coll_allgatherv_direct &&
-             (ccl::global_data::env().atl_transport == ccl_atl_ofi))
+             ccl::global_data::env().atl_transport == ccl_atl_ofi) {
         can_use = false;
+    }
 
     return can_use;
 }
@@ -63,11 +73,11 @@ CCL_SELECTION_DEFINE_HELPER_METHODS(ccl_coll_allgatherv_algo,
                                     ccl_coll_allgatherv,
                                     ccl::global_data::env().allgatherv_algo_raw,
                                     ({
-                                        CCL_ASSERT(param.recv_counts);
-                                        size_t count = 0;
-                                        for (int idx = 0; idx < param.comm->size(); idx++) {
-                                            count += param.recv_counts[idx];
-                                        }
+                                        CCL_THROW_IF_NOT(param.recv_counts);
+                                        /* sum as size_t: an int init value narrows the total */
+                                        size_t count = std::accumulate(
+                                            param.recv_counts,
+                                            param.recv_counts + param.comm->size(), size_t(0));
                                         count /= param.comm->size();
                                         count;
                                     }));
diff --git a/src/coll/selection/selector_allreduce.cpp b/src/coll/selection/selector_allreduce.cpp
index 2ca2aa831..1c7dc96d7 100644
--- a/src/coll/selection/selector_allreduce.cpp
+++ b/src/coll/selection/selector_allreduce.cpp
@@ -20,33 +20,42 @@ std::map<ccl_coll_allreduce_algo, std::string>
     ccl_algorithm_selector_helper<ccl_coll_allreduce_algo>::algo_names = {
         std::make_pair(ccl_coll_allreduce_direct, "direct"),
         std::make_pair(ccl_coll_allreduce_rabenseifner, "rabenseifner"),
-        std::make_pair(ccl_coll_allreduce_starlike, "starlike"),
+        std::make_pair(ccl_coll_allreduce_nreduce, "nreduce"),
         std::make_pair(ccl_coll_allreduce_ring, "ring"),
         std::make_pair(ccl_coll_allreduce_ring_rma, "ring_rma"),
         std::make_pair(ccl_coll_allreduce_double_tree, "double_tree"),
         std::make_pair(ccl_coll_allreduce_recursive_doubling, "recursive_doubling"),
         std::make_pair(ccl_coll_allreduce_2d, "2d"),
-        std::make_pair(ccl_coll_allreduce_topo_ring, "topo_ring")
+        std::make_pair(ccl_coll_allreduce_topo, "topo"),
     };
 
 ccl_algorithm_selector<ccl_coll_allreduce>::ccl_algorithm_selector() {
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
-    insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_allreduce_topo_ring);
-#else // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+    insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_allreduce_topo);
+    if (ccl::global_data::env().atl_transport == ccl_atl_ofi) {
+        insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_allreduce_ring);
+        insert(
+            fallback_table, 0, CCL_ALLREDUCE_SHORT_MSG_SIZE, ccl_coll_allreduce_recursive_doubling);
+    }
+    else {
+        insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_allreduce_direct);
+    }
+#else // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
     if (ccl::global_data::env().atl_transport == ccl_atl_ofi) {
         insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_allreduce_ring);
         insert(main_table, 0, CCL_ALLREDUCE_SHORT_MSG_SIZE, ccl_coll_allreduce_recursive_doubling);
         insert(main_table,
                CCL_ALLREDUCE_SHORT_MSG_SIZE + 1,
                CCL_ALLREDUCE_MEDIUM_MSG_SIZE,
-               ccl_coll_allreduce_starlike);
+               ccl_coll_allreduce_nreduce);
     }
-    else if (ccl::global_data::env().atl_transport == ccl_atl_mpi)
+    else if (ccl::global_data::env().atl_transport == ccl_atl_mpi) {
         insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_allreduce_direct);
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+    }
 
     insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_allreduce_ring);
     insert(fallback_table, 0, CCL_ALLREDUCE_SHORT_MSG_SIZE, ccl_coll_allreduce_recursive_doubling);
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 }
 
 template <>
@@ -63,9 +72,9 @@ bool ccl_algorithm_selector_helper<ccl_coll_allreduce_algo>::can_use(
     if (algo == ccl_coll_allreduce_rabenseifner &&
         static_cast<int>(param.count) < param.comm->pof2())
         can_use = false;
-    else if (algo == ccl_coll_allreduce_ring_rma && !atl_wrapper::attr.out.enable_rma)
+    else if (algo == ccl_coll_allreduce_ring_rma && !atl_base_comm::attr.out.enable_rma)
         can_use = false;
-    else if (algo == ccl_coll_allreduce_starlike && !(param.count / param.comm->size()))
+    else if (algo == ccl_coll_allreduce_nreduce && !(param.count / param.comm->size()))
         can_use = false;
     else if (algo == ccl_coll_allreduce_2d &&
              (ccl::global_data::env().atl_transport == ccl_atl_mpi))
@@ -73,7 +82,7 @@ bool ccl_algorithm_selector_helper<ccl_coll_allreduce_algo>::can_use(
     else if (algo == ccl_coll_allreduce_direct &&
              (ccl::global_data::env().atl_transport == ccl_atl_ofi))
         can_use = false;
-    else if (algo == ccl_coll_allreduce_topo_ring && !ccl_can_use_topo_ring_algo(param))
+    else if (algo == ccl_coll_allreduce_topo && !ccl_can_use_topo_algo(param))
         can_use = false;
 
     return can_use;
diff --git a/src/coll/selection/selector_bcast.cpp b/src/coll/selection/selector_bcast.cpp
index 786bc22e1..604e1c54b 100644
--- a/src/coll/selection/selector_bcast.cpp
+++ b/src/coll/selection/selector_bcast.cpp
@@ -22,13 +22,13 @@ std::map<ccl_coll_bcast_algo, std::string>
         std::make_pair(ccl_coll_bcast_ring, "ring"),
         std::make_pair(ccl_coll_bcast_double_tree, "double_tree"),
         std::make_pair(ccl_coll_bcast_naive, "naive"),
-        std::make_pair(ccl_coll_bcast_topo_ring, "topo_ring")
+        std::make_pair(ccl_coll_bcast_topo, "topo")
     };
 
 ccl_algorithm_selector<ccl_coll_bcast>::ccl_algorithm_selector() {
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
-    insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_bcast_topo_ring);
-#else // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+    insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_bcast_topo);
+#else // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
     if (ccl::global_data::env().atl_transport == ccl_atl_ofi) {
         insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_bcast_naive);
         insert(main_table, 0, CCL_BCAST_SHORT_MSG_SIZE, ccl_coll_bcast_double_tree);
@@ -36,7 +36,7 @@ ccl_algorithm_selector<ccl_coll_bcast>::ccl_algorithm_selector() {
     else if (ccl::global_data::env().atl_transport == ccl_atl_mpi) {
         insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_bcast_direct);
     }
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 
     insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_bcast_naive);
 }
@@ -57,10 +57,12 @@ bool ccl_algorithm_selector_helper<ccl_coll_bcast_algo>::can_use(
         can_use = false;
     }
     else if (algo == ccl_coll_bcast_direct &&
-             (ccl::global_data::env().atl_transport == ccl_atl_ofi))
+             (ccl::global_data::env().atl_transport == ccl_atl_ofi)) {
         can_use = false;
-    else if (algo == ccl_coll_bcast_topo_ring && !ccl_can_use_topo_ring_algo(param))
+    }
+    else if (algo == ccl_coll_bcast_topo && !ccl_can_use_topo_algo(param)) {
         can_use = false;
+    }
 
     return can_use;
 }
diff --git a/src/coll/selection/selector_impl.hpp b/src/coll/selection/selector_impl.hpp
index 92bbe443f..6952d4ac4 100644
--- a/src/coll/selection/selector_impl.hpp
+++ b/src/coll/selection/selector_impl.hpp
@@ -29,6 +29,8 @@
 #define CCL_SELECTION_ALGO_DELIMETER    ':'
 #define CCL_SELECTION_SIZE_DELIMETER    '-'
 
+std::string to_string(const ccl_selector_param& param);
+
 template <typename algo_group_type>
 void ccl_selection_unpack_elem(size_t& size,
                                algo_group_type& algo,
@@ -74,7 +76,7 @@ void ccl_algorithm_selector_base<algo_group_type>::init() {
         try {
             if (!std::getline(block_stream, algo_name_str, CCL_SELECTION_ALGO_DELIMETER))
                 CCL_THROW(
-                    "can't parse algorithm name from string: ", str_to_parse, ", block: ", block);
+                    "can not parse algorithm name from string: ", str_to_parse, ", block: ", block);
         }
         catch (const std::istream::failure& e) {
             LOG_ERROR("exception happened: ",
@@ -85,7 +87,8 @@ void ccl_algorithm_selector_base<algo_group_type>::init() {
                       block_stream.eof(),
                       "\nbadbit: ",
                       block_stream.bad());
-            CCL_THROW("can't parse algorithm name from string: ", str_to_parse, ", block: ", block);
+            CCL_THROW(
+                "can not parse algorithm name from string: ", str_to_parse, ", block: ", block);
         }
 
         LOG_TRACE("block ", block, ", algo_name_str ", algo_name_str);
@@ -103,7 +106,7 @@ void ccl_algorithm_selector_base<algo_group_type>::init() {
                 block_stream.str(block.substr(algo_name_str.length() + 1));
                 if (!std::getline(block_stream, size_str, CCL_SELECTION_SIZE_DELIMETER))
                     CCL_THROW(
-                        "can't parse left size from string: ", str_to_parse, ", block: ", block);
+                        "can not parse left size from string: ", str_to_parse, ", block: ", block);
                 if (!size_str.compare(CCL_SELECTION_MAX_COLL_SIZE_STR))
                     left_size = CCL_SELECTION_MAX_COLL_SIZE;
                 else
@@ -111,14 +114,17 @@ void ccl_algorithm_selector_base<algo_group_type>::init() {
             }
             catch (const std::exception& e) {
                 LOG_ERROR("exception happened during left size parsing: ", e.what());
-                CCL_THROW("can't parse left size from string: ", str_to_parse, ", block: ", block);
+                CCL_THROW(
+                    "can not parse left size from string: ", str_to_parse, ", block: ", block);
             }
 
             try {
                 block_stream.str(block.substr(algo_name_str.length() + size_str.length() + 2));
                 if (!std::getline(block_stream, size_str, CCL_SELECTION_SIZE_DELIMETER))
-                    CCL_THROW(
-                        "can't parse second size from string: ", str_to_parse, ", block: ", block);
+                    CCL_THROW("can not parse second size from string: ",
+                              str_to_parse,
+                              ", block: ",
+                              block);
                 if (!size_str.compare(CCL_SELECTION_MAX_COLL_SIZE_STR))
                     right_size = CCL_SELECTION_MAX_COLL_SIZE;
                 else
@@ -126,7 +132,8 @@ void ccl_algorithm_selector_base<algo_group_type>::init() {
             }
             catch (const std::exception& e) {
                 LOG_ERROR("exception happened during right size parsing: ", e.what());
-                CCL_THROW("can't parse right size from string: ", str_to_parse, ", block: ", block);
+                CCL_THROW(
+                    "can not parse right size from string: ", str_to_parse, ", block: ", block);
             }
 
             LOG_TRACE("algo ", algo_name_str, ", left ", left_size, ", right ", right_size);
@@ -238,13 +245,15 @@ algo_group_type ccl_algorithm_selector_base<algo_group_type>::get(
     algo_group_type elem_algo;
     ccl_selection_border_type elem_border;
 
+    LOG_DEBUG("param: ", ::to_string(param));
+
     size_t count = ccl_algorithm_selector_helper<algo_group_type>::get_count(param);
 
     if (param.hint_algo.has_value()) {
         elem_algo = static_cast<algo_group_type>(param.hint_algo.value);
         if (!ccl_algorithm_selector_helper<algo_group_type>::can_use(
                 elem_algo, param, main_table)) {
-            LOG_DEBUG("can't select hint algorithm: coll ",
+            LOG_DEBUG("can not select hint algorithm: coll ",
                       ccl_coll_type_to_str(param.ctype),
                       ", count ",
                       count,
@@ -269,16 +278,23 @@ algo_group_type ccl_algorithm_selector_base<algo_group_type>::get(
 
     if (lower_bound == main_table.end() ||
         !ccl_algorithm_selector_helper<algo_group_type>::can_use(elem_algo, param, main_table)) {
+        CCL_THROW_IF_NOT(ccl::global_data::env().enable_algo_fallback,
+                         "can not select algo from main table and fallback is disabled",
+                         ", coll ",
+                         ccl_coll_type_to_str(param.ctype),
+                         ", count ",
+                         count);
+
         lower_bound = fallback_table.lower_bound(size);
         ccl_selection_unpack_elem(elem_size, elem_algo, elem_border, lower_bound, fallback_table);
         CCL_THROW_IF_NOT(lower_bound != fallback_table.end(),
-                         "can't select algorithm: coll ",
+                         "can not select algorithm: coll ",
                          ccl_coll_type_to_str(param.ctype),
                          ", count ",
                          count);
         CCL_THROW_IF_NOT(ccl_algorithm_selector_helper<algo_group_type>::can_use(
                              elem_algo, param, fallback_table),
-                         "can't select algorithm in fallback_table: coll ",
+                         "can not select algorithm in fallback_table: coll ",
                          ccl_coll_type_to_str(param.ctype));
     }
 
diff --git a/src/coll/selection/selector_reduce.cpp b/src/coll/selection/selector_reduce.cpp
index 4c13ea035..5e91b1e7c 100644
--- a/src/coll/selection/selector_reduce.cpp
+++ b/src/coll/selection/selector_reduce.cpp
@@ -22,20 +22,20 @@ std::map<ccl_coll_reduce_algo, std::string>
         std::make_pair(ccl_coll_reduce_rabenseifner, "rabenseifner"),
         std::make_pair(ccl_coll_reduce_tree, "tree"),
         std::make_pair(ccl_coll_reduce_double_tree, "double_tree"),
-        std::make_pair(ccl_coll_reduce_topo_ring, "topo_ring")
+        std::make_pair(ccl_coll_reduce_topo, "topo")
     };
 
 ccl_algorithm_selector<ccl_coll_reduce>::ccl_algorithm_selector() {
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
-    insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_reduce_topo_ring);
-#else // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+    insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_reduce_topo);
+#else // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
     if (ccl::global_data::env().atl_transport == ccl_atl_ofi) {
         insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_reduce_tree);
     }
     else if (ccl::global_data::env().atl_transport == ccl_atl_mpi) {
         insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_reduce_direct);
     }
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 
     insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_reduce_tree);
 }
@@ -56,7 +56,7 @@ bool ccl_algorithm_selector_helper<ccl_coll_reduce_algo>::can_use(
     else if (algo == ccl_coll_reduce_direct &&
              (ccl::global_data::env().atl_transport == ccl_atl_ofi))
         can_use = false;
-    else if (algo == ccl_coll_reduce_topo_ring && !ccl_can_use_topo_ring_algo(param))
+    else if (algo == ccl_coll_reduce_topo && !ccl_can_use_topo_algo(param))
         can_use = false;
 
     return can_use;
diff --git a/src/coll/selection/selector_reduce_scatter.cpp b/src/coll/selection/selector_reduce_scatter.cpp
index 3d8f67e01..e68b369dc 100644
--- a/src/coll/selection/selector_reduce_scatter.cpp
+++ b/src/coll/selection/selector_reduce_scatter.cpp
@@ -20,13 +20,18 @@ std::map<ccl_coll_reduce_scatter_algo, std::string>
     ccl_algorithm_selector_helper<ccl_coll_reduce_scatter_algo>::algo_names = {
         std::make_pair(ccl_coll_reduce_scatter_direct, "direct"),
         std::make_pair(ccl_coll_reduce_scatter_ring, "ring"),
+        std::make_pair(ccl_coll_reduce_scatter_topo, "topo"),
     };
 
 ccl_algorithm_selector<ccl_coll_reduce_scatter>::ccl_algorithm_selector() {
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+    insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_reduce_scatter_topo);
+#else // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
     if (ccl::global_data::env().atl_transport == ccl_atl_ofi)
         insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_reduce_scatter_ring);
     else if (ccl::global_data::env().atl_transport == ccl_atl_mpi)
         insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_reduce_scatter_direct);
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 
     insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_reduce_scatter_ring);
 }
@@ -38,8 +43,11 @@ bool ccl_algorithm_selector_helper<ccl_coll_reduce_scatter_algo>::can_use(
     const ccl_selection_table_t<ccl_coll_reduce_scatter_algo>& table) {
     bool can_use = true;
 
-    if (algo == ccl_coll_reduce_scatter_direct &&
-        (ccl::global_data::env().atl_transport == ccl_atl_ofi))
+    if (algo == ccl_coll_reduce_scatter_topo && !ccl_can_use_topo_algo(param)) {
+        can_use = false;
+    }
+    else if (algo == ccl_coll_reduce_scatter_direct &&
+             (ccl::global_data::env().atl_transport == ccl_atl_ofi))
         can_use = false;
 
     return can_use;
diff --git a/src/common/comm/atl_tag.cpp b/src/common/comm/atl_tag.cpp
index 2bca7ba61..a60dd25e6 100644
--- a/src/common/comm/atl_tag.cpp
+++ b/src/common/comm/atl_tag.cpp
@@ -16,12 +16,12 @@
 #include "common/comm/atl_tag.hpp"
 #include "exec/exec.hpp"
 
-void ccl_atl_tag::print() {
-    LOG_INFO("atl-tag:");
-    LOG_INFO("  bits: ", tag_bits);
-    LOG_INFO("  max: ", max_tag);
-    LOG_INFO("  mask: ", max_tag_mask);
-    LOG_INFO("  pof2: ", ccl_pof2(max_tag));
+std::string ccl_atl_tag::to_string() const {
+    std::stringstream ss;
+    ss << "{ "
+       << "bits: " << tag_bits << ", max: " << max_tag << ", mask: " << max_tag_mask
+       << ", pof2: " << ccl_pof2(max_tag) << " }";
+    return ss.str();
 }
 
 uint64_t ccl_atl_tag::create(int rank,
diff --git a/src/common/comm/atl_tag.hpp b/src/common/comm/atl_tag.hpp
index 4c9a46cfc..ac5e20b8a 100644
--- a/src/common/comm/atl_tag.hpp
+++ b/src/common/comm/atl_tag.hpp
@@ -41,7 +41,7 @@ class ccl_atl_tag {
 
     ~ccl_atl_tag() = default;
 
-    void print();
+    std::string to_string() const;
 
     /**
      * Generates the tag to be used by ATL communication operations
diff --git a/src/common/comm/comm.cpp b/src/common/comm/comm.cpp
index fe6e8062e..98f6832a1 100644
--- a/src/common/comm/comm.cpp
+++ b/src/common/comm/comm.cpp
@@ -13,138 +13,267 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
+#include "atl/atl_base_comm.hpp"
 #include "atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.h"
 #include "exec/exec.hpp"
+#include "coll/coll.hpp"
+#include "coll/coll_common_attributes.hpp"
+#include "coll/ccl_allgather_op_attr.hpp"
 #include "common/comm/comm.hpp"
-#include "common/comm/host_communicator/host_communicator.hpp"
+#include "common/comm/comm_impl.hpp"
 #include "common/global/global.hpp"
+#include "common/event/impls/host_event.hpp"
+#include "common/request/request.hpp"
 #include "sched/sched.hpp"
 #include "oneapi/ccl/types.hpp"
 #include "oneapi/ccl/kvs.hpp"
+#include "oneapi/ccl/comm_split_attr_ids.hpp"
+#include "oneapi/ccl/comm_split_attr_ids_traits.hpp"
+#include "oneapi/ccl/comm_split_attr.hpp"
+#include "util/pm/pmi_resizable_rt/pmi_resizable/kvs/ikvs_wrapper.h"
 
-void ccl_comm::allocate_resources() {
-    if (ccl::global_data::env().enable_unordered_coll) {
-        unordered_coll_manager =
-            std::unique_ptr<ccl_unordered_coll_manager>(new ccl_unordered_coll_manager(*this));
+ccl_comm_internal::ccl_comm_internal(int rank, int size, std::shared_ptr<atl_base_comm> atl)
+        : ccl_comm_internal(rank, size, atl->get_rank2rank_map(), atl) {}
+
+ccl_comm_internal::ccl_comm_internal(int rank,
+                                     int size,
+                                     ccl_rank2rank_map&& rank_map,
+                                     std::shared_ptr<atl_base_comm> atl)
+        : atl(atl),
+          m_local2global_map(std::move(rank_map)),
+          m_dtree(size, rank) {
+    reset(rank, size);
+}
+
+ccl_comm_internal::ccl_comm_internal(const std::vector<int>& local_ranks,
+                                     int comm_size,
+                                     std::shared_ptr<ccl::kvs_interface> kvs_instance)
+        : m_local2global_map(),
+          m_dtree(local_ranks.size(), comm_size) {
+    std::shared_ptr<ikvs_wrapper> kvs_wrapper(new users_kvs(kvs_instance));
+
+    atl = atl_comm_manager::create_comm(comm_size, local_ranks, kvs_wrapper);
+
+    reset(atl->get_rank(), atl->get_size());
+}
+
+//TODO: will fix it after OFI refactoring
+int ccl_comm::get_global_rank(int rank, bool only_global) const {
+    // TODO: move map to ccl_comm?
+    const auto& local2global_map = comm_impl->get_local2global_map();
+    // rank is returned untranslated when the map is empty or only_global is false
+    if (local2global_map.empty() || !only_global) {
+        // global comm and its copies do not have entries in the map
+        return rank;
     }
 
-    auto& env_object = ccl::global_data::env();
+    CCL_THROW_IF_NOT(rank >= 0 && (int)local2global_map.size() > rank, // also rejects negative
+                     "no rank ",
+                     rank,
+                     " was found in comm ",
+                     this,
+                     ", id ",
+                     id());
+    int global_rank = local2global_map[rank];
+    LOG_DEBUG("comm ", this, ", id ", id(), ", map rank ", rank, " to global ", global_rank);
+    return global_rank;
+}
 
-    allreduce_2d_builder = std::unique_ptr<ccl_allreduce_2d_builder>(new ccl_allreduce_2d_builder(
-        (env_object.allreduce_2d_base_size != CCL_ENV_SIZET_NOT_SPECIFIED)
-            ? env_object.allreduce_2d_base_size
-            : ccl::global_data::get().executor->get_local_proc_count(),
-        env_object.allreduce_2d_switch_dims,
-        this));
+int ccl_comm::get_rank_from_global(int global_rank) const {
+    const auto& local2global_map = comm_impl->get_local2global_map();
+
+    if (local2global_map.empty()) {
+        // global comm and its copies do not have entries in the map
+        return global_rank;
+    }
+
+    int rank = ccl_comm::invalid_rank;
+
+    for (size_t i = 0; i < local2global_map.size(); ++i) {
+        if (local2global_map[i] == global_rank) {
+            rank = static_cast<int>(i);
+            break;
+        }
+    }
+
+    CCL_THROW_IF_NOT(rank != ccl_comm::invalid_rank, "can't find rank");
+
+    return rank;
+}
 
-    env_object.print(m_rank);
+using ccl::preview::create_comm_split_attr;
+
+ccl_comm::ccl_comm()
+        : device(ccl::device_index_type(ccl::unused_index_value,
+                                        ccl::unused_index_value,
+                                        ccl::unused_index_value)),
+          comm_attr(create_comm_split_attr()) {}
+
+ccl_comm::ccl_comm(int size, ccl::shared_ptr_class<ikvs_wrapper> kvs)
+        : device(ccl::device_index_type(ccl::unused_index_value,
+                                        ccl::unused_index_value,
+                                        ccl::unused_index_value)),
+          comm_attr(create_comm_split_attr()),
+          comm_rank(0),
+          comm_size(size),
+          next_sched_id_internal(ccl_comm_internal::max_sched_count / 2),
+          next_sched_id_external(0) {
+    if (size <= 0) {
+        throw ccl::exception("Incorrect size value when creating a host communicator");
+    }
+}
+
+ccl_comm::ccl_comm(int size, int rank, ccl::shared_ptr_class<ikvs_wrapper> kvs)
+        : ccl_comm(atl_comm_manager::create_comm(size, { rank }, kvs)) {}
+
+ccl_comm::ccl_comm(ccl::unified_device_type&& d,
+                   ccl::unified_context_type&& c,
+                   std::shared_ptr<atl_base_comm> atl)
+        : device(std::move(d)),
+          context(std::move(c)),
+          comm_attr(create_comm_split_attr()),
+          comm_rank(atl->get_rank()),
+          comm_size(atl->get_size()),
+          comm_id(std::unique_ptr<ccl_comm_id_storage::comm_id>(
+              new ccl_comm_id_storage::comm_id(ccl::global_data::get().comm_ids->acquire()))),
+          next_sched_id_internal(ccl_comm_internal::max_sched_count / 2),
+          next_sched_id_external(0) {
+    int rank = atl->get_rank();
+    int size = atl->get_size();
+    // a valid rank lies in [0, size); validate before comm_impl and resources are created
+    if (size <= 0 || rank < 0 || rank >= size) {
+        throw ccl::exception("incorrect rank or size when creating "
+                             "a host communicator: rank: " +
+                             std::to_string(rank) + ", size: " + std::to_string(size));
+    }
+
+    LOG_DEBUG("ctor");
+
+    comm_impl = std::unique_ptr<ccl_comm_internal>(new ccl_comm_internal(rank, size, atl));
+
+    allocate_resources();
+    create_sub_comms(atl);
 }
 
+ccl_comm::ccl_comm(std::shared_ptr<atl_base_comm> atl)
+        : ccl_comm(ccl::device_index_type(ccl::unused_index_value,
+                                          ccl::unused_index_value,
+                                          ccl::unused_index_value),
+                   {},
+                   atl) {}
+
 ccl_comm::ccl_comm(int rank,
                    int size,
                    ccl_comm_id_storage::comm_id&& id,
-                   std::shared_ptr<atl_wrapper> atl,
+                   std::shared_ptr<atl_base_comm> atl,
                    bool share_resources,
-                   ccl::host_communicator* host_comm)
-        : ccl_comm(rank,
-                   size,
-                   std::move(id),
-                   ccl_rank2rank_map{},
-                   atl,
-                   share_resources,
-                   host_comm) {}
+                   bool is_sub_communicator)
+        : comm_impl(std::make_shared<ccl_comm_internal>(rank, size, atl->get_rank2rank_map(), atl)),
+          device(ccl::device_index_type(ccl::unused_index_value,
+                                        ccl::unused_index_value,
+                                        ccl::unused_index_value)),
+          comm_attr(create_comm_split_attr()),
+          comm_rank(rank),
+          comm_size(size),
+          comm_id(std::unique_ptr<ccl_comm_id_storage::comm_id>(
+              new ccl_comm_id_storage::comm_id(std::move(id)))),
+          next_sched_id_internal(ccl_comm_internal::max_sched_count / 2),
+          next_sched_id_external(0) {
+    if (!share_resources) {
+        allocate_resources();
+    }
+
+    if (!is_sub_communicator) {
+        create_sub_comms(comm_impl.get()->atl);
+    }
+}
 
 ccl_comm::ccl_comm(int rank,
                    int size,
                    ccl_comm_id_storage::comm_id&& id,
                    ccl_rank2rank_map&& rank_map,
-                   std::shared_ptr<atl_wrapper> atl,
+                   std::shared_ptr<atl_base_comm> atl,
                    bool share_resources,
-                   ccl::host_communicator* host_comm)
-        : atl(atl),
-          m_id(std::move(id)),
-          m_local2global_map(std::move(rank_map)),
-          m_dtree(size, rank),
-          thread_number(1),
-          on_process_ranks_number(1),
-          host_comm(host_comm) {
-    reset(rank, size);
-
+                   bool is_sub_communicator)
+        : comm_impl(std::make_shared<ccl_comm_internal>(rank, size, std::move(rank_map), atl)),
+          device(ccl::device_index_type(ccl::unused_index_value,
+                                        ccl::unused_index_value,
+                                        ccl::unused_index_value)),
+          comm_attr(create_comm_split_attr()),
+          comm_rank(rank),
+          comm_size(size),
+          comm_id(std::unique_ptr<ccl_comm_id_storage::comm_id>(
+              new ccl_comm_id_storage::comm_id(std::move(id)))),
+          next_sched_id_internal(ccl_comm_internal::max_sched_count / 2),
+          next_sched_id_external(0) {
     if (!share_resources) {
         allocate_resources();
     }
-}
 
-//TODO non-implemented
-//TODO rude simulation of multi-thread barrier
-static std::atomic<size_t> thread_counter{};
-static std::atomic<size_t> thread_ranks_counter{};
-void ccl_comm::ccl_comm_reset_thread_barrier() {
-    // recharge counters again
-    thread_counter.store(0);
-    thread_ranks_counter.store(0);
+    if (!is_sub_communicator) {
+        create_sub_comms(get_atl_comm());
+    }
 }
 
-ccl_comm::ccl_comm(const std::vector<int>& local_ranks,
-                   int comm_size,
-                   std::shared_ptr<ccl::kvs_interface> kvs_instance,
-                   ccl_comm_id_storage::comm_id&& id,
-                   bool share_resources,
-                   ccl::host_communicator* host_comm)
-        : m_id(std::move(id)),
-          m_local2global_map(),
-          m_dtree(local_ranks.size(), comm_size),
-          host_comm(host_comm) {
-    std::shared_ptr<ikvs_wrapper> kvs_wrapper(new users_kvs(kvs_instance));
-
-    atl = std::shared_ptr<atl_wrapper>(new atl_wrapper(comm_size, local_ranks, kvs_wrapper));
+ccl_comm::ccl_comm(const ccl_comm& src, ccl_comm_id_storage::comm_id&& id)
+        : comm_impl(src.comm_impl),
+          device(ccl::device_index_type(ccl::unused_index_value,
+                                        ccl::unused_index_value,
+                                        ccl::unused_index_value)),
+          r2r_comm(src.r2r_comm),
+          node_comm(src.node_comm),
+          even_comm(src.even_comm),
+          pair_comm(src.pair_comm),
+          comm_attr(create_comm_split_attr()),
+          comm_rank(src.rank()),
+          comm_size(src.size()),
+          comm_id(std::unique_ptr<ccl_comm_id_storage::comm_id>(
+              new ccl_comm_id_storage::comm_id(std::move(id)))),
+          next_sched_id_internal(ccl_comm_internal::max_sched_count / 2),
+          next_sched_id_external(0) {}
 
-    thread_number = atl->get_threads_per_process();
-    on_process_ranks_number = atl->get_ranks_per_process();
-
-    reset(atl->get_rank(), atl->get_size());
+ccl::device_index_type ccl_comm::get_device_path() const {
+    return ccl::device_index_type{ ccl::unused_index_value,
+                                   ccl::unused_index_value,
+                                   ccl::unused_index_value };
+}
 
-    if (!share_resources) {
-        allocate_resources();
-    }
+ccl::communicator_interface::device_t ccl_comm::get_device() const {
+    CCL_THROW(std::string(__FUNCTION__) + " is not applicable for " + traits::name());
+    static ccl::communicator_interface::device_t empty;
+    return empty;
 }
 
-ccl_comm* ccl_comm::create_with_colors(const std::vector<int>& colors,
-                                       ccl_comm_id_storage* comm_ids,
-                                       const ccl_comm* parent_comm,
-                                       bool share_resources) {
-    ccl_rank2rank_map rank_map;
-    int new_comm_size = 0;
-    int new_comm_rank = 0;
-    int color = colors[parent_comm->rank()];
-
-    for (int i = 0; i < parent_comm->size(); ++i) {
-        if (colors[i] == color) {
-            LOG_DEBUG("map local rank ", new_comm_size, " to global ", i);
-            rank_map.emplace_back(i);
-            ++new_comm_size;
-            if (i < parent_comm->rank()) {
-                ++new_comm_rank;
-            }
-        }
-    }
+ccl::communicator_interface::context_t ccl_comm::get_context() const {
+    CCL_THROW(std::string(__FUNCTION__) + " is not applicable for " + traits::name());
+    static ccl::communicator_interface::context_t empty;
+    return empty;
+}
 
-    if (new_comm_size == 0) {
-        throw ccl::exception(std::string("no colors matched to ") + std::to_string(color) +
-                             " seems to be exchange issue");
-    }
+void ccl_comm::create_sub_comms(std::shared_ptr<atl_base_comm> atl) {
+    ccl::global_data& data = ccl::global_data::get();
 
-    if (new_comm_size == parent_comm->size()) {
-        // exact copy of the global communicator, use empty map
-        rank_map.clear();
-    }
+    r2r_comm = std::shared_ptr<ccl_comm>(
+        this->create_with_color(atl->get_r2r_color(), data.comm_ids.get(), true));
+    node_comm = std::shared_ptr<ccl_comm>(
+        this->create_with_color(atl->get_host_color(), data.comm_ids.get(), true));
+    even_comm = std::shared_ptr<ccl_comm>(this->create_with_color(
+        atl->get_host_color() + atl->get_rank() % 2, data.comm_ids.get(), true));
+    pair_comm = std::shared_ptr<ccl_comm>(this->create_with_color(
+        atl->get_host_color() + atl->get_rank() / 2, data.comm_ids.get(), true));
+}
 
-    ccl_comm* comm = new ccl_comm(new_comm_rank,
-                                  new_comm_size,
+ccl_comm* ccl_comm::create_with_color(int color,
+                                      ccl_comm_id_storage* comm_ids,
+                                      bool share_resources) const {
+    std::shared_ptr<atl_base_comm> atl_comm = get_atl_comm()->comm_split(color);
+    ccl_comm* comm = new ccl_comm(atl_comm->get_rank(),
+                                  atl_comm->get_size(),
                                   comm_ids->acquire(),
-                                  std::move(rank_map),
-                                  parent_comm->atl,
-                                  share_resources);
+                                  atl_comm->get_rank2rank_map(),
+                                  atl_comm,
+                                  share_resources,
+                                  true);
 
     LOG_DEBUG("new comm: color ",
               color,
@@ -158,32 +287,306 @@ ccl_comm* ccl_comm::create_with_colors(const std::vector<int>& colors,
     return comm;
 }
 
-std::shared_ptr<ccl_comm> ccl_comm::clone_with_new_id(ccl_comm_id_storage::comm_id&& id) {
-    ccl_rank2rank_map rank_map{ m_local2global_map };
-    return std::make_shared<ccl_comm>(m_rank,
-                                      m_size,
-                                      std::move(id),
-                                      std::move(rank_map),
-                                      atl,
-                                      true /*share_resources*/,
-                                      get_host_comm());
-}
-
-int ccl_comm::get_global_rank(int rank) const {
-    if (m_local2global_map.empty()) {
-        // global comm and its copies do not have entries in the map
-        return rank;
+ccl::communicator_interface_ptr ccl_comm::split(const ccl::comm_split_attr& attr) {
+    if (!attr.is_valid<ccl::comm_split_attr_id::color>()) {
+        CCL_THROW(std::string(__FUNCTION__) +
+                  " - 'Color' split attribute for host communicator is not set");
     }
 
-    CCL_THROW_IF_NOT((int)m_local2global_map.size() > rank,
-                     "no rank ",
-                     rank,
-                     " was found in comm ",
-                     this,
-                     ", id ",
-                     m_id.value());
-    int global_rank = m_local2global_map[rank];
-    LOG_DEBUG(
-        "comm , ", this, " id ", m_id.value(), ", map rank ", rank, " to global ", global_rank);
-    return global_rank;
+    ccl::global_data& data = ccl::global_data::get();
+    auto new_comm = this->create_with_color(
+        attr.get<ccl::comm_split_attr_id::color>(), data.comm_ids.get(), true);
+
+    comm_attr = attr;
+
+    return std::shared_ptr<ccl_comm>(new_comm);
+}
+
+ccl::event ccl_comm::barrier(const ccl::stream::impl_value_t& stream,
+                             const ccl::barrier_attr& attr,
+                             const ccl::vector_class<ccl::event>& deps) {
+    return barrier_impl(stream, attr, deps);
+}
+
+ccl::event ccl_comm::barrier_impl(const ccl::stream::impl_value_t& stream,
+                                  const ccl::barrier_attr& attr,
+                                  const ccl::vector_class<ccl::event>& deps) {
+    ccl_barrier_impl(this, stream.get(), deps);
+    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(nullptr));
+}
+
+/* allgatherv */
+ccl::event ccl_comm::allgatherv_impl(const void* send_buf,
+                                     size_t send_count,
+                                     void* recv_buf,
+                                     const ccl::vector_class<size_t>& recv_counts,
+                                     ccl::datatype dtype,
+                                     const ccl::stream::impl_value_t& stream,
+                                     const ccl::allgatherv_attr& attr,
+                                     const ccl::vector_class<ccl::event>& deps) {
+    ccl_request* req = ccl_allgatherv_impl(send_buf,
+                                           send_count,
+                                           recv_buf,
+                                           recv_counts.data(),
+                                           dtype,
+                                           attr,
+                                           this,
+                                           get_stream_ptr(stream),
+                                           deps);
+
+    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
+}
+
+ccl::event ccl_comm::allgatherv_impl(const void* send_buf,
+                                     size_t send_count,
+                                     const ccl::vector_class<void*>& recv_bufs,
+                                     const ccl::vector_class<size_t>& recv_counts,
+                                     ccl::datatype dtype,
+                                     const ccl::stream::impl_value_t& stream,
+                                     const ccl::allgatherv_attr& attr,
+                                     const ccl::vector_class<ccl::event>& deps) {
+    ccl_coll_attr internal_attr(attr);
+    internal_attr.is_vector_buf = 1;
+
+    ccl_request* req = ccl_allgatherv_impl(reinterpret_cast<const void*>(send_buf),
+                                           send_count,
+                                           (void*)(recv_bufs.data()),
+                                           recv_counts.data(),
+                                           dtype,
+                                           internal_attr,
+                                           this,
+                                           get_stream_ptr(stream),
+                                           deps);
+
+    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
+}
+
+/* allreduce */
+ccl::event ccl_comm::allreduce_impl(const void* send_buf,
+                                    void* recv_buf,
+                                    size_t count,
+                                    ccl::datatype dtype,
+                                    ccl::reduction reduction,
+                                    const ccl::stream::impl_value_t& stream,
+                                    const ccl::allreduce_attr& attr,
+                                    const ccl::vector_class<ccl::event>& deps) {
+    ccl_request* req = ccl_allreduce_impl(
+        send_buf, recv_buf, count, dtype, reduction, attr, this, get_stream_ptr(stream), deps);
+
+    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
+}
+
+/* alltoall */
+ccl::event ccl_comm::alltoall_impl(const void* send_buf,
+                                   void* recv_buf,
+                                   size_t count,
+                                   ccl::datatype dtype,
+                                   const ccl::stream::impl_value_t& stream,
+                                   const ccl::alltoall_attr& attr,
+                                   const ccl::vector_class<ccl::event>& deps) {
+    ccl_request* req = ccl_alltoall_impl(
+        send_buf, recv_buf, count, dtype, attr, this, get_stream_ptr(stream), deps);
+
+    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
+}
+
+ccl::event ccl_comm::alltoall_impl(const ccl::vector_class<void*>& send_buf,
+                                   const ccl::vector_class<void*>& recv_buf,
+                                   size_t count,
+                                   ccl::datatype dtype,
+                                   const ccl::stream::impl_value_t& stream,
+                                   const ccl::alltoall_attr& attr,
+                                   const ccl::vector_class<ccl::event>& deps) {
+    // TODO not implemented
+    CCL_THROW(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
+    return {};
+}
+
+/* alltoallv */
+ccl::event ccl_comm::alltoallv_impl(const void* send_buf,
+                                    const ccl::vector_class<size_t>& send_counts,
+                                    void* recv_buf,
+                                    const ccl::vector_class<size_t>& recv_counts,
+                                    ccl::datatype dtype,
+                                    const ccl::stream::impl_value_t& stream,
+                                    const ccl::alltoallv_attr& attr,
+                                    const ccl::vector_class<ccl::event>& deps) {
+    ccl_request* req = ccl_alltoallv_impl(send_buf,
+                                          send_counts.data(),
+                                          recv_buf,
+                                          recv_counts.data(),
+                                          dtype,
+                                          attr,
+                                          this,
+                                          get_stream_ptr(stream),
+                                          deps);
+
+    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
+}
+
+ccl::event ccl_comm::alltoallv_impl(const ccl::vector_class<void*>& send_buf,
+                                    const ccl::vector_class<size_t>& send_counts,
+                                    ccl::vector_class<void*> recv_buf,
+                                    const ccl::vector_class<size_t>& recv_counts,
+                                    ccl::datatype dtype,
+                                    const ccl::stream::impl_value_t& stream,
+                                    const ccl::alltoallv_attr& attr,
+                                    const ccl::vector_class<ccl::event>& dep) {
+    // TODO not implemented
+    CCL_THROW(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
+    return {};
+}
+
+/* bcast */
+ccl::event ccl_comm::broadcast_impl(void* buf,
+                                    size_t count,
+                                    ccl::datatype dtype,
+                                    int root,
+                                    const ccl::stream::impl_value_t& stream,
+                                    const ccl::broadcast_attr& attr,
+                                    const ccl::vector_class<ccl::event>& deps) {
+    ccl_request* req =
+        ccl_broadcast_impl(buf, count, dtype, root, attr, this, get_stream_ptr(stream), deps);
+
+    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
+}
+
+/* reduce */
+ccl::event ccl_comm::reduce_impl(const void* send_buf,
+                                 void* recv_buf,
+                                 size_t count,
+                                 ccl::datatype dtype,
+                                 ccl::reduction reduction,
+                                 int root,
+                                 const ccl::stream::impl_value_t& stream,
+                                 const ccl::reduce_attr& attr,
+                                 const ccl::vector_class<ccl::event>& deps) {
+    ccl_request* req = ccl_reduce_impl(send_buf,
+                                       recv_buf,
+                                       count,
+                                       dtype,
+                                       reduction,
+                                       root,
+                                       attr,
+                                       this,
+                                       get_stream_ptr(stream),
+                                       deps);
+
+    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
+}
+
+/* reduce_scatter */
+ccl::event ccl_comm::reduce_scatter_impl(const void* send_buf,
+                                         void* recv_buf,
+                                         size_t recv_count,
+                                         ccl::datatype dtype,
+                                         ccl::reduction reduction,
+                                         const ccl::stream::impl_value_t& stream,
+                                         const ccl::reduce_scatter_attr& attr,
+                                         const ccl::vector_class<ccl::event>& deps) {
+    ccl_request* req = ccl_reduce_scatter_impl(
+        send_buf, recv_buf, recv_count, dtype, reduction, attr, this, get_stream_ptr(stream), deps);
+
+    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
+}
+
+/* sparse_allreduce */
+ccl::event ccl_comm::sparse_allreduce_impl(const void* send_ind_buf,
+                                           size_t send_ind_count,
+                                           const void* send_val_buf,
+                                           size_t send_val_count,
+                                           void* recv_ind_buf,
+                                           size_t recv_ind_count,
+                                           void* recv_val_buf,
+                                           size_t recv_val_count,
+                                           ccl::datatype index_dtype,
+                                           ccl::datatype value_dtype,
+                                           ccl::reduction reduction,
+                                           const ccl::stream::impl_value_t& stream,
+                                           const ccl::sparse_allreduce_attr& attr,
+                                           const ccl::vector_class<ccl::event>& deps) {
+    ccl_request* req = ccl_sparse_allreduce_impl(send_ind_buf,
+                                                 send_ind_count,
+                                                 send_val_buf,
+                                                 send_val_count,
+                                                 recv_ind_buf,
+                                                 recv_ind_count,
+                                                 recv_val_buf,
+                                                 recv_val_count,
+                                                 index_dtype,
+                                                 value_dtype,
+                                                 reduction,
+                                                 attr,
+                                                 this,
+                                                 get_stream_ptr(stream),
+                                                 deps);
+
+    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
+}
+
+std::shared_ptr<atl_base_comm> ccl_comm::get_atl_comm() const {
+    return comm_impl->atl;
+}
+
+std::shared_ptr<ccl_comm> ccl_comm::get_r2r_comm() {
+    return r2r_comm;
+}
+
+std::shared_ptr<ccl_comm> ccl_comm::get_node_comm() {
+    return node_comm;
+}
+
+std::shared_ptr<ccl_comm> ccl_comm::get_pair_comm() {
+    return pair_comm;
+}
+
+std::shared_ptr<ccl_comm> ccl_comm::get_even_comm() {
+    return even_comm;
+}
+
+std::string ccl_comm::to_string() const {
+    std::stringstream ss;
+    ss << "{ rank: " << rank() << ", size: " << size() << ", id: " << id() << " }";
+    return ss.str();
+}
+
+std::string ccl_comm::to_string_ext() const {
+    std::stringstream ss;
+    ss << "{\n";
+    ss << "   " << to_string() << "\n";
+    ss << "   r2r_comm: " << (r2r_comm ? r2r_comm->to_string() : "{}") << "\n";
+    ss << "   node_comm: " << (node_comm ? node_comm->to_string() : "{}") << "\n";
+    ss << "   even_comm: " << (even_comm ? even_comm->to_string() : "{}") << "\n";
+    ss << "   pair_comm: " << (pair_comm ? pair_comm->to_string() : "{}") << "\n";
+    ss << "}";
+
+    return ss.str();
+}
+
+// NOTE: allocate_resources must be called at the ccl_comm level; if it were called at the
+// ccl_comm_internal level, the ccl_comm object that we need would not be fully constructed yet
+void ccl_comm::allocate_resources() {
+    if (ccl::global_data::env().enable_unordered_coll) {
+        comm_impl->unordered_coll_manager.reset(new ccl_unordered_coll_manager(*this));
+    }
+
+    auto& env_object = ccl::global_data::env();
+
+    comm_impl->allreduce_2d_builder.reset(new ccl_allreduce_2d_builder(
+        (env_object.allreduce_2d_base_size != CCL_ENV_SIZET_NOT_SPECIFIED)
+            ? env_object.allreduce_2d_base_size
+            : ccl::global_data::get().executor->get_local_proc_count(),
+        env_object.allreduce_2d_switch_dims,
+        this));
+
+    env_object.print(rank());
 }
+
+std::shared_ptr<ccl_comm> ccl_comm::clone_with_new_id(ccl_comm_id_storage::comm_id&& id) {
+    return std::shared_ptr<ccl_comm>(new ccl_comm(*this, std::move(id)));
+}
+
+COMM_INTERFACE_COLL_INSTANTIATION(ccl_comm);
+#ifdef CCL_ENABLE_SYCL
+SYCL_COMM_INTERFACE_COLL_INSTANTIATION(ccl_comm);
+#endif // CCL_ENABLE_SYCL
diff --git a/src/common/comm/comm.hpp b/src/common/comm/comm.hpp
index 77505c705..1356ec53e 100644
--- a/src/common/comm/comm.hpp
+++ b/src/common/comm/comm.hpp
@@ -17,53 +17,218 @@
 
 #include <atomic>
 #include <unordered_map>
-
-#include "atl/atl_wrapper.h"
+#include "atl/atl_base_comm.hpp"
 #include "coll/algorithms/allreduce/allreduce_2d.hpp"
+#include "common/comm/communicator_traits.hpp"
+#include "common/comm/comm_interface.hpp"
 #include "common/comm/comm_id_storage.hpp"
 #include "common/comm/atl_tag.hpp"
 #include "common/log/log.hpp"
+#include "common/stream/stream.hpp"
 #include "common/utils/tree.hpp"
 #include "common/utils/utils.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/comm_split_attr_ids.hpp"
+#include "oneapi/ccl/comm_split_attr_ids_traits.hpp"
+#include "oneapi/ccl/comm_split_attr.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/event.hpp"
+#include "oneapi/ccl/coll_attr_ids.hpp"
+#include "oneapi/ccl/coll_attr_ids_traits.hpp"
+#include "oneapi/ccl/coll_attr.hpp"
+#include "types_generator_defines.hpp"
 #include "unordered_coll/unordered_coll.hpp"
 
 // index = local_rank, value = global_rank
 using ccl_rank2rank_map = std::vector<int>;
 
+class ikvs_wrapper;
+
+inline ccl_stream* get_stream_ptr(const ccl::stream::impl_value_t& stream) {
+    if (stream.get() && stream->is_sycl_device_stream())
+        return stream.get();
+    else
+        return nullptr;
+}
+
+using ccl_rank2rank_map = std::vector<int>;
+
+class ccl_comm;
 namespace ccl {
-class host_communicator;
 namespace v1 {
 class kvs_interface;
 }
 } // namespace ccl
 
-class alignas(CACHELINE_SIZE) ccl_comm {
+// The main purpose of the internal part is to hold shareable parts of ccl_comm which don't need to
+// be copied/reset on ccl_comm's copy.
+class alignas(CACHELINE_SIZE) ccl_comm_internal {
 public:
-    static constexpr int invalid_rank = -1;
+    static void ccl_comm_reset_thread_barrier();
+    ccl_comm_internal() = delete;
+    ccl_comm_internal(const ccl_comm_internal& other) = delete;
+    ccl_comm_internal& operator=(const ccl_comm_internal& other) = delete;
+
+    ccl_comm_internal(int rank, int size, std::shared_ptr<atl_base_comm> atl);
 
-    ccl::host_communicator* get_host_comm() {
-        return host_comm;
+    ccl_comm_internal(int rank,
+                      int size,
+                      ccl_rank2rank_map&& ranks,
+                      std::shared_ptr<atl_base_comm> atl);
+
+    //TODO non-implemented
+    //1) cluster_devices_count (devices 1000) -> (processes 10)
+    //2) blocking until all thread -> calls ccl_comm
+    //3) return 'thread_count'
+
+    // ccl_comm( {0,1,2,3...}, 1000, kvs )
+    // from 20 processes from ranks 0,1,2,3. Each rank contains 10 threads
+    // communicator: size in {20} and ranks in {0..19}
+    // communicator: return threads count in process {10}
+    // communicator: return devices counts per thread in process
+    ccl_comm_internal(const std::vector<int>& local_ranks,
+                      int comm_size,
+                      std::shared_ptr<ccl::kvs_interface> kvs_instance);
+
+    ~ccl_comm_internal() = default;
+
+    int rank() const noexcept {
+        return m_rank;
     }
 
-    static void ccl_comm_reset_thread_barrier();
-    ccl_comm() = delete;
-    ccl_comm(const ccl_comm& other) = delete;
-    ccl_comm& operator=(const ccl_comm& other) = delete;
+    int size() const noexcept {
+        return m_size;
+    }
+
+    int pof2() const noexcept {
+        return m_pof2;
+    }
+
+    const ccl_double_tree& dtree() const {
+        return m_dtree;
+    }
+
+    void reset(int rank, int size) {
+        m_rank = rank;
+        m_size = size;
+        m_pof2 = ccl_pof2(m_size);
+    }
+
+    const ccl_rank2rank_map& get_local2global_map() {
+        return m_local2global_map;
+    }
+
+    /**
+     * Maximum available number of active communicators
+     */
+    static constexpr ccl_sched_id_t max_comm_count = std::numeric_limits<ccl_comm_id_t>::max();
+    /**
+     * Maximum value of schedule id in scope of the current communicator
+     */
+    static constexpr ccl_sched_id_t max_sched_count = std::numeric_limits<ccl_sched_id_t>::max();
 
+    std::shared_ptr<atl_base_comm> atl;
+    std::unique_ptr<ccl_unordered_coll_manager> unordered_coll_manager;
+    std::unique_ptr<ccl_allreduce_2d_builder> allreduce_2d_builder;
+
+private:
+    int m_rank;
+    int m_size;
+    int m_pof2;
+
+    ccl_rank2rank_map m_local2global_map{};
+    ccl_double_tree m_dtree;
+};
+
+class alignas(CACHELINE_SIZE) ccl_comm : public ccl::communicator_interface {
+public:
+    using traits = ccl::host_communicator_traits;
+
+    // traits
+    bool is_host() const noexcept override {
+        return traits::is_host();
+    }
+
+    bool is_cpu() const noexcept override {
+        return traits::is_cpu();
+    }
+
+    bool is_gpu() const noexcept override {
+        return traits::is_gpu();
+    }
+
+    bool is_accelerator() const noexcept override {
+        return traits::is_accelerator();
+    }
+
+    bool is_ready() const override {
+        return true;
+    }
+
+    ccl::device_index_type get_device_path() const override;
+    ccl::communicator_interface::device_t get_device() const override;
+    ccl::communicator_interface::context_t get_context() const override;
+
+    const ccl::comm_split_attr& get_comm_split_attr() const override {
+        return comm_attr;
+    }
+
+    ccl::group_split_type get_topology_type() const override {
+        CCL_THROW(std::string(__FUNCTION__) + " is not applicable for " + traits::name());
+        return ccl::group_split_type::undetermined;
+    }
+
+    ccl::device_topology_type get_topology_class() const override {
+        CCL_THROW(std::string(__FUNCTION__) + " is not applicable for " + traits::name());
+        return ccl::device_topology_type::undetermined;
+    }
+
+    ccl::communicator_interface_ptr split(const ccl::comm_split_attr& attr) override;
+
+    // collectives operation declarations
+    ccl::event barrier(const ccl::stream::impl_value_t& op_stream,
+                       const ccl::barrier_attr& attr,
+                       const ccl::vector_class<ccl::event>& deps = {}) override;
+    ccl::event barrier_impl(const ccl::stream::impl_value_t& op_stream,
+                            const ccl::barrier_attr& attr,
+                            const ccl::vector_class<ccl::event>& deps = {});
+
+    COMM_INTERFACE_COLL_METHODS(DEFINITION);
+#ifdef CCL_ENABLE_SYCL
+    SYCL_COMM_INTERFACE_COLL_METHODS(DEFINITION);
+#endif // CCL_ENABLE_SYCL
+
+    COMM_IMPL_DECLARATION;
+    COMM_IMPL_CLASS_DECLARATION
+    COMM_IMPL_SPARSE_DECLARATION;
+    COMM_IMPL_SPARSE_CLASS_DECLARATION
+
+    ccl_comm();
+    ccl_comm(int size, ccl::shared_ptr_class<ikvs_wrapper> kvs);
+    ccl_comm(int size, int rank, ccl::shared_ptr_class<ikvs_wrapper> kvs);
+    ccl_comm(ccl::unified_device_type&& device,
+             ccl::unified_context_type&& context,
+             std::shared_ptr<atl_base_comm> atl);
+    ccl_comm(std::shared_ptr<atl_base_comm> atl);
+
+public:
     ccl_comm(int rank,
              int size,
              ccl_comm_id_storage::comm_id&& id,
-             std::shared_ptr<atl_wrapper> atl,
+             std::shared_ptr<atl_base_comm> atl,
              bool share_resources = false,
-             ccl::host_communicator* host_comm = nullptr);
+             bool is_sub_communicator = false);
 
     ccl_comm(int rank,
              int size,
              ccl_comm_id_storage::comm_id&& id,
              ccl_rank2rank_map&& ranks,
-             std::shared_ptr<atl_wrapper> atl,
+             std::shared_ptr<atl_base_comm> atl,
              bool share_resources = false,
-             ccl::host_communicator* host_comm = nullptr);
+             bool is_sub_communicator = false);
 
     //TODO non-implemented
     //1) cluster_devices_count (devices 1000) -> (processes 10)
@@ -80,51 +245,83 @@ class alignas(CACHELINE_SIZE) ccl_comm {
              std::shared_ptr<ccl::kvs_interface> kvs_instance,
              ccl_comm_id_storage::comm_id&& id,
              bool share_resources = false,
-             ccl::host_communicator* host_comm = nullptr);
+             bool is_sub_communicator = false);
+
+private:
+    // Copy-constructor-like overload: copy-constructs from src
+    // but uses the given id instead of src's comm id.
+    // We can't have a simple copy constructor here due to a comm_id type limitation
+    ccl_comm(const ccl_comm& src, ccl_comm_id_storage::comm_id&& id);
 
+public:
+    ccl_comm(ccl_comm& src) = delete;
+    ccl_comm(ccl_comm&& src) = default;
+    ccl_comm& operator=(ccl_comm& src) = delete;
+    ccl_comm& operator=(ccl_comm&& src) = default;
     ~ccl_comm() = default;
+    std::shared_ptr<atl_base_comm> get_atl_comm() const;
+    std::shared_ptr<ccl_comm> get_r2r_comm();
+    std::shared_ptr<ccl_comm> get_node_comm();
+    std::shared_ptr<ccl_comm> get_even_comm();
+    std::shared_ptr<ccl_comm> get_pair_comm();
 
-    /* version with user-provided colors, allows to skip allgatherv */
-    static ccl_comm* create_with_colors(const std::vector<int>& colors,
-                                        ccl_comm_id_storage* comm_ids,
-                                        const ccl_comm* parent_comm,
-                                        bool share_resources = false);
+    // troubleshooting
+    std::string to_string() const;
+    std::string to_string_ext() const;
 
-    std::shared_ptr<ccl_comm> clone_with_new_id(ccl_comm_id_storage::comm_id&& id);
+    static constexpr int invalid_rank = -1;
 
-    int rank() const noexcept {
-        return m_rank;
+    /**
+     * Returns the number of @c rank in the global communicator
+     * @param rank a rank which is part of the current communicator
+     * @return number of @c rank in the global communicator
+     */
+    int get_global_rank(int rank, bool only_global = false) const;
+    int get_rank_from_global(int global_rank) const;
+
+    int rank() const override {
+        return comm_rank;
     }
 
-    int size() const noexcept {
-        return m_size;
+    int size() const override {
+        return comm_size;
     }
 
     int pof2() const noexcept {
-        return m_pof2;
+        return comm_impl->pof2();
     }
 
     ccl_comm_id_t id() const noexcept {
-        return m_id.value();
+        return comm_id->value();
     }
 
-    size_t thread_count() const noexcept {
-        return thread_number;
+    const ccl_double_tree& dtree() const {
+        return comm_impl->dtree();
     }
 
-    size_t ranks_per_process() const noexcept {
-        return on_process_ranks_number;
+    std::unique_ptr<ccl_unordered_coll_manager>& get_unordered_coll_manager() {
+        return comm_impl->unordered_coll_manager;
     }
+    std::unique_ptr<ccl_allreduce_2d_builder>& get_allreduce_2d_builder() {
+        return comm_impl->allreduce_2d_builder;
+    }
+
+    ccl_comm* create_with_color(int color,
+                                ccl_comm_id_storage* comm_ids,
+                                bool share_resources) const;
+
+    std::shared_ptr<ccl_comm> clone_with_new_id(ccl_comm_id_storage::comm_id&& id);
 
     ccl_sched_id_t get_sched_id(bool use_internal_space) {
         ccl_sched_id_t& next_sched_id =
-            (use_internal_space) ? m_next_sched_id_internal : m_next_sched_id_external;
+            (use_internal_space) ? next_sched_id_internal : next_sched_id_external;
 
-        ccl_sched_id_t first_sched_id =
-            (use_internal_space) ? static_cast<ccl_sched_id_t>(0) : ccl_comm::max_sched_count / 2;
+        ccl_sched_id_t first_sched_id = (use_internal_space)
+                                            ? static_cast<ccl_sched_id_t>(0)
+                                            : ccl_comm_internal::max_sched_count / 2;
 
-        ccl_sched_id_t max_sched_id =
-            (use_internal_space) ? ccl_comm::max_sched_count / 2 : ccl_comm::max_sched_count;
+        ccl_sched_id_t max_sched_id = (use_internal_space) ? ccl_comm_internal::max_sched_count / 2
+                                                           : ccl_comm_internal::max_sched_count;
 
         ccl_sched_id_t id = next_sched_id;
 
@@ -135,58 +332,50 @@ class alignas(CACHELINE_SIZE) ccl_comm {
             next_sched_id = first_sched_id;
         }
 
-        LOG_DEBUG("sched_id ", id, ", comm_id ", m_id.value(), ", next sched_id ", next_sched_id);
+        LOG_DEBUG("sched_id ", id, ", comm_id ", this->id(), ", next sched_id ", next_sched_id);
 
         return id;
     }
 
-    void reset(int rank, int size) {
-        m_rank = rank;
-        m_size = size;
-        m_pof2 = ccl_pof2(m_size);
-
-        m_next_sched_id_internal = ccl_comm::max_sched_count / 2;
-        m_next_sched_id_external = 0;
-    }
-
-    /**
-     * Returns the number of @c rank in the global communicator
-     * @param rank a rank which is part of the current communicator
-     * @return number of @c rank in the global communicator
-     */
-    int get_global_rank(int rank) const;
-
-    const ccl_double_tree& dtree() const {
-        return m_dtree;
-    }
-
     /**
      * Maximum available number of active communicators
      */
-    static constexpr ccl_sched_id_t max_comm_count = std::numeric_limits<ccl_comm_id_t>::max();
+    static constexpr ccl_sched_id_t max_comm_count = ccl_comm_internal::max_comm_count;
     /**
      * Maximum value of schedule id in scope of the current communicator
      */
-    static constexpr ccl_sched_id_t max_sched_count = std::numeric_limits<ccl_sched_id_t>::max();
+    static constexpr ccl_sched_id_t max_sched_count = ccl_comm_internal::max_sched_count;
 
-    std::shared_ptr<atl_wrapper> atl;
-    std::unique_ptr<ccl_unordered_coll_manager> unordered_coll_manager;
-    std::unique_ptr<ccl_allreduce_2d_builder> allreduce_2d_builder;
-
-private:
     void allocate_resources();
 
-    int m_rank;
-    int m_size;
-    int m_pof2;
-
-    ccl_comm_id_storage::comm_id m_id;
-    ccl_sched_id_t m_next_sched_id_internal;
-    ccl_sched_id_t m_next_sched_id_external;
-    ccl_rank2rank_map m_local2global_map{};
-    ccl_double_tree m_dtree;
+private:
+    // This is the internal part of the communicator; it stores only the fields that should be
+    // shared across ccl_comm copies/clones. Everything else must go to ccl_comm.
+    std::shared_ptr<ccl_comm_internal> comm_impl;
+
+    ccl::unified_device_type device;
+    ccl::unified_context_type context;
+
+    // TODO: double check if these can be moved to comm_impl as shared fields
+    std::shared_ptr<ccl_comm> r2r_comm;
+    std::shared_ptr<ccl_comm> node_comm;
+    std::shared_ptr<ccl_comm> even_comm;
+    std::shared_ptr<ccl_comm> pair_comm;
+    ccl::comm_split_attr comm_attr;
+
+    // these fields duplicate the ones in ccl_comm_internal, but having them here
+    // allows reading them without going through the shared_ptr indirection.
+    int comm_rank;
+    int comm_size;
+
+    // comm_id is not default constructible but ccl_comm is, so use unique_ptr here
+    std::unique_ptr<ccl_comm_id_storage::comm_id> comm_id;
+    ccl_sched_id_t next_sched_id_internal;
+    ccl_sched_id_t next_sched_id_external;
+
+    ccl_comm* get_impl() {
+        return this;
+    }
 
-    size_t thread_number;
-    size_t on_process_ranks_number;
-    ccl::host_communicator* host_comm;
-};
+    void create_sub_comms(std::shared_ptr<atl_base_comm> atl);
+}; // class ccl_comm
diff --git a/src/common/comm/comm_id_storage.hpp b/src/common/comm/comm_id_storage.hpp
index 627bc7c9b..6254a161f 100644
--- a/src/common/comm/comm_id_storage.hpp
+++ b/src/common/comm/comm_id_storage.hpp
@@ -20,9 +20,6 @@
 #include "common/utils/spinlock.hpp"
 
 #include <functional>
-#include <iostream>
-#include <limits>
-#include <mutex>
 #include <vector>
 
 using ccl_comm_id_t = uint16_t;
diff --git a/src/common/comm/host_communicator/host_communicator_impl.hpp b/src/common/comm/comm_impl.hpp
similarity index 54%
rename from src/common/comm/host_communicator/host_communicator_impl.hpp
rename to src/common/comm/comm_impl.hpp
index 00d8bd879..c5798f532 100644
--- a/src/common/comm/host_communicator/host_communicator_impl.hpp
+++ b/src/common/comm/comm_impl.hpp
@@ -15,7 +15,7 @@
 */
 #pragma once
 
-#include "common/comm/host_communicator/host_communicator.hpp"
+#include "common/comm/comm.hpp"
 
 #include "oneapi/ccl/native_device_api/interop_utils.hpp"
 #include "common/request/request.hpp"
@@ -25,24 +25,22 @@
 #include "coll/coll.hpp"
 #include "coll/coll_common_attributes.hpp"
 
-namespace ccl {
-
 /* allgatherv */
 template <class buffer_type>
-ccl::event host_communicator::allgatherv_impl(const buffer_type* send_buf,
-                                              size_t send_count,
-                                              buffer_type* recv_buf,
-                                              const ccl::vector_class<size_t>& recv_counts,
-                                              const ccl::stream::impl_value_t& stream,
-                                              const ccl::allgatherv_attr& attr,
-                                              const ccl::vector_class<ccl::event>& deps) {
+ccl::event ccl_comm::allgatherv_impl(const buffer_type* send_buf,
+                                     size_t send_count,
+                                     buffer_type* recv_buf,
+                                     const ccl::vector_class<size_t>& recv_counts,
+                                     const ccl::stream::impl_value_t& stream,
+                                     const ccl::allgatherv_attr& attr,
+                                     const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_allgatherv_impl(reinterpret_cast<const void*>(send_buf),
                                            send_count,
                                            reinterpret_cast<void*>(recv_buf),
                                            recv_counts.data(),
                                            ccl::native_type_info<buffer_type>::dtype,
                                            attr,
-                                           comm_impl.get(),
+                                           this,
                                            get_stream_ptr(stream),
                                            deps);
 
@@ -50,13 +48,13 @@ ccl::event host_communicator::allgatherv_impl(const buffer_type* send_buf,
 }
 
 template <class buffer_type>
-ccl::event host_communicator::allgatherv_impl(const buffer_type* send_buf,
-                                              size_t send_count,
-                                              ccl::vector_class<buffer_type*>& recv_bufs,
-                                              const ccl::vector_class<size_t>& recv_counts,
-                                              const ccl::stream::impl_value_t& stream,
-                                              const ccl::allgatherv_attr& attr,
-                                              const ccl::vector_class<ccl::event>& deps) {
+ccl::event ccl_comm::allgatherv_impl(const buffer_type* send_buf,
+                                     size_t send_count,
+                                     ccl::vector_class<buffer_type*>& recv_bufs,
+                                     const ccl::vector_class<size_t>& recv_counts,
+                                     const ccl::stream::impl_value_t& stream,
+                                     const ccl::allgatherv_attr& attr,
+                                     const ccl::vector_class<ccl::event>& deps) {
     ccl_coll_attr internal_attr(attr);
     internal_attr.is_vector_buf = 1;
 
@@ -66,7 +64,7 @@ ccl::event host_communicator::allgatherv_impl(const buffer_type* send_buf,
                                            recv_counts.data(),
                                            ccl::native_type_info<buffer_type>::dtype,
                                            internal_attr,
-                                           comm_impl.get(),
+                                           this,
                                            get_stream_ptr(stream),
                                            deps);
 
@@ -74,13 +72,13 @@ ccl::event host_communicator::allgatherv_impl(const buffer_type* send_buf,
 }
 
 template <class buffer_type>
-ccl::event host_communicator::allgatherv_impl(const buffer_type& send_buf,
-                                              size_t send_count,
-                                              buffer_type& recv_buf,
-                                              const ccl::vector_class<size_t>& recv_counts,
-                                              const ccl::stream::impl_value_t& stream,
-                                              const ccl::allgatherv_attr& attr,
-                                              const ccl::vector_class<ccl::event>& deps) {
+ccl::event ccl_comm::allgatherv_impl(const buffer_type& send_buf,
+                                     size_t send_count,
+                                     buffer_type& recv_buf,
+                                     const ccl::vector_class<size_t>& recv_counts,
+                                     const ccl::stream::impl_value_t& stream,
+                                     const ccl::allgatherv_attr& attr,
+                                     const ccl::vector_class<ccl::event>& deps) {
     ccl_coll_attr internal_attr(attr);
 #ifdef CCL_ENABLE_SYCL
     internal_attr.is_sycl_buf = 1;
@@ -91,14 +89,14 @@ ccl::event host_communicator::allgatherv_impl(const buffer_type& send_buf,
                                            recv_counts.data(),
                                            ccl::native_type_info<buffer_type>::dtype,
                                            internal_attr,
-                                           comm_impl.get(),
+                                           this,
                                            get_stream_ptr(stream),
                                            deps);
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
 
 template <class buffer_type>
-ccl::event host_communicator::allgatherv_impl(
+ccl::event ccl_comm::allgatherv_impl(
     const buffer_type& send_buf,
     size_t send_count,
     ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_bufs,
@@ -117,7 +115,7 @@ ccl::event host_communicator::allgatherv_impl(
                                            recv_counts.data(),
                                            ccl::native_type_info<buffer_type>::dtype,
                                            internal_attr,
-                                           comm_impl.get(),
+                                           this,
                                            get_stream_ptr(stream),
                                            deps);
 
@@ -126,20 +124,20 @@ ccl::event host_communicator::allgatherv_impl(
 
 /* allreduce */
 template <class buffer_type>
-ccl::event host_communicator::allreduce_impl(const buffer_type* send_buf,
-                                             buffer_type* recv_buf,
-                                             size_t count,
-                                             ccl::reduction reduction,
-                                             const ccl::stream::impl_value_t& stream,
-                                             const ccl::allreduce_attr& attr,
-                                             const ccl::vector_class<ccl::event>& deps) {
+ccl::event ccl_comm::allreduce_impl(const buffer_type* send_buf,
+                                    buffer_type* recv_buf,
+                                    size_t count,
+                                    ccl::reduction reduction,
+                                    const ccl::stream::impl_value_t& stream,
+                                    const ccl::allreduce_attr& attr,
+                                    const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_allreduce_impl(reinterpret_cast<const void*>(send_buf),
                                           reinterpret_cast<void*>(recv_buf),
                                           count,
                                           ccl::native_type_info<buffer_type>::dtype,
                                           reduction,
                                           attr,
-                                          comm_impl.get(),
+                                          this,
                                           get_stream_ptr(stream),
                                           deps);
 
@@ -147,13 +145,13 @@ ccl::event host_communicator::allreduce_impl(const buffer_type* send_buf,
 }
 
 template <class buffer_type>
-ccl::event host_communicator::allreduce_impl(const buffer_type& send_buf,
-                                             buffer_type& recv_buf,
-                                             size_t count,
-                                             ccl::reduction reduction,
-                                             const ccl::stream::impl_value_t& stream,
-                                             const ccl::allreduce_attr& attr,
-                                             const ccl::vector_class<ccl::event>& deps) {
+ccl::event ccl_comm::allreduce_impl(const buffer_type& send_buf,
+                                    buffer_type& recv_buf,
+                                    size_t count,
+                                    ccl::reduction reduction,
+                                    const ccl::stream::impl_value_t& stream,
+                                    const ccl::allreduce_attr& attr,
+                                    const ccl::vector_class<ccl::event>& deps) {
     ccl_coll_attr internal_attr(attr);
 #ifdef CCL_ENABLE_SYCL
     internal_attr.is_sycl_buf = 1;
@@ -164,7 +162,7 @@ ccl::event host_communicator::allreduce_impl(const buffer_type& send_buf,
                                           ccl::native_type_info<buffer_type>::dtype,
                                           reduction,
                                           internal_attr,
-                                          comm_impl.get(),
+                                          this,
                                           get_stream_ptr(stream),
                                           deps);
 
@@ -173,18 +171,18 @@ ccl::event host_communicator::allreduce_impl(const buffer_type& send_buf,
 
 /* alltoall */
 template <class buffer_type>
-ccl::event host_communicator::alltoall_impl(const buffer_type* send_buf,
-                                            buffer_type* recv_buf,
-                                            size_t count,
-                                            const ccl::stream::impl_value_t& stream,
-                                            const ccl::alltoall_attr& attr,
-                                            const ccl::vector_class<ccl::event>& deps) {
+ccl::event ccl_comm::alltoall_impl(const buffer_type* send_buf,
+                                   buffer_type* recv_buf,
+                                   size_t count,
+                                   const ccl::stream::impl_value_t& stream,
+                                   const ccl::alltoall_attr& attr,
+                                   const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_alltoall_impl(reinterpret_cast<const void*>(send_buf),
                                          reinterpret_cast<void*>(recv_buf),
                                          count,
                                          ccl::native_type_info<buffer_type>::dtype,
                                          attr,
-                                         comm_impl.get(),
+                                         this,
                                          get_stream_ptr(stream),
                                          deps);
 
@@ -192,23 +190,23 @@ ccl::event host_communicator::alltoall_impl(const buffer_type* send_buf,
 }
 
 template <class buffer_type>
-ccl::event host_communicator::alltoall_impl(const ccl::vector_class<buffer_type*>& send_buf,
-                                            const ccl::vector_class<buffer_type*>& recv_buf,
-                                            size_t count,
-                                            const ccl::stream::impl_value_t& stream,
-                                            const ccl::alltoall_attr& attr,
-                                            const ccl::vector_class<ccl::event>& deps) {
+ccl::event ccl_comm::alltoall_impl(const ccl::vector_class<buffer_type*>& send_buf,
+                                   const ccl::vector_class<buffer_type*>& recv_buf,
+                                   size_t count,
+                                   const ccl::stream::impl_value_t& stream,
+                                   const ccl::alltoall_attr& attr,
+                                   const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-ccl::event host_communicator::alltoall_impl(const buffer_type& send_buf,
-                                            buffer_type& recv_buf,
-                                            size_t count,
-                                            const ccl::stream::impl_value_t& stream,
-                                            const ccl::alltoall_attr& attr,
-                                            const ccl::vector_class<ccl::event>& deps) {
+ccl::event ccl_comm::alltoall_impl(const buffer_type& send_buf,
+                                   buffer_type& recv_buf,
+                                   size_t count,
+                                   const ccl::stream::impl_value_t& stream,
+                                   const ccl::alltoall_attr& attr,
+                                   const ccl::vector_class<ccl::event>& deps) {
     ccl_coll_attr internal_attr(attr);
 #ifdef CCL_ENABLE_SYCL
     internal_attr.is_sycl_buf = 1;
@@ -218,7 +216,7 @@ ccl::event host_communicator::alltoall_impl(const buffer_type& send_buf,
                                          count,
                                          ccl::native_type_info<buffer_type>::dtype,
                                          internal_attr,
-                                         comm_impl.get(),
+                                         this,
                                          get_stream_ptr(stream),
                                          deps);
 
@@ -226,7 +224,7 @@ ccl::event host_communicator::alltoall_impl(const buffer_type& send_buf,
 }
 
 template <class buffer_type>
-ccl::event host_communicator::alltoall_impl(
+ccl::event ccl_comm::alltoall_impl(
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& send_buf,
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
     size_t count,
@@ -239,20 +237,20 @@ ccl::event host_communicator::alltoall_impl(
 
 /* alltoallv */
 template <class buffer_type>
-ccl::event host_communicator::alltoallv_impl(const buffer_type* send_buf,
-                                             const ccl::vector_class<size_t>& send_counts,
-                                             buffer_type* recv_buf,
-                                             const ccl::vector_class<size_t>& recv_counts,
-                                             const ccl::stream::impl_value_t& stream,
-                                             const ccl::alltoallv_attr& attr,
-                                             const ccl::vector_class<ccl::event>& deps) {
+ccl::event ccl_comm::alltoallv_impl(const buffer_type* send_buf,
+                                    const ccl::vector_class<size_t>& send_counts,
+                                    buffer_type* recv_buf,
+                                    const ccl::vector_class<size_t>& recv_counts,
+                                    const ccl::stream::impl_value_t& stream,
+                                    const ccl::alltoallv_attr& attr,
+                                    const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_alltoallv_impl(reinterpret_cast<const void*>(send_buf),
                                           send_counts.data(),
                                           reinterpret_cast<void*>(recv_buf),
                                           recv_counts.data(),
                                           ccl::native_type_info<buffer_type>::dtype,
                                           attr,
-                                          comm_impl.get(),
+                                          this,
                                           get_stream_ptr(stream),
                                           deps);
 
@@ -260,25 +258,25 @@ ccl::event host_communicator::alltoallv_impl(const buffer_type* send_buf,
 }
 
 template <class buffer_type>
-ccl::event host_communicator::alltoallv_impl(const ccl::vector_class<buffer_type*>& send_buf,
-                                             const ccl::vector_class<size_t>& send_counts,
-                                             const ccl::vector_class<buffer_type*>& recv_buf,
-                                             const ccl::vector_class<size_t>& recv_counts,
-                                             const ccl::stream::impl_value_t& stream,
-                                             const ccl::alltoallv_attr& attr,
-                                             const ccl::vector_class<ccl::event>& dep) {
+ccl::event ccl_comm::alltoallv_impl(const ccl::vector_class<buffer_type*>& send_buf,
+                                    const ccl::vector_class<size_t>& send_counts,
+                                    const ccl::vector_class<buffer_type*>& recv_buf,
+                                    const ccl::vector_class<size_t>& recv_counts,
+                                    const ccl::stream::impl_value_t& stream,
+                                    const ccl::alltoallv_attr& attr,
+                                    const ccl::vector_class<ccl::event>& dep) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-ccl::event host_communicator::alltoallv_impl(const buffer_type& send_buf,
-                                             const ccl::vector_class<size_t>& send_counts,
-                                             buffer_type& recv_buf,
-                                             const ccl::vector_class<size_t>& recv_counts,
-                                             const ccl::stream::impl_value_t& stream,
-                                             const ccl::alltoallv_attr& attr,
-                                             const ccl::vector_class<ccl::event>& deps) {
+ccl::event ccl_comm::alltoallv_impl(const buffer_type& send_buf,
+                                    const ccl::vector_class<size_t>& send_counts,
+                                    buffer_type& recv_buf,
+                                    const ccl::vector_class<size_t>& recv_counts,
+                                    const ccl::stream::impl_value_t& stream,
+                                    const ccl::alltoallv_attr& attr,
+                                    const ccl::vector_class<ccl::event>& deps) {
     ccl_coll_attr internal_attr(attr);
 #ifdef CCL_ENABLE_SYCL
     internal_attr.is_sycl_buf = 1;
@@ -289,7 +287,7 @@ ccl::event host_communicator::alltoallv_impl(const buffer_type& send_buf,
                                           recv_counts.data(),
                                           ccl::native_type_info<buffer_type>::dtype,
                                           internal_attr,
-                                          comm_impl.get(),
+                                          this,
                                           get_stream_ptr(stream),
                                           deps);
 
@@ -297,7 +295,7 @@ ccl::event host_communicator::alltoallv_impl(const buffer_type& send_buf,
 }
 
 template <class buffer_type>
-ccl::event host_communicator::alltoallv_impl(
+ccl::event ccl_comm::alltoallv_impl(
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& send_buf,
     const ccl::vector_class<size_t>& send_counts,
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
@@ -311,18 +309,18 @@ ccl::event host_communicator::alltoallv_impl(
 
 /* bcast */
 template <class buffer_type>
-ccl::event host_communicator::broadcast_impl(buffer_type* buf,
-                                             size_t count,
-                                             int root,
-                                             const ccl::stream::impl_value_t& stream,
-                                             const ccl::broadcast_attr& attr,
-                                             const ccl::vector_class<ccl::event>& deps) {
+ccl::event ccl_comm::broadcast_impl(buffer_type* buf,
+                                    size_t count,
+                                    int root,
+                                    const ccl::stream::impl_value_t& stream,
+                                    const ccl::broadcast_attr& attr,
+                                    const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_broadcast_impl(reinterpret_cast<void*>(buf),
                                           count,
                                           ccl::native_type_info<buffer_type>::dtype,
                                           root,
                                           attr,
-                                          comm_impl.get(),
+                                          this,
                                           get_stream_ptr(stream),
                                           deps);
 
@@ -330,12 +328,12 @@ ccl::event host_communicator::broadcast_impl(buffer_type* buf,
 }
 
 template <class buffer_type>
-ccl::event host_communicator::broadcast_impl(buffer_type& buf,
-                                             size_t count,
-                                             int root,
-                                             const ccl::stream::impl_value_t& stream,
-                                             const ccl::broadcast_attr& attr,
-                                             const ccl::vector_class<ccl::event>& deps) {
+ccl::event ccl_comm::broadcast_impl(buffer_type& buf,
+                                    size_t count,
+                                    int root,
+                                    const ccl::stream::impl_value_t& stream,
+                                    const ccl::broadcast_attr& attr,
+                                    const ccl::vector_class<ccl::event>& deps) {
     ccl_coll_attr internal_attr(attr);
 #ifdef CCL_ENABLE_SYCL
     internal_attr.is_sycl_buf = 1;
@@ -345,7 +343,7 @@ ccl::event host_communicator::broadcast_impl(buffer_type& buf,
                                           ccl::native_type_info<buffer_type>::dtype,
                                           root,
                                           internal_attr,
-                                          comm_impl.get(),
+                                          this,
                                           get_stream_ptr(stream),
                                           deps);
 
@@ -354,14 +352,14 @@ ccl::event host_communicator::broadcast_impl(buffer_type& buf,
 
 /* reduce */
 template <class buffer_type>
-ccl::event host_communicator::reduce_impl(const buffer_type* send_buf,
-                                          buffer_type* recv_buf,
-                                          size_t count,
-                                          ccl::reduction reduction,
-                                          int root,
-                                          const ccl::stream::impl_value_t& stream,
-                                          const ccl::reduce_attr& attr,
-                                          const ccl::vector_class<ccl::event>& deps) {
+ccl::event ccl_comm::reduce_impl(const buffer_type* send_buf,
+                                 buffer_type* recv_buf,
+                                 size_t count,
+                                 ccl::reduction reduction,
+                                 int root,
+                                 const ccl::stream::impl_value_t& stream,
+                                 const ccl::reduce_attr& attr,
+                                 const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_reduce_impl(reinterpret_cast<const void*>(send_buf),
                                        reinterpret_cast<void*>(recv_buf),
                                        count,
@@ -369,7 +367,7 @@ ccl::event host_communicator::reduce_impl(const buffer_type* send_buf,
                                        reduction,
                                        root,
                                        attr,
-                                       comm_impl.get(),
+                                       this,
                                        get_stream_ptr(stream),
                                        deps);
 
@@ -377,14 +375,14 @@ ccl::event host_communicator::reduce_impl(const buffer_type* send_buf,
 }
 
 template <class buffer_type>
-ccl::event host_communicator::reduce_impl(const buffer_type& send_buf,
-                                          buffer_type& recv_buf,
-                                          size_t count,
-                                          ccl::reduction reduction,
-                                          int root,
-                                          const ccl::stream::impl_value_t& stream,
-                                          const ccl::reduce_attr& attr,
-                                          const ccl::vector_class<ccl::event>& deps) {
+ccl::event ccl_comm::reduce_impl(const buffer_type& send_buf,
+                                 buffer_type& recv_buf,
+                                 size_t count,
+                                 ccl::reduction reduction,
+                                 int root,
+                                 const ccl::stream::impl_value_t& stream,
+                                 const ccl::reduce_attr& attr,
+                                 const ccl::vector_class<ccl::event>& deps) {
     ccl_coll_attr internal_attr(attr);
 #ifdef CCL_ENABLE_SYCL
     internal_attr.is_sycl_buf = 1;
@@ -396,7 +394,7 @@ ccl::event host_communicator::reduce_impl(const buffer_type& send_buf,
                                        reduction,
                                        root,
                                        internal_attr,
-                                       comm_impl.get(),
+                                       this,
                                        get_stream_ptr(stream),
                                        deps);
 
@@ -405,20 +403,20 @@ ccl::event host_communicator::reduce_impl(const buffer_type& send_buf,
 
 /* reduce_scatter */
 template <class buffer_type>
-ccl::event host_communicator::reduce_scatter_impl(const buffer_type* send_buf,
-                                                  buffer_type* recv_buf,
-                                                  size_t recv_count,
-                                                  ccl::reduction reduction,
-                                                  const ccl::stream::impl_value_t& stream,
-                                                  const ccl::reduce_scatter_attr& attr,
-                                                  const ccl::vector_class<ccl::event>& deps) {
+ccl::event ccl_comm::reduce_scatter_impl(const buffer_type* send_buf,
+                                         buffer_type* recv_buf,
+                                         size_t recv_count,
+                                         ccl::reduction reduction,
+                                         const ccl::stream::impl_value_t& stream,
+                                         const ccl::reduce_scatter_attr& attr,
+                                         const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_reduce_scatter_impl(reinterpret_cast<const void*>(send_buf),
                                                reinterpret_cast<void*>(recv_buf),
                                                recv_count,
                                                ccl::native_type_info<buffer_type>::dtype,
                                                reduction,
                                                attr,
-                                               comm_impl.get(),
+                                               this,
                                                get_stream_ptr(stream),
                                                deps);
 
@@ -426,13 +424,13 @@ ccl::event host_communicator::reduce_scatter_impl(const buffer_type* send_buf,
 }
 
 template <class buffer_type>
-ccl::event host_communicator::reduce_scatter_impl(const buffer_type& send_buf,
-                                                  buffer_type& recv_buf,
-                                                  size_t recv_count,
-                                                  ccl::reduction reduction,
-                                                  const ccl::stream::impl_value_t& stream,
-                                                  const ccl::reduce_scatter_attr& attr,
-                                                  const ccl::vector_class<ccl::event>& deps) {
+ccl::event ccl_comm::reduce_scatter_impl(const buffer_type& send_buf,
+                                         buffer_type& recv_buf,
+                                         size_t recv_count,
+                                         ccl::reduction reduction,
+                                         const ccl::stream::impl_value_t& stream,
+                                         const ccl::reduce_scatter_attr& attr,
+                                         const ccl::vector_class<ccl::event>& deps) {
     ccl_coll_attr internal_attr(attr);
 #ifdef CCL_ENABLE_SYCL
     internal_attr.is_sycl_buf = 1;
@@ -443,7 +441,7 @@ ccl::event host_communicator::reduce_scatter_impl(const buffer_type& send_buf,
                                                ccl::native_type_info<buffer_type>::dtype,
                                                reduction,
                                                internal_attr,
-                                               comm_impl.get(),
+                                               this,
                                                get_stream_ptr(stream),
                                                deps);
 
@@ -452,18 +450,18 @@ ccl::event host_communicator::reduce_scatter_impl(const buffer_type& send_buf,
 
 /* sparse_allreduce */
 template <class index_buffer_type, class value_buffer_type>
-ccl::event host_communicator::sparse_allreduce_impl(const index_buffer_type* send_ind_buf,
-                                                    size_t send_ind_count,
-                                                    const value_buffer_type* send_val_buf,
-                                                    size_t send_val_count,
-                                                    index_buffer_type* recv_ind_buf,
-                                                    size_t recv_ind_count,
-                                                    value_buffer_type* recv_val_buf,
-                                                    size_t recv_val_count,
-                                                    ccl::reduction reduction,
-                                                    const ccl::stream::impl_value_t& stream,
-                                                    const ccl::sparse_allreduce_attr& attr,
-                                                    const ccl::vector_class<ccl::event>& deps) {
+ccl::event ccl_comm::sparse_allreduce_impl(const index_buffer_type* send_ind_buf,
+                                           size_t send_ind_count,
+                                           const value_buffer_type* send_val_buf,
+                                           size_t send_val_count,
+                                           index_buffer_type* recv_ind_buf,
+                                           size_t recv_ind_count,
+                                           value_buffer_type* recv_val_buf,
+                                           size_t recv_val_count,
+                                           ccl::reduction reduction,
+                                           const ccl::stream::impl_value_t& stream,
+                                           const ccl::sparse_allreduce_attr& attr,
+                                           const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_sparse_allreduce_impl((const void*)send_ind_buf,
                                                  send_ind_count,
                                                  (const void*)send_val_buf,
@@ -476,7 +474,7 @@ ccl::event host_communicator::sparse_allreduce_impl(const index_buffer_type* sen
                                                  ccl::native_type_info<value_buffer_type>::dtype,
                                                  reduction,
                                                  attr,
-                                                 comm_impl.get(),
+                                                 this,
                                                  get_stream_ptr(stream),
                                                  deps);
 
@@ -484,20 +482,18 @@ ccl::event host_communicator::sparse_allreduce_impl(const index_buffer_type* sen
 }
 
 template <class index_buffer_container_type, class value_buffer_container_type>
-ccl::event host_communicator::sparse_allreduce_impl(const index_buffer_container_type& send_ind_buf,
-                                                    size_t send_ind_count,
-                                                    const value_buffer_container_type& send_val_buf,
-                                                    size_t send_val_count,
-                                                    index_buffer_container_type& recv_ind_buf,
-                                                    size_t recv_ind_count,
-                                                    value_buffer_container_type& recv_val_buf,
-                                                    size_t recv_val_count,
-                                                    ccl::reduction reduction,
-                                                    const ccl::stream::impl_value_t& stream,
-                                                    const ccl::sparse_allreduce_attr& attr,
-                                                    const ccl::vector_class<ccl::event>& deps) {
+ccl::event ccl_comm::sparse_allreduce_impl(const index_buffer_container_type& send_ind_buf,
+                                           size_t send_ind_count,
+                                           const value_buffer_container_type& send_val_buf,
+                                           size_t send_val_count,
+                                           index_buffer_container_type& recv_ind_buf,
+                                           size_t recv_ind_count,
+                                           value_buffer_container_type& recv_val_buf,
+                                           size_t recv_val_count,
+                                           ccl::reduction reduction,
+                                           const ccl::stream::impl_value_t& stream,
+                                           const ccl::sparse_allreduce_attr& attr,
+                                           const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
-
-} // namespace ccl
diff --git a/src/common/comm/comm_interface.hpp b/src/common/comm/comm_interface.hpp
index a72256413..8b230a00d 100644
--- a/src/common/comm/comm_interface.hpp
+++ b/src/common/comm/comm_interface.hpp
@@ -29,7 +29,6 @@
 #include "oneapi/ccl/stream.hpp"
 
 #include "common/comm/compiler_comm_interface_dispatcher.hpp"
-#include "common/comm/l0/comm_context_id.hpp"
 #include "internal_types.hpp"
 
 namespace native {
@@ -48,8 +47,6 @@ class reduce_attr;
 class reduce_scatter_attr;
 class sparse_allreduce_attr;
 } // namespace v1
-
-struct gpu_comm_attr;
 } // namespace ccl
 
 #include "types_generator_defines.hpp"
@@ -149,8 +146,6 @@ struct communicator_interface : public communicator_interface_dispatcher {
 
     virtual bool is_ready() const = 0;
 
-    virtual const group_unique_key& get_comm_group_id() const = 0;
-
     virtual ccl::communicator_interface_ptr split(const ccl::comm_split_attr& attr) = 0;
 
     // collectives operation declarations
diff --git a/src/common/comm/communicator_traits.hpp b/src/common/comm/communicator_traits.hpp
index cf0ea5d3e..b91629954 100644
--- a/src/common/comm/communicator_traits.hpp
+++ b/src/common/comm/communicator_traits.hpp
@@ -47,15 +47,4 @@ struct host_communicator_traits : base_communicator_traits<host_sign, empty, emp
     }
 };
 
-struct cpu_communicator_traits : base_communicator_traits<empty, cpu_sign, empty, empty> {
-    static constexpr const char* name() {
-        return "cpu communicator";
-    }
-};
-
-struct gpu_communicator_traits : base_communicator_traits<empty, empty, gpu_sign, empty> {
-    static constexpr const char* name() {
-        return "gpu communicator";
-    }
-};
 } // namespace ccl
diff --git a/src/common/comm/compiler_comm_interface_dispatcher.cpp b/src/common/comm/compiler_comm_interface_dispatcher.cpp
index 3c1373925..52d486e25 100644
--- a/src/common/comm/compiler_comm_interface_dispatcher.cpp
+++ b/src/common/comm/compiler_comm_interface_dispatcher.cpp
@@ -30,29 +30,29 @@
 
 #include "common/global/global.hpp"
 
-#ifdef MULTI_GPU_SUPPORT
+#ifdef CCL_ENABLE_ZE
 #include "supported_topologies.hpp"
 #endif
 
-#include "common/comm/host_communicator/host_communicator_impl.hpp"
+#include "common/comm/comm_impl.hpp"
 
 namespace ccl {
 
 communicator_interface_ptr communicator_interface_dispatcher::create_communicator_impl() {
-    return communicator_interface_ptr(new host_communicator());
+    return communicator_interface_ptr(new ccl_comm());
 }
 
 communicator_interface_ptr communicator_interface_dispatcher::create_communicator_impl(
     const size_t size,
     shared_ptr_class<ikvs_wrapper> kvs) {
-    return communicator_interface_ptr(new host_communicator(size, kvs));
+    return communicator_interface_ptr(new ccl_comm(size, kvs));
 }
 
 communicator_interface_ptr communicator_interface_dispatcher::create_communicator_impl(
     const size_t size,
     const int rank,
     shared_ptr_class<ikvs_wrapper> kvs) {
-    return communicator_interface_ptr(new host_communicator(size, rank, kvs));
+    return communicator_interface_ptr(new ccl_comm(size, rank, kvs));
 }
 
 template <class DeviceType,
@@ -66,7 +66,7 @@ communicator_interface_ptr communicator_interface_dispatcher::create_communicato
     size_t thread_idx,
     size_t process_idx,
     const ccl::comm_split_attr& attr,
-    std::shared_ptr<atl_wrapper> atl,
+    std::shared_ptr<atl_base_comm> atl,
     ccl::group_split_type preferred_topology_group /* = ccl::group_split_type::undetermined */) {
     static_assert(std::is_same<typename unified_device_type::ccl_native_t, DeviceType>::value,
                   "Unsupported 'DeviceType'");
@@ -92,7 +92,7 @@ communicator_interface_ptr communicator_interface_dispatcher::create_communicato
     size_t thread_idx,
     size_t process_idx,
     const ccl::comm_split_attr& attr,
-    std::shared_ptr<atl_wrapper> atl,
+    std::shared_ptr<atl_base_comm> atl,
     ccl::group_split_type preferred_topology_group /* = ccl::group_split_type::undetermined */) {
 #ifdef CCL_ENABLE_SYCL
     return communicator_interface_dispatcher::create_communicator_from_unified_device(
@@ -122,7 +122,7 @@ communicator_interface_dispatcher::create_communicator_from_unified_device(
     size_t thread_idx,
     size_t process_idx,
     const ccl::comm_split_attr& attr,
-    std::shared_ptr<atl_wrapper> atl,
+    std::shared_ptr<atl_base_comm> atl,
     ccl::group_split_type preferred_topology_group /* = ccl::group_split_type::undetermined */) {
     if (preferred_topology_group == ccl::group_split_type::undetermined) {
         preferred_topology_group = ccl::group_split_type::cluster;
@@ -140,10 +140,10 @@ communicator_interface_dispatcher::create_communicator_from_unified_device(
     }
 
     switch (preferred_topology_group) {
-#if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
+#if defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL)
         case ccl::group_split_type::single: {
             return communicator_interface_ptr(
-                new host_communicator(std::move(device_id), std::move(context), atl));
+                new ccl_comm(std::move(device_id), std::move(context), atl));
         }
 #endif
         default:
@@ -164,7 +164,7 @@ communicator_interface_dispatcher::create_communicator_from_unified_device(
         size_t thread_idx, \
         size_t process_idx, \
         const ccl::comm_split_attr& attr, \
-        std::shared_ptr<atl_wrapper> atl, \
+        std::shared_ptr<atl_base_comm> atl, \
         ccl::group_split_type \
             preferred_topology_group /* = ccl::group_split_type::undetermined */);
 
@@ -177,7 +177,7 @@ communicator_interface_dispatcher::create_communicator_from_unified_device(
         size_t thread_idx, \
         size_t process_idx, \
         const ccl::comm_split_attr& attr, \
-        std::shared_ptr<atl_wrapper> atl, \
+        std::shared_ptr<atl_base_comm> atl, \
         ccl::group_split_type \
             preferred_topology_group /* = ccl::group_split_type::undetermined */);
 
diff --git a/src/common/comm/compiler_comm_interface_dispatcher.hpp b/src/common/comm/compiler_comm_interface_dispatcher.hpp
index ad643f0a9..cb8ca8bb2 100644
--- a/src/common/comm/compiler_comm_interface_dispatcher.hpp
+++ b/src/common/comm/compiler_comm_interface_dispatcher.hpp
@@ -20,7 +20,7 @@
 #include "oneapi/ccl/types.hpp"
 #include "supported_topologies.hpp"
 #include "communicator_traits.hpp"
-#include "atl/atl_wrapper.h"
+#include "atl/atl_base_comm.hpp"
 
 namespace native {
 struct ccl_device;
@@ -30,9 +30,6 @@ namespace v1 {
 class comm_split_attr;
 }
 
-#ifdef MULTI_GPU_SUPPORT
-struct gpu_comm_attr;
-#endif
 struct communicator_interface;
 
 using communicator_interface_ptr = std::shared_ptr<communicator_interface>;
@@ -43,10 +40,6 @@ struct communicator_interface_dispatcher {
 
     virtual ~communicator_interface_dispatcher() = default;
 
-#ifdef MULTI_GPU_SUPPORT
-    virtual void visit(ccl::gpu_comm_attr& comm_attr) = 0;
-#endif //MULTI_GPU_SUPPORT
-
     virtual ccl::device_index_type get_device_path() const = 0;
     virtual device_t get_device() const = 0;
     virtual context_t get_context() const = 0;
@@ -66,7 +59,7 @@ struct communicator_interface_dispatcher {
         size_t thread_idx,
         size_t process_idx,
         const comm_split_attr& attr,
-        std::shared_ptr<atl_wrapper> atl,
+        std::shared_ptr<atl_base_comm> atl,
         ccl::group_split_type preferred_topology_group = ccl::group_split_type::undetermined);
 
     // create communicator for device & cpu types (from device index)
@@ -81,7 +74,7 @@ struct communicator_interface_dispatcher {
         size_t thread_idx,
         size_t process_idx,
         const comm_split_attr& attr,
-        std::shared_ptr<atl_wrapper> atl,
+        std::shared_ptr<atl_base_comm> atl,
         ccl::group_split_type preferred_topology_group = ccl::group_split_type::undetermined);
 
     // create communicator for host
@@ -103,7 +96,7 @@ struct communicator_interface_dispatcher {
         size_t thread_idx,
         size_t process_idx,
         const comm_split_attr& attr,
-        std::shared_ptr<atl_wrapper> atl,
+        std::shared_ptr<atl_base_comm> atl,
         ccl::group_split_type preferred_topology_group = ccl::group_split_type::undetermined);
 };
 } // namespace ccl
diff --git a/src/common/comm/host_communicator/host_communicator.cpp b/src/common/comm/host_communicator/host_communicator.cpp
deleted file mode 100644
index b0883492d..000000000
--- a/src/common/comm/host_communicator/host_communicator.cpp
+++ /dev/null
@@ -1,499 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "common/global/global.hpp"
-#include "common/comm/host_communicator/host_communicator_impl.hpp"
-#include "oneapi/ccl/comm_split_attr_ids.hpp"
-#include "oneapi/ccl/comm_split_attr_ids_traits.hpp"
-#include "oneapi/ccl/comm_split_attr.hpp"
-
-#include "common/request/request.hpp"
-#include "common/event/impls/host_event.hpp"
-#include "coll/coll.hpp"
-#include "coll/coll_common_attributes.hpp"
-#include "coll/ccl_allgather_op_attr.hpp"
-
-#include "util/pm/pmi_resizable_rt/pmi_resizable/kvs/ikvs_wrapper.h"
-#include "atl/atl_wrapper.h"
-
-#include "common/comm/comm.hpp"
-
-#ifdef MULTI_GPU_SUPPORT
-#include "common/comm/l0/gpu_comm_attr.hpp"
-#endif
-
-namespace ccl {
-
-using ccl::preview::create_comm_split_attr;
-
-host_communicator::host_communicator()
-        : device(ccl::device_index_type(ccl::unused_index_value,
-                                        ccl::unused_index_value,
-                                        ccl::unused_index_value)),
-          comm_attr(create_comm_split_attr()) {}
-
-host_communicator::host_communicator(int size, shared_ptr_class<ikvs_wrapper> kvs)
-        : device(ccl::device_index_type(ccl::unused_index_value,
-                                        ccl::unused_index_value,
-                                        ccl::unused_index_value)),
-          comm_attr(create_comm_split_attr()),
-          comm_rank(0),
-          comm_size(size) {
-    if (size <= 0) {
-        throw ccl::exception("Incorrect size value when creating a host communicator");
-    }
-}
-
-host_communicator::host_communicator(int size, int rank, shared_ptr_class<ikvs_wrapper> kvs)
-        : device(ccl::device_index_type(ccl::unused_index_value,
-                                        ccl::unused_index_value,
-                                        ccl::unused_index_value)),
-          comm_attr(create_comm_split_attr()),
-          comm_rank(rank),
-          comm_size(size) {
-    if (rank > size || size <= 0) {
-        throw ccl::exception("Incorrect rank or size value when creating a host communicator");
-    }
-
-    LOG_DEBUG("ctor");
-
-    ccl::global_data& data = ccl::global_data::get();
-    std::shared_ptr<atl_wrapper> atl_tmp =
-        std::shared_ptr<atl_wrapper>(new atl_wrapper(size, { rank }, kvs));
-    comm_impl = std::shared_ptr<ccl_comm>(
-        new ccl_comm(rank, size, data.comm_ids->acquire(), atl_tmp, false, this));
-    create_sub_comms(atl_tmp);
-}
-
-host_communicator::host_communicator(ccl::unified_device_type&& d,
-                                     ccl::unified_context_type&& c,
-                                     std::shared_ptr<atl_wrapper> atl)
-        : host_communicator(atl) {}
-
-host_communicator::host_communicator(std::shared_ptr<atl_wrapper> atl)
-        : device(ccl::device_index_type(ccl::unused_index_value,
-                                        ccl::unused_index_value,
-                                        ccl::unused_index_value)),
-          comm_attr(create_comm_split_attr()),
-          comm_rank(atl->get_rank()),
-          comm_size(atl->get_size()) {
-    int rank = atl->get_rank();
-    int size = atl->get_size();
-
-    if (rank > size || size <= 0) {
-        throw ccl::exception("incorrect rank or size when creating \
-                             a host communicator: rank: " +
-                             std::to_string(rank) + ", size: " + std::to_string(size));
-    }
-
-    LOG_DEBUG("ctor");
-
-    ccl::global_data& data = ccl::global_data::get();
-    comm_impl = std::shared_ptr<ccl_comm>(
-        new ccl_comm(rank, size, data.comm_ids->acquire(), atl, false, this));
-    create_sub_comms(atl);
-}
-
-host_communicator::host_communicator(std::shared_ptr<ccl_comm> impl, bool is_sub_communicator)
-        : comm_impl(impl),
-          device(ccl::device_index_type(ccl::unused_index_value,
-                                        ccl::unused_index_value,
-                                        ccl::unused_index_value)),
-          comm_attr(create_comm_split_attr()),
-          comm_rank(impl->rank()),
-          comm_size(impl->size()) {
-    if (!is_sub_communicator) {
-        create_sub_comms(comm_impl.get()->atl);
-    }
-}
-
-int host_communicator::rank() const {
-    return comm_rank;
-}
-
-int host_communicator::size() const {
-    return comm_size;
-}
-
-#ifdef MULTI_GPU_SUPPORT
-void host_communicator::visit(ccl::gpu_comm_attr& comm_attr) {
-    (void)(comm_attr);
-}
-#endif
-
-ccl::device_index_type host_communicator::get_device_path() const {
-    return ccl::device_index_type{ ccl::unused_index_value,
-                                   ccl::unused_index_value,
-                                   ccl::unused_index_value };
-}
-
-ccl::communicator_interface::device_t host_communicator::get_device() const {
-    throw ccl::exception(std::string(__FUNCTION__) + " is not applicable for " + traits::name());
-    static ccl::communicator_interface::device_t empty;
-    return empty;
-}
-
-ccl::communicator_interface::context_t host_communicator::get_context() const {
-    throw ccl::exception(std::string(__FUNCTION__) + " is not applicable for " + traits::name());
-    static ccl::communicator_interface::context_t empty;
-    return empty;
-}
-
-void host_communicator::exchange_colors(std::vector<int>& colors) {
-    size_t send_count = 1;
-    vector_class<size_t> recv_counts(colors.size(), send_count);
-    auto attr =
-        create_operation_attr<allgatherv_attr>(attr_val<operation_attr_id::to_cache>(false));
-
-    this->allgatherv_impl(colors.data(), send_count, colors.data(), recv_counts, {}, attr, {})
-        .wait();
-}
-
-void host_communicator::create_sub_comms(std::shared_ptr<atl_wrapper> atl) {
-    bool is_sub_comm = true;
-    if (ccl::global_data::env().atl_transport == ccl_atl_mpi) {
-        r2r_comm =
-            std::shared_ptr<host_communicator>(new host_communicator(comm_impl, is_sub_comm));
-        node_comm =
-            std::shared_ptr<host_communicator>(new host_communicator(comm_impl, is_sub_comm));
-        pair_comm =
-            std::shared_ptr<host_communicator>(new host_communicator(comm_impl, is_sub_comm));
-        even_comm =
-            std::shared_ptr<host_communicator>(new host_communicator(comm_impl, is_sub_comm));
-    }
-    else {
-        ccl::global_data& data = ccl::global_data::get();
-        r2r_comm = std::shared_ptr<host_communicator>(
-            new host_communicator(std::shared_ptr<ccl_comm>(this->create_with_color(
-                                      atl->get_r2r_color(), data.comm_ids.get(), comm_impl.get())),
-                                  is_sub_comm));
-        node_comm = std::shared_ptr<host_communicator>(
-            new host_communicator(std::shared_ptr<ccl_comm>(this->create_with_color(
-                                      atl->get_host_color(), data.comm_ids.get(), comm_impl.get())),
-                                  is_sub_comm));
-        even_comm = std::shared_ptr<host_communicator>(new host_communicator(
-            std::shared_ptr<ccl_comm>(this->create_with_color(
-                atl->get_host_color() + atl->get_rank() % 2, data.comm_ids.get(), comm_impl.get())),
-            is_sub_comm));
-        pair_comm = std::shared_ptr<host_communicator>(new host_communicator(
-            std::shared_ptr<ccl_comm>(this->create_with_color(
-                atl->get_host_color() + atl->get_rank() / 2, data.comm_ids.get(), comm_impl.get())),
-            is_sub_comm));
-    }
-}
-
-ccl_comm* host_communicator::create_with_color(int color,
-                                               ccl_comm_id_storage* comm_ids,
-                                               const ccl_comm* parent_comm) {
-    if (ccl::global_data::env().atl_transport == ccl_atl_mpi) {
-        throw ccl::exception(
-            "MPI transport doesn't support creation of communicator with color yet");
-    }
-
-    std::vector<int> colors(this->size());
-    colors[this->rank()] = color;
-    this->exchange_colors(colors);
-
-    // TODO we can replace this func with own
-    return ccl_comm::create_with_colors(colors, comm_ids, parent_comm, true);
-}
-
-ccl::communicator_interface_ptr host_communicator::split(const comm_split_attr& attr) {
-    if (!attr.is_valid<comm_split_attr_id::color>()) {
-        throw ccl::exception(std::string(__FUNCTION__) +
-                             " - 'Color' split attribute for host communicator is not set");
-    }
-
-    ccl::global_data& data = ccl::global_data::get();
-    auto new_comm = this->create_with_color(
-        attr.get<ccl::comm_split_attr_id::color>(), data.comm_ids.get(), comm_impl.get());
-
-    comm_attr = attr;
-
-    return std::shared_ptr<host_communicator>(
-        new host_communicator(std::shared_ptr<ccl_comm>(new_comm)));
-}
-
-ccl::event host_communicator::barrier(const ccl::stream::impl_value_t& stream,
-                                      const ccl::barrier_attr& attr,
-                                      const ccl::vector_class<ccl::event>& deps) {
-    return get_impl()->barrier_impl(stream, attr, deps);
-}
-
-ccl::event host_communicator::barrier_impl(const ccl::stream::impl_value_t& stream,
-                                           const ccl::barrier_attr& attr,
-                                           const ccl::vector_class<ccl::event>& deps) {
-    ccl_barrier_impl(comm_impl.get(), stream.get(), deps);
-    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(nullptr));
-}
-
-/* allgatherv */
-ccl::event host_communicator::allgatherv_impl(const void* send_buf,
-                                              size_t send_count,
-                                              void* recv_buf,
-                                              const ccl::vector_class<size_t>& recv_counts,
-                                              ccl::datatype dtype,
-                                              const ccl::stream::impl_value_t& stream,
-                                              const ccl::allgatherv_attr& attr,
-                                              const ccl::vector_class<ccl::event>& deps) {
-    ccl_request* req = ccl_allgatherv_impl(send_buf,
-                                           send_count,
-                                           recv_buf,
-                                           recv_counts.data(),
-                                           dtype,
-                                           attr,
-                                           comm_impl.get(),
-                                           get_stream_ptr(stream),
-                                           deps);
-
-    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
-}
-
-ccl::event host_communicator::allgatherv_impl(const void* send_buf,
-                                              size_t send_count,
-                                              const ccl::vector_class<void*>& recv_bufs,
-                                              const ccl::vector_class<size_t>& recv_counts,
-                                              ccl::datatype dtype,
-                                              const ccl::stream::impl_value_t& stream,
-                                              const ccl::allgatherv_attr& attr,
-                                              const ccl::vector_class<ccl::event>& deps) {
-    ccl_coll_attr internal_attr(attr);
-    internal_attr.is_vector_buf = 1;
-
-    ccl_request* req = ccl_allgatherv_impl(reinterpret_cast<const void*>(send_buf),
-                                           send_count,
-                                           (void*)(recv_bufs.data()),
-                                           recv_counts.data(),
-                                           dtype,
-                                           internal_attr,
-                                           comm_impl.get(),
-                                           get_stream_ptr(stream),
-                                           deps);
-
-    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
-}
-
-/* allreduce */
-ccl::event host_communicator::allreduce_impl(const void* send_buf,
-                                             void* recv_buf,
-                                             size_t count,
-                                             ccl::datatype dtype,
-                                             ccl::reduction reduction,
-                                             const ccl::stream::impl_value_t& stream,
-                                             const ccl::allreduce_attr& attr,
-                                             const ccl::vector_class<ccl::event>& deps) {
-    ccl_request* req = ccl_allreduce_impl(send_buf,
-                                          recv_buf,
-                                          count,
-                                          dtype,
-                                          reduction,
-                                          attr,
-                                          comm_impl.get(),
-                                          get_stream_ptr(stream),
-                                          deps);
-
-    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
-}
-
-/* alltoall */
-ccl::event host_communicator::alltoall_impl(const void* send_buf,
-                                            void* recv_buf,
-                                            size_t count,
-                                            ccl::datatype dtype,
-                                            const ccl::stream::impl_value_t& stream,
-                                            const ccl::alltoall_attr& attr,
-                                            const ccl::vector_class<ccl::event>& deps) {
-    ccl_request* req = ccl_alltoall_impl(
-        send_buf, recv_buf, count, dtype, attr, comm_impl.get(), get_stream_ptr(stream), deps);
-
-    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
-}
-
-ccl::event host_communicator::alltoall_impl(const ccl::vector_class<void*>& send_buf,
-                                            const ccl::vector_class<void*>& recv_buf,
-                                            size_t count,
-                                            ccl::datatype dtype,
-                                            const ccl::stream::impl_value_t& stream,
-                                            const ccl::alltoall_attr& attr,
-                                            const ccl::vector_class<ccl::event>& deps) {
-    // TODO not implemented
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
-}
-
-/* alltoallv */
-ccl::event host_communicator::alltoallv_impl(const void* send_buf,
-                                             const ccl::vector_class<size_t>& send_counts,
-                                             void* recv_buf,
-                                             const ccl::vector_class<size_t>& recv_counts,
-                                             ccl::datatype dtype,
-                                             const ccl::stream::impl_value_t& stream,
-                                             const ccl::alltoallv_attr& attr,
-                                             const ccl::vector_class<ccl::event>& deps) {
-    ccl_request* req = ccl_alltoallv_impl(send_buf,
-                                          send_counts.data(),
-                                          recv_buf,
-                                          recv_counts.data(),
-                                          dtype,
-                                          attr,
-                                          comm_impl.get(),
-                                          get_stream_ptr(stream),
-                                          deps);
-
-    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
-}
-
-ccl::event host_communicator::alltoallv_impl(const ccl::vector_class<void*>& send_buf,
-                                             const ccl::vector_class<size_t>& send_counts,
-                                             ccl::vector_class<void*> recv_buf,
-                                             const ccl::vector_class<size_t>& recv_counts,
-                                             ccl::datatype dtype,
-                                             const ccl::stream::impl_value_t& stream,
-                                             const ccl::alltoallv_attr& attr,
-                                             const ccl::vector_class<ccl::event>& dep) {
-    // TODO not implemented
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-    return {};
-}
-
-/* bcast */
-ccl::event host_communicator::broadcast_impl(void* buf,
-                                             size_t count,
-                                             ccl::datatype dtype,
-                                             int root,
-                                             const ccl::stream::impl_value_t& stream,
-                                             const ccl::broadcast_attr& attr,
-                                             const ccl::vector_class<ccl::event>& deps) {
-    ccl_request* req = ccl_broadcast_impl(
-        buf, count, dtype, root, attr, comm_impl.get(), get_stream_ptr(stream), deps);
-
-    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
-}
-
-/* reduce */
-ccl::event host_communicator::reduce_impl(const void* send_buf,
-                                          void* recv_buf,
-                                          size_t count,
-                                          ccl::datatype dtype,
-                                          ccl::reduction reduction,
-                                          int root,
-                                          const ccl::stream::impl_value_t& stream,
-                                          const ccl::reduce_attr& attr,
-                                          const ccl::vector_class<ccl::event>& deps) {
-    ccl_request* req = ccl_reduce_impl(send_buf,
-                                       recv_buf,
-                                       count,
-                                       dtype,
-                                       reduction,
-                                       root,
-                                       attr,
-                                       comm_impl.get(),
-                                       get_stream_ptr(stream),
-                                       deps);
-
-    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
-}
-
-/* reduce_scatter */
-ccl::event host_communicator::reduce_scatter_impl(const void* send_buf,
-                                                  void* recv_buf,
-                                                  size_t recv_count,
-                                                  ccl::datatype dtype,
-                                                  ccl::reduction reduction,
-                                                  const ccl::stream::impl_value_t& stream,
-                                                  const ccl::reduce_scatter_attr& attr,
-                                                  const ccl::vector_class<ccl::event>& deps) {
-    ccl_request* req = ccl_reduce_scatter_impl(send_buf,
-                                               recv_buf,
-                                               recv_count,
-                                               dtype,
-                                               reduction,
-                                               attr,
-                                               comm_impl.get(),
-                                               get_stream_ptr(stream),
-                                               deps);
-
-    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
-}
-
-/* sparse_allreduce */
-ccl::event host_communicator::sparse_allreduce_impl(const void* send_ind_buf,
-                                                    size_t send_ind_count,
-                                                    const void* send_val_buf,
-                                                    size_t send_val_count,
-                                                    void* recv_ind_buf,
-                                                    size_t recv_ind_count,
-                                                    void* recv_val_buf,
-                                                    size_t recv_val_count,
-                                                    ccl::datatype index_dtype,
-                                                    ccl::datatype value_dtype,
-                                                    ccl::reduction reduction,
-                                                    const ccl::stream::impl_value_t& stream,
-                                                    const ccl::sparse_allreduce_attr& attr,
-                                                    const ccl::vector_class<ccl::event>& deps) {
-    ccl_request* req = ccl_sparse_allreduce_impl(send_ind_buf,
-                                                 send_ind_count,
-                                                 send_val_buf,
-                                                 send_val_count,
-                                                 recv_ind_buf,
-                                                 recv_ind_count,
-                                                 recv_val_buf,
-                                                 recv_val_count,
-                                                 index_dtype,
-                                                 value_dtype,
-                                                 reduction,
-                                                 attr,
-                                                 comm_impl.get(),
-                                                 get_stream_ptr(stream),
-                                                 deps);
-
-    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
-}
-
-std::shared_ptr<atl_wrapper> host_communicator::get_atl() {
-    return comm_impl->atl;
-}
-
-std::shared_ptr<host_communicator> host_communicator::get_r2r_comm() {
-    return r2r_comm;
-}
-
-std::shared_ptr<host_communicator> host_communicator::get_node_comm() {
-    return node_comm;
-}
-
-std::shared_ptr<host_communicator> host_communicator::get_pair_comm() {
-    return pair_comm;
-}
-
-std::shared_ptr<host_communicator> host_communicator::get_even_comm() {
-    return even_comm;
-}
-
-std::shared_ptr<ccl_comm> host_communicator::get_ccl_comm() {
-    return comm_impl;
-}
-
-std::string host_communicator::to_string() const {
-    return std::string("host communicator, rank (") + std::to_string(rank()) + "/" +
-           std::to_string(size());
-}
-
-COMM_INTERFACE_COLL_INSTANTIATION(host_communicator);
-#ifdef CCL_ENABLE_SYCL
-SYCL_COMM_INTERFACE_COLL_INSTANTIATION(host_communicator);
-#endif // CCL_ENABLE_SYCL
-
-} // namespace ccl
diff --git a/src/common/comm/host_communicator/host_communicator.hpp b/src/common/comm/host_communicator/host_communicator.hpp
deleted file mode 100644
index 53bb642ce..000000000
--- a/src/common/comm/host_communicator/host_communicator.hpp
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include "atl/atl_wrapper.h"
-#include "common/comm/comm.hpp"
-#include "common/stream/stream.hpp"
-#include "oneapi/ccl/types.hpp"
-#include "oneapi/ccl/types_policy.hpp"
-#include "oneapi/ccl/comm_split_attr_ids.hpp"
-#include "oneapi/ccl/comm_split_attr_ids_traits.hpp"
-#include "oneapi/ccl/comm_split_attr.hpp"
-#include "oneapi/ccl/types.hpp"
-#include "oneapi/ccl/type_traits.hpp"
-#include "oneapi/ccl/types_policy.hpp"
-#include "oneapi/ccl/event.hpp"
-#include "oneapi/ccl/coll_attr_ids.hpp"
-#include "oneapi/ccl/coll_attr_ids_traits.hpp"
-#include "oneapi/ccl/coll_attr.hpp"
-
-#include "common/comm/communicator_traits.hpp"
-#include "common/comm/comm_interface.hpp"
-#include "types_generator_defines.hpp"
-
-class ikvs_wrapper;
-namespace ccl {
-
-inline ccl_stream* get_stream_ptr(const ccl::stream::impl_value_t& stream) {
-    if (stream.get() && stream->is_sycl_device_stream())
-        return stream.get();
-    else
-        return nullptr;
-}
-
-class host_communicator : public ccl::communicator_interface {
-public:
-    using traits = ccl::host_communicator_traits;
-
-    int rank() const override;
-    int size() const override;
-
-    // traits
-    bool is_host() const noexcept override {
-        return traits::is_host();
-    }
-
-    bool is_cpu() const noexcept override {
-        return traits::is_cpu();
-    }
-
-    bool is_gpu() const noexcept override {
-        return traits::is_gpu();
-    }
-
-    bool is_accelerator() const noexcept override {
-        return traits::is_accelerator();
-    }
-
-    bool is_ready() const override {
-        return true;
-    }
-
-    const ccl::group_unique_key& get_comm_group_id() const override {
-        return owner_id;
-    }
-
-    void set_comm_group_id(ccl::group_unique_key id) {
-        owner_id = id;
-    }
-
-#ifdef MULTI_GPU_SUPPORT
-    void visit(ccl::gpu_comm_attr& comm_attr) override;
-#endif
-
-    ccl::device_index_type get_device_path() const override;
-    ccl::communicator_interface::device_t get_device() const override;
-    ccl::communicator_interface::context_t get_context() const override;
-
-    const ccl::comm_split_attr& get_comm_split_attr() const override {
-        return comm_attr;
-    }
-
-    ccl::group_split_type get_topology_type() const override {
-        throw ccl::exception(std::string(__FUNCTION__) + " is not applicable for " +
-                             traits::name());
-        return ccl::group_split_type::undetermined;
-    }
-
-    ccl::device_topology_type get_topology_class() const override {
-        throw ccl::exception(std::string(__FUNCTION__) + " is not applicable for " +
-                             traits::name());
-        return ccl::device_topology_type::undetermined;
-    }
-
-    ccl::communicator_interface_ptr split(const comm_split_attr& attr) override;
-
-    // collectives operation declarations
-    ccl::event barrier(const stream::impl_value_t& op_stream,
-                       const barrier_attr& attr,
-                       const vector_class<event>& deps = {}) override;
-    ccl::event barrier_impl(const stream::impl_value_t& op_stream,
-                            const barrier_attr& attr,
-                            const vector_class<event>& deps = {});
-
-    COMM_INTERFACE_COLL_METHODS(DEFINITION);
-#ifdef CCL_ENABLE_SYCL
-    SYCL_COMM_INTERFACE_COLL_METHODS(DEFINITION);
-#endif // CCL_ENABLE_SYCL
-
-    COMM_IMPL_DECLARATION;
-    COMM_IMPL_CLASS_DECLARATION
-    COMM_IMPL_SPARSE_DECLARATION;
-    COMM_IMPL_SPARSE_CLASS_DECLARATION
-
-    host_communicator();
-    host_communicator(int size, shared_ptr_class<ikvs_wrapper> kvs);
-    host_communicator(int size, int rank, shared_ptr_class<ikvs_wrapper> kvs);
-    host_communicator(ccl::unified_device_type&& device,
-                      ccl::unified_context_type&& context,
-                      std::shared_ptr<atl_wrapper> atl);
-    host_communicator(std::shared_ptr<atl_wrapper> atl);
-    host_communicator(std::shared_ptr<ccl_comm> impl, bool is_sub_communicator = false);
-    host_communicator(host_communicator& src) = delete;
-    host_communicator(host_communicator&& src) = default;
-    host_communicator& operator=(host_communicator& src) = delete;
-    host_communicator& operator=(host_communicator&& src) = default;
-    ~host_communicator() = default;
-    std::shared_ptr<atl_wrapper> get_atl();
-    std::shared_ptr<host_communicator> get_r2r_comm();
-    std::shared_ptr<host_communicator> get_node_comm();
-    std::shared_ptr<host_communicator> get_even_comm();
-    std::shared_ptr<host_communicator> get_pair_comm();
-    std::shared_ptr<ccl_comm> get_ccl_comm();
-
-    // troubleshooting
-    std::string to_string() const;
-
-private:
-    friend struct group_context;
-
-    std::shared_ptr<ccl_comm> comm_impl;
-
-    ccl::unified_device_type device;
-    //ccl::unified_context_type context;
-
-    std::shared_ptr<host_communicator> r2r_comm;
-    std::shared_ptr<host_communicator> node_comm;
-    std::shared_ptr<host_communicator> even_comm;
-    std::shared_ptr<host_communicator> pair_comm;
-    ccl::comm_split_attr comm_attr;
-    int comm_rank;
-    int comm_size;
-    ccl::group_unique_key owner_id;
-
-    host_communicator* get_impl() {
-        return this;
-    }
-
-    void exchange_colors(std::vector<int>& colors);
-    void create_sub_comms(std::shared_ptr<atl_wrapper> atl);
-    ccl_comm* create_with_color(int color,
-                                ccl_comm_id_storage* comm_ids,
-                                const ccl_comm* parent_comm);
-}; // class host_communicator
-
-} // namespace ccl
diff --git a/src/common/comm/l0/base_connector.hpp b/src/common/comm/l0/base_connector.hpp
deleted file mode 100644
index e3f6ebda9..000000000
--- a/src/common/comm/l0/base_connector.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-template <class visitor_to_connect>
-struct base_connector_interface {
-    using visitor = visitor_to_connect;
-
-    virtual ~base_connector_interface() noexcept = default;
-    virtual bool operator()(visitor_to_connect& to_connect) = 0;
-};
diff --git a/src/common/comm/l0/comm_context.cpp b/src/common/comm/l0/comm_context.cpp
deleted file mode 100644
index 196079c12..000000000
--- a/src/common/comm/l0/comm_context.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "oneapi/ccl/aliases.hpp"
-#include "common/comm/host_communicator/host_communicator.hpp"
-#include "common/comm/l0/comm_context_impl.hpp"
-#include "common/utils/spinlock.hpp"
-#include "common/comm/atl_tag.hpp"
-
-namespace ccl {
-comm_group::comm_group(shared_communicator_t parent_comm,
-                       size_t threads_per_process,
-                       size_t ranks_per_process,
-                       group_unique_key id)
-        : pimpl(new gpu_comm_attr(parent_comm, threads_per_process, ranks_per_process, id)){};
-
-bool comm_group::sync_group_size(size_t device_group_size) {
-    return pimpl->sync_group_size(device_group_size);
-}
-
-comm_group::~comm_group() {}
-
-const group_unique_key& comm_group::get_unique_id() const {
-    return pimpl->get_unique_id();
-}
-/*
-std::string comm_group::to_string() const
-{
-    pimpl->to_string();
-}*/
-} // namespace ccl
-// container-based method force-instantiation will trigger ALL other methods instantiations
-COMM_CREATOR_INDEXED_INSTANTIATION_CONTAINER(ccl::vector_class<ccl::device_index_type>,
-                                             typename ccl::unified_context_type::ccl_native_t);
-COMM_CREATOR_INDEXED_INSTANTIATION_CONTAINER(ccl::list_class<ccl::device_index_type>,
-                                             typename ccl::unified_context_type::ccl_native_t);
-COMM_CREATOR_INDEXED_INSTANTIATION_CONTAINER(ccl::device_indices_type,
-                                             typename ccl::unified_context_type::ccl_native_t);
-COMM_CREATOR_INDEXED_INSTANTIATION_TYPE(ccl::device_index_type,
-                                        typename ccl::unified_context_type::ccl_native_t);
-
-COMM_CREATOR_INDEXED_INSTANTIATION_CONTAINER(
-    ccl::vector_class<typename ccl::unified_device_type::ccl_native_t>,
-    typename ccl::unified_context_type::ccl_native_t);
-COMM_CREATOR_INDEXED_INSTANTIATION_TYPE(typename ccl::unified_device_type::ccl_native_t,
-                                        typename ccl::unified_context_type::ccl_native_t);
diff --git a/src/common/comm/l0/comm_context.hpp b/src/common/comm/l0/comm_context.hpp
deleted file mode 100644
index 070e75ab9..000000000
--- a/src/common/comm/l0/comm_context.hpp
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "oneapi/ccl/aliases.hpp"
-#include "oneapi/ccl/device_types.hpp"
-#include "oneapi/ccl/type_traits.hpp"
-#include "oneapi/ccl/types_policy.hpp"
-#include "oneapi/ccl/comm_split_attr_ids.hpp"
-#include "oneapi/ccl/comm_split_attr_ids_traits.hpp"
-#include "oneapi/ccl/comm_split_attr.hpp"
-
-#include "oneapi/ccl/coll_attr_ids.hpp"
-#include "oneapi/ccl/coll_attr_ids_traits.hpp"
-#include "oneapi/ccl/coll_attr.hpp"
-
-#include "oneapi/ccl/stream_attr_ids.hpp"
-#include "oneapi/ccl/stream_attr_ids_traits.hpp"
-#include "oneapi/ccl/stream.hpp"
-
-#include "oneapi/ccl/event.hpp"
-#include "oneapi/ccl/communicator.hpp"
-
-#include "common/comm/l0/comm_context_id.hpp"
-#include "common/comm/comm_interface.hpp"
-
-namespace ccl {
-namespace detail {
-class environment;
-}
-
-class host_communicator;
-struct gpu_comm_attr;
-using shared_communicator_t = std::shared_ptr<host_communicator>;
-
-class comm_group {
-public:
-    friend class ccl::detail::environment;
-    friend struct group_context;
-
-    using context_t = typename unified_context_type::ccl_native_t;
-
-    ~comm_group();
-    /**
-     * Device Communicator creation API: single communicator creation, based on @device
-     */
-    template <class DeviceType,
-              class ContextType,
-              typename std::enable_if<not std::is_same<typename std::remove_cv<DeviceType>::type,
-                                                       ccl::device_index_type>::value,
-                                      int>::type = 0>
-    ccl::communicator_interface_ptr create_communicator_from_group(
-        const DeviceType& device,
-        const ContextType& context,
-        const comm_split_attr& attr = ccl_empty_attr());
-
-    /**
-     * Device Communicator creation API: single communicator creation, based on index @device_id
-     */
-    template <class DeviceType,
-              class ContextType,
-              typename std::enable_if<std::is_same<typename std::remove_cv<DeviceType>::type,
-                                                   ccl::device_index_type>::value,
-                                      int>::type = 0>
-    ccl::communicator_interface_ptr create_communicator_from_group(
-        const DeviceType& device_id,
-        const ContextType& context,
-        const comm_split_attr& attr = ccl_empty_attr());
-
-    /**
-     * Device Communicator creation vectorized API:
-     * multiple communicator creation, based on devices iterator @InputIt
-     */
-    template <class InputIt, class ContextType>
-    std::vector<communicator> create_communicators_group(InputIt first,
-                                                         InputIt last,
-                                                         const ContextType& context,
-                                                         comm_split_attr attr = ccl_empty_attr());
-
-    /**
-     * Device Communicator creation vectorized API:
-     * multiple communicator creation, based on devices of @Type, packed into container @Container
-     */
-    template <template <class...> class Container, class Type, class ContextType>
-    std::vector<communicator> create_communicators_group(const Container<Type>& device_ids,
-                                                         const ContextType& context,
-                                                         comm_split_attr attr = ccl_empty_attr());
-
-    /**
-     * Return device context allocated during group creation
-     */
-    //context_native_const_reference_t get_context() const;
-
-    bool sync_group_size(size_t device_group_size);
-    /*
-    std::string to_string() const;
-*/
-    const group_unique_key& get_unique_id() const;
-
-private:
-    comm_group(ccl::shared_communicator_t comm,
-               size_t current_device_group_size,
-               size_t process_device_group_size,
-               group_unique_key id);
-    std::unique_ptr<gpu_comm_attr> pimpl;
-};
-} // namespace ccl
diff --git a/src/common/comm/l0/comm_context_id.hpp b/src/common/comm/l0/comm_context_id.hpp
deleted file mode 100644
index 18de22b04..000000000
--- a/src/common/comm/l0/comm_context_id.hpp
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/atl_tag.hpp"
-
-namespace ccl {
-using group_unique_key = ccl_comm_id_t;
-}
diff --git a/src/common/comm/l0/comm_context_impl.hpp b/src/common/comm/l0/comm_context_impl.hpp
deleted file mode 100644
index f35795bd1..000000000
--- a/src/common/comm/l0/comm_context_impl.hpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "oneapi/ccl/types.hpp"
-#include "oneapi/ccl/kvs.hpp"
-#include "common/log/log.hpp"
-#include "common/comm/host_communicator/host_communicator.hpp"
-#include "common/comm/l0/gpu_comm_attr.hpp"
-#include "common/comm/l0/comm_context.hpp"
-#include "common/global/global.hpp"
-
-namespace ccl {
-
-/*
- *  Single device communicator creation
- */
-template <class DeviceType,
-          class ContextType,
-          typename std::enable_if<not std::is_same<typename std::remove_cv<DeviceType>::type,
-                                                   ccl::device_index_type>::value,
-                                  int>::type>
-ccl::communicator_interface_ptr ccl::comm_group::create_communicator_from_group(
-    const DeviceType& device,
-    const ContextType& context,
-    const ccl::comm_split_attr& attr /* = comm_device_attr_t()*/) {
-#ifdef CCL_ENABLE_SYCL
-    static_assert(
-        std::is_same<DeviceType, cl::sycl::device>::value,
-        "ccl::comm_group::create_communicator_from_group() - supports SYCL devices at now");
-#endif
-
-    ccl::communicator_interface_ptr impl;
-    //TODO -S- temporary solution to support single device case
-    auto device_count_per_process = pimpl->get_expected_process_device_size();
-    LOG_DEBUG("create communicator from device, expected devices per process: ",
-              device_count_per_process);
-
-    auto host_comm = pimpl->get_host_communicator();
-
-    if (device_count_per_process == 1 /*&& !ccl::global_data::env().enable_comm_kernels*/) {
-        /* special single device case */
-        LOG_TRACE("create single device communicator from SYCL device");
-        //TODO
-        impl = ccl::communicator_interface::create_communicator_impl(device,
-                                                                     context,
-                                                                     host_comm->rank(),
-                                                                     host_comm->size(),
-                                                                     attr,
-                                                                     host_comm->get_atl(),
-                                                                     ccl::group_split_type::single);
-    }
-    else {
-        // multiple device case
-        impl = ccl::communicator_interface::create_communicator_impl(
-            device, context, pimpl->thread_id, host_comm->rank(), attr, host_comm->get_atl());
-
-        // registering device in group - is non blocking operation, until it is not the last device
-        pimpl->sync_register_communicator(impl);
-    }
-    return impl;
-}
-
-template <class DeviceType,
-          class ContextType,
-          typename std::enable_if<std::is_same<typename std::remove_cv<DeviceType>::type,
-                                               ccl::device_index_type>::value,
-                                  int>::type>
-ccl::communicator_interface_ptr ccl::comm_group::create_communicator_from_group(
-    const DeviceType& device_id,
-    const ContextType& context,
-    const ccl::comm_split_attr& attr /* = nullptr*/) {
-    LOG_TRACE("create communicator from id: ", device_id);
-    auto host_comm = pimpl->get_host_communicator();
-
-    ccl::communicator_interface_ptr impl = ccl::communicator_interface::create_communicator_impl(
-        device_id, context, pimpl->thread_id, host_comm->rank(), attr, host_comm->get_atl());
-    // registering device in group - is non blocking operation, until it is not the last device
-    pimpl->sync_register_communicator(impl);
-    return impl;
-}
-
-/**
- *  Multiple device communicators creation vectorized API implementation
- */
-template <class InputIt, class ContextType>
-std::vector<ccl::communicator> ccl::comm_group::create_communicators_group(
-    InputIt first,
-    InputIt last,
-    const ContextType& context,
-    ccl::comm_split_attr attr /* = nullptr*/) {
-    /*
-    static_assert(not std::is_same<InputIt, typename ccl::vector_class<cl::sycl::device>::const_iterator>::value, "SYCL");
-*/
-    using iterator_value_type = typename std::iterator_traits<InputIt>::value_type;
-    /*
-    using expected_value_type = typename unified_device_type::device_t;
-    static_assert(std::is_same<iterator_value_type, expected_value_type>::value,
-                  "Not valid InputIt in create_communicators");
-*/
-    size_t indices_count = std::distance(first, last);
-    LOG_TRACE("create device communicators from index iterators type, count: ", indices_count);
-
-    std::vector<ccl::communicator> comms;
-    comms.reserve(indices_count);
-    std::transform(first,
-                   last,
-                   std::back_inserter(comms),
-                   [this, attr, &context](const iterator_value_type& device_id) {
-                       return ccl::communicator(
-                           create_communicator_from_group<iterator_value_type, ContextType>(
-                               device_id, context, attr));
-                   });
-    return comms;
-}
-
-template <template <class...> class Container, class Type, class ContextType>
-std::vector<ccl::communicator> ccl::comm_group::create_communicators_group(
-    const Container<Type>& device_ids,
-    const ContextType& context,
-    ccl::comm_split_attr attr /* = nullptr*/) {
-    //static_assert(not std::is_same<Type, cl::sycl::device>::value, "SYCL cont");
-    //static_assert(std::is_same<Type, ccl::device_index_type>::value, "Invalid Type in create_communicators");
-    LOG_TRACE("create device communicators from index type, count: ",
-              device_ids.size(),
-              ", redirect to iterators version");
-    return this->create_communicators_group<typename Container<Type>::const_iterator, ContextType>(
-        device_ids.begin(), device_ids.end(), context, attr);
-}
-/*
- ccl::comm_group::context_native_const_reference_t ccl::comm_group::get_context() const
-{
-    //TODO use PIMPL as context provider
-    static unified_context_type context;
-    return context.get();
-}
-*/
-} // namespace ccl
-
-/***************************************************************************************************/
-#define COMM_CREATOR_INDEXED_INSTANTIATION_CONTAINER(type, context_type) \
-    template ccl::vector_class<ccl::communicator> ccl::comm_group::create_communicators_group( \
-        const type& devices, const context_type& ctx, ccl::comm_split_attr attr);
-
-#define COMM_CREATOR_INDEXED_INSTANTIATION_TYPE(type, context_type) \
-    template ccl::communicator_interface_ptr ccl::comm_group::create_communicator_from_group( \
-        const type& device, const context_type& context, const ccl::comm_split_attr& attr);
diff --git a/src/common/comm/l0/comm_context_storage.cpp b/src/common/comm/l0/comm_context_storage.cpp
deleted file mode 100644
index 1c31b1e4a..000000000
--- a/src/common/comm/l0/comm_context_storage.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "common/comm/host_communicator/host_communicator.hpp"
-
-#include "common/comm/comm.hpp"
-#include "common/comm/l0/comm_context.hpp"
-#include "common/comm/l0/comm_context_storage.hpp"
-
-#include "common/global/global.hpp"
-
-namespace ccl {
-group_context& group_context::instance() {
-    static group_context inst;
-    return inst;
-}
-
-group_context::comm_group_t group_context::group_by_kvs(
-    const std::vector<int>& local_thread_device_group_ranks,
-    int cluster_device_group_size,
-    std::shared_ptr<ikvs_wrapper> kvs) {
-    LOG_DEBUG("thread acquire by barrier");
-    std::shared_ptr<atl_wrapper> atl = std::shared_ptr<atl_wrapper>(
-        new atl_wrapper(cluster_device_group_size, local_thread_device_group_ranks, kvs));
-
-    /* Indicate that multiple devices are not supported, don't throw anything if enable_comm_kernels=0
-     * to enable our testing with partial functionality.
-     * Most of the cases are handled in communicator_impl_details.hpp, but here we check the case
-     * when we have multiple threads and each of them has 1 device. And we don't know the total number
-     * of ranks in the process until we sync them above */
-    if (atl->get_ranks_per_process() > 1 /* && !ccl::global_data::env().enable_comm_kernels*/) {
-        throw ccl::unimplemented("API", "create_communicators", "for multiple devices");
-    }
-
-    LOG_DEBUG("thread released by barrier");
-    LOG_DEBUG("cluster_device_group size: ",
-              cluster_device_group_size,
-              "\nthread device group ranks size: ",
-              local_thread_device_group_ranks.size());
-    for (size_t i = 0; i < local_thread_device_group_ranks.size(); i++) {
-        LOG_DEBUG("\nlocal thread device group ranks: ", local_thread_device_group_ranks[i]);
-    }
-
-    // register group slot in global context table, based on communicator id
-    comm_group_t group = group_context::group_by_comm(atl);
-
-    // if (ccl::global_data::env().enable_comm_kernels) {
-    //     // sync existing group: blocking operation - wait for all groups
-    //     LOG_DEBUG("group thread barrier acquired: ", static_cast<void*>(group.get()));
-    //     group->sync_group_size(local_thread_device_group_ranks.size());
-    //     LOG_DEBUG("group thread barrier released: ", static_cast<void*>(group.get()));
-    // }
-
-    return group;
-}
-
-group_context::comm_group_t group_context::group_by_comm(std::shared_ptr<atl_wrapper> atl) {
-    std::stringstream ss;
-    ss << "\n{\n"
-       << "  ATL info:\n"
-       << "    rank: " << atl->get_rank() << "\n"
-       << "    size: " << atl->get_size() << "\n"
-       << "    id: " << atl->get_id() << "\n"
-       << "    ranks per process: " << atl->get_ranks_per_process() << "\n"
-       << "    threads per process: " << atl->get_threads_per_process() << "\n"
-       << "}";
-    LOG_INFO(ss.str());
-
-    comm_group_t group;
-    {
-        // mutex
-        std::unique_lock<ccl_spinlock> lock(mutex);
-        size_t threads_per_process = atl->get_threads_per_process();
-        size_t ranks_per_process = atl->get_ranks_per_process();
-        group_context::group_unique_key unique_id = atl->get_id();
-
-        auto ctx_it = communicator_group_map.find(unique_id);
-        if (ctx_it == communicator_group_map.end()) {
-            std::shared_ptr<host_communicator> host_comm = std::make_shared<host_communicator>(atl);
-            group.reset(
-                new ccl::comm_group(host_comm, threads_per_process, ranks_per_process, unique_id));
-            communicator_group_map.insert({ unique_id, group });
-            LOG_DEBUG("comm group: ",
-                      static_cast<void*>(group.get()),
-                      " has been created for unique_id: ",
-                      unique_id,
-                      ", threads per process: ",
-                      threads_per_process,
-                      ", ranks per process: ",
-                      ranks_per_process);
-        }
-        else {
-            group = ctx_it->second;
-            LOG_DEBUG("get existing comm group: ",
-                      static_cast<void*>(group.get()),
-                      " for unique_id: ",
-                      unique_id);
-        }
-    }
-    return group;
-}
-
-group_context::comm_group_t group_context::get_existing_group_by_id(
-    const group_unique_key& unique_id) {
-    comm_group_t group;
-    LOG_DEBUG("get existing comm group by id: ",
-              unique_id,
-              ", total groups: ",
-              communicator_group_map.size());
-    {
-        std::unique_lock<ccl_spinlock> lock(mutex);
-        auto ctx_it = communicator_group_map.find(unique_id);
-        if (ctx_it == communicator_group_map.end()) {
-            std::stringstream ss;
-            ss << "Cannot find `comm_group_t` by id: " << unique_id << std::endl;
-            const std::string mess = ss.str();
-            LOG_ERROR(mess);
-            throw ccl::exception(std::string(__FUNCTION__) + " - " + mess);
-        }
-        else {
-            group = ctx_it->second;
-            LOG_DEBUG("get existing comm group: ",
-                      static_cast<void*>(group.get()),
-                      " for unique_id: ",
-                      unique_id);
-        }
-    }
-    return group;
-}
-} // namespace ccl
diff --git a/src/common/comm/l0/comm_context_storage.hpp b/src/common/comm/l0/comm_context_storage.hpp
deleted file mode 100644
index 0ae775677..000000000
--- a/src/common/comm/l0/comm_context_storage.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <map>
-#include <memory>
-
-#include "common/utils/spinlock.hpp"
-#include "common/comm/atl_tag.hpp"
-#include "atl/atl_wrapper.h"
-
-namespace ccl {
-namespace v1 {
-class kvs_interface;
-}
-
-class host_communicator;
-class comm_group;
-
-struct group_context {
-    /* TODO
-     * In multithreading scenario we use different comm_group_t objects in different threads.
-     * But we need to match different groups created for the same world in different threads
-     * The assumption is done: if different groups created from the same communicator color, than they
-     * should be interpreted as the same groups in the same world.
-     *
-     *
-     * In the final solution the 'group_unique_key' should be equal to unique KVS idenditifier
-     */
-    //    using group_unique_key = typename ccl::ccl_host_attributes_traits<ccl_host_color>::type;
-    using group_unique_key = ccl_comm_id_t;
-    using comm_group_t = std::shared_ptr<comm_group>;
-    std::map<group_unique_key, comm_group_t> communicator_group_map;
-    ccl_spinlock mutex;
-
-    comm_group_t group_by_kvs(const std::vector<int>& local_thread_device_group_ranks,
-                              int cluster_device_group_size,
-                              std::shared_ptr<ikvs_wrapper> kvs);
-    comm_group_t group_by_comm(std::shared_ptr<atl_wrapper> atl);
-    comm_group_t get_existing_group_by_id(const group_unique_key& id);
-    static group_context& instance();
-
-private:
-    group_context() = default;
-    group_context(group_context&& src) = delete;
-    group_context& operator=(group_context&& src) = delete;
-};
-} // namespace ccl
diff --git a/src/common/comm/l0/communicator/base_communicator.hpp b/src/common/comm/l0/communicator/base_communicator.hpp
deleted file mode 100644
index ef315d4a1..000000000
--- a/src/common/comm/l0/communicator/base_communicator.hpp
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <mutex>
-
-#include "common/comm/comm_interface.hpp"
-//TODO #include "sched/gpu_sched.hpp"
-#include "common/comm/l0/comm_context_id.hpp"
-
-struct base_communicator : public ccl::communicator_interface {
-    //TODO using group_comm_storage = native::specific_indexed_device_storage;
-
-    base_communicator(ccl::unified_device_type&& owned_device,
-                      ccl::unified_context_type&& owned_ctx,
-                      size_t thread_idx,
-                      size_t process_idx,
-                      const ccl::comm_split_attr& attr)
-            : device(std::move(owned_device)),
-              context(std::move(owned_ctx)),
-              thread_id(thread_idx),
-              process_id(process_idx),
-              comm_attr(attr),
-              comm_rank(),
-              comm_size(),
-              ready_mutex() /*,
-        devices(nullptr)*/
-    {}
-
-    virtual ~base_communicator() = default;
-
-    int rank() const override {
-        return comm_rank;
-    }
-
-    int size() const override {
-        return comm_size;
-    }
-
-    ccl::device_index_type get_device_path() const override {
-        return device.get_id();
-    }
-
-    ccl::communicator_interface::device_t get_device() const override {
-        return device.get();
-    }
-
-    ccl::communicator_interface::context_t get_context() const override {
-        return context.get();
-    }
-
-    const ccl::comm_split_attr& get_comm_split_attr() const override {
-        return comm_attr;
-    }
-
-    const ccl::group_unique_key& get_comm_group_id() const override {
-        return owner_id;
-    }
-
-    void set_comm_group_id(ccl::group_unique_key id) {
-        owner_id = id;
-    }
-    /*
-    virtual bool is_ready() const
-    {
-        if(!devices)
-        {
-            std::unique_lock<ccl_spinlock> lock(ready_mutex);
-            return devices;
-        }
-        return true;
-    }
-*/
-    ccl::unified_device_type device;
-    ccl::unified_context_type context;
-    size_t thread_id;
-    size_t process_id;
-    const ccl::comm_split_attr comm_attr;
-
-    //TODO add context_comm_addr to aggregate device_id,thread_id, process_id & ranks
-    int comm_rank;
-    int comm_size;
-
-    mutable ccl_spinlock ready_mutex;
-
-    ccl::group_unique_key owner_id;
-};
diff --git a/src/common/comm/l0/communicator/typed_base_communicator.hpp b/src/common/comm/l0/communicator/typed_base_communicator.hpp
deleted file mode 100644
index 4bed58c10..000000000
--- a/src/common/comm/l0/communicator/typed_base_communicator.hpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include "common/comm/l0/communicator/base_communicator.hpp"
-#include "common/comm/l0/device_community_holder.hpp"
-#include "native_device_api/compiler_ccl_wrappers_dispatcher.hpp"
-
-template <class comm_impl,
-          ccl::group_split_type group_id,
-          ccl::device_topology_type class_id,
-          class communicator_traits>
-class typed_base_communicator : public base_communicator {
-public:
-    using base_t = base_communicator;
-    using impl_t = comm_impl;
-    using self_t = typed_base_communicator<comm_impl, group_id, class_id, communicator_traits>;
-    using traits = communicator_traits;
-
-    // Topologies
-    static constexpr ccl::group_split_type topology_type() {
-        return group_id;
-    }
-
-    static constexpr ccl::device_topology_type topology_class() {
-        return class_id;
-    }
-
-    // traits
-    bool is_host() const noexcept override {
-        return traits::is_host();
-    }
-
-    bool is_cpu() const noexcept override {
-        return traits::is_cpu();
-    }
-
-    bool is_gpu() const noexcept override {
-        return traits::is_gpu();
-    }
-
-    bool is_accelerator() const noexcept override {
-        return traits::is_accelerator();
-    }
-
-    ccl::communicator_interface_ptr split(const ccl::comm_split_attr& attr) override;
-
-    typed_base_communicator(ccl::unified_device_type&& device,
-                            ccl::unified_context_type&& ctx,
-                            size_t thread_idx,
-                            size_t process_idx,
-                            const ccl::comm_split_attr& attr);
-
-    ccl::group_split_type get_topology_type() const override;
-    ccl::device_topology_type get_topology_class() const override;
-
-    void initialize_comm_addr(const ccl::device_index_type& device_id,
-                              native::device_community_container<class_id>& community);
-
-    bool is_ready() const override;
-
-    native::ccl_driver_context_ptr get_native_context() {
-        return native::get_runtime_context(context.get());
-    }
-
-    COMM_INTERFACE_COLL_METHODS(DEFINITION);
-#ifdef CCL_ENABLE_SYCL
-    SYCL_COMM_INTERFACE_COLL_METHODS(DEFINITION);
-#endif // CCL_ENABLE_SYCL
-
-    // Device community interface
-    /*    template<class device_t>
-    size_t get_device_count() const;
-
-    template<class device_t>
-    native::indexed_device_container<device_t>& get_devices();
-*/
-    // troubleshooting
-    std::string to_string() const;
-
-    native::device_community_container<class_id> device_community_impl;
-
-    impl_t* get_impl() {
-        return static_cast<impl_t*>(this);
-    }
-
-    /*
-    native::binder_t bind_device;
-    native::binder_t& get_communication_device()
-    {
-        return bind_device;
-    }*/
-};
diff --git a/src/common/comm/l0/communicator/typed_base_communicator_impl.hpp b/src/common/comm/l0/communicator/typed_base_communicator_impl.hpp
deleted file mode 100644
index 1f9d9e0fc..000000000
--- a/src/common/comm/l0/communicator/typed_base_communicator_impl.hpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "oneapi/ccl/types.hpp"
-#include "oneapi/ccl/type_traits.hpp"
-#include "common/comm/l0/communicator/typed_base_communicator.hpp"
-#include "common/comm/l0/gpu_comm_attr.hpp"
-#include "common/comm/l0/context/thread_group_ctx.hpp"
-#include "common/comm/l0/context/process_group_ctx.hpp"
-#include "common/comm/l0/comm_context_storage.hpp"
-#include "common/comm/l0/comm_context.hpp"
-
-#define TEMPLATE_DECL_ARG \
-    class comm_impl, ccl::group_split_type topology, ccl::device_topology_type class_id, \
-        class communicator_traits
-#define TEMPLATE_DEF_ARG comm_impl, topology, class_id, communicator_traits
-
-template <TEMPLATE_DECL_ARG>
-typed_base_communicator<TEMPLATE_DEF_ARG>::typed_base_communicator(
-    ccl::unified_device_type&& owned_device,
-    ccl::unified_context_type&& owned_ctx,
-    size_t thread_idx,
-    size_t process_idx,
-    const ccl::comm_split_attr& attr)
-        : base_communicator(std::move(owned_device),
-                            std::move(owned_ctx),
-                            thread_idx,
-                            process_idx /*, comm_attr*/,
-                            attr) {
-    try {
-        LOG_DEBUG("sheduled for create, device id: ",
-                  device.get_id(),
-                  ", thread_id: ",
-                  thread_idx,
-                  ", process id:",
-                  process_idx);
-    }
-    catch (...) {
-        LOG_DEBUG("sheduled for create single device communicator , thread_id: ",
-                  thread_idx,
-                  ", process id:",
-                  process_idx);
-    }
-}
-
-template <TEMPLATE_DECL_ARG>
-void typed_base_communicator<TEMPLATE_DEF_ARG>::initialize_comm_addr(
-    const ccl::device_index_type& device_id,
-    native::device_community_container<class_id>& new_community) {
-    // Iterate over community container, find device and assing rank, size from topology.
-    // Lets register woned deive in each toplogy, but return as PUBLIC the only onw
-    // It is not matter, what speficic topology use select here for PUBLIC rank & size
-    // for rank assigning in auto-ranking mode. SO, use clsoed ring t first, then goesn into torn_apart
-
-    ccl::context_comm_addr registered_addr;
-    native::detail::printer<topology_type(), topology_class()> p;
-    {
-        std::unique_lock<ccl_spinlock> lock(ready_mutex);
-
-        auto& binder = get_impl()->get_communication_device();
-        device_community_impl = new_community;
-        device_community_impl.template bind_device_by_id<topology_type()>(
-            device_id, registered_addr, binder
-            /*TODO PUT your preferred rank here*/);
-        // print assigned device from topology
-        ccl_tuple_for_each(binder, p);
-    }
-
-    //TODO multiple topologies in curr class_id
-    comm_rank = registered_addr.comm_rank;
-    comm_size = registered_addr.comm_size;
-
-    LOG_DEBUG("Communicator finalized. Rank (",
-              comm_rank,
-              "/",
-              comm_size,
-              ") on {dev: ",
-              device_id,
-              ", thr: ",
-              thread_id,
-              ", proc: ",
-              process_id,
-              "} on device:\n",
-              p.to_string());
-}
-
-template <TEMPLATE_DECL_ARG>
-bool typed_base_communicator<TEMPLATE_DEF_ARG>::is_ready() const {
-    /* TODO!!!!
-    if(!device_community_impl.get())
-    {
-        std::unique_lock<ccl_spinlock> lock(ready_mutex);
-        return device_community_impl.get();
-    }
-    */
-    return true;
-}
-
-template <TEMPLATE_DECL_ARG>
-ccl::group_split_type typed_base_communicator<TEMPLATE_DEF_ARG>::get_topology_type() const {
-    return self_t::topology_type();
-}
-
-template <TEMPLATE_DECL_ARG>
-ccl::device_topology_type typed_base_communicator<TEMPLATE_DEF_ARG>::get_topology_class() const {
-    return self_t::topology_class();
-}
-/*
-template<TEMPLATE_DECL_ARG>
-template<class device_t>
-size_t typed_base_communicator<TEMPLATE_DEF_ARG>::get_device_count() const
-{
-    return ccl_tuple_get<native::indexed_device_container<device_t>>(device_community_impl->get_device_storage()).size();
-}
-
-template<TEMPLATE_DECL_ARG>
-template<class device_t>
-native::indexed_device_container<device_t>& typed_base_communicator<TEMPLATE_DEF_ARG>::get_devices()
-{
-    return std::get<device_t::type_idx()>(device_community_impl->get_device_storage());
-}
-*/
-template <TEMPLATE_DECL_ARG>
-std::string typed_base_communicator<TEMPLATE_DEF_ARG>::to_string() const {
-    native::detail::printer<self_t::topology_type(), self_t::topology_class()> p;
-    ccl_tuple_for_each(device_community_impl->get_device_storage(), p);
-    return std::string("Rank (") + std::to_string(rank()) + "/" + std::to_string(size()) +
-           "\nGroup id: " + ::to_string(self_t::topology_type()) +
-           "\nClassId: " + ::to_string(self_t::topology_class()) + ":\n" + p.to_string();
-}
-
-template <TEMPLATE_DECL_ARG>
-ccl::communicator_interface_ptr typed_base_communicator<TEMPLATE_DEF_ARG>::split(
-    const ccl::comm_split_attr& attr) {
-    if (!attr.is_valid<ccl::comm_split_attr_id::group>()) {
-        throw ccl::exception(std::string(__FUNCTION__) +
-                             " - TODO `comm_split_attr`: supports `group` only");
-    }
-//TODO
-#ifdef MULTI_GPU_SUPPORT
-    auto id = get_impl()->get_comm_group_id();
-    ccl::group_context::comm_group_t my_group =
-        ccl::group_context::instance().get_existing_group_by_id(id);
-#ifdef CCL_ENABLE_SYCL
-    auto ctx = get_impl()->get_context();
-    return my_group->create_communicator_from_group<cl::sycl::device>(get_device(), ctx, attr);
-#else
-#ifdef MULTI_GPU_SUPPORT
-    auto ctx = get_impl()->get_context();
-    return my_group->create_communicator_from_group(get_impl()->get_device_path(), ctx, attr);
-#endif
-#endif
-#else
-    throw ccl::exception(std::string(__FUNCTION__) + " - TODO `comm_split_attr`: unsupported");
-    return this;
-#endif
-}
-
-#undef TEMPLATE_DECL_ARG
-#undef TEMPLATE_DEF_ARG
diff --git a/src/common/comm/l0/context/base_ctx_actor.hpp b/src/common/comm/l0/context/base_ctx_actor.hpp
deleted file mode 100644
index d7425de3c..000000000
--- a/src/common/comm/l0/context/base_ctx_actor.hpp
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <atomic>
-#include <condition_variable>
-#include <list>
-#include <mutex>
-#include <unordered_map>
-#include <thread>
-#include "common/utils/spinlock.hpp"
-
-namespace native {
-namespace observer {
-
-template <class message_type>
-struct actor {
-    using message_value_type = message_type;
-    using storage_t = std::list<message_value_type>;
-    using key_t = size_t;
-    using core_t = std::function<void(storage_t& to_do_list)>;
-
-    template <class Function, class... Args>
-    actor(key_t actor_id, Function&& f, Args&&... args)
-            : function(std::bind(std::forward<Function>(f),
-                                 std::forward<Args>(args)...,
-                                 this,
-                                 std::placeholders::_1)),
-              stop(false),
-              processing(&actor<message_type>::run, this),
-              id(actor_id) {}
-
-    virtual ~actor() {
-        stop.store(true);
-        if (processing.joinable()) {
-            processing.join();
-        }
-    }
-
-    key_t get_id() const {
-        return id;
-    }
-
-    template <class typed_message_t>
-    void start_job(typed_message_t&& m) {
-        {
-            std::unique_lock<std::mutex> l(mutex);
-            messages.push_back(std::forward<typed_message_t>(m));
-            condition.notify_all();
-        }
-    }
-
-protected:
-    template <class Derived, class Function, class... Args>
-    actor(Derived* child, key_t actor_id, Function&& f, Args&&... args)
-            : function(std::bind(std::forward<Function>(f),
-                                 std::forward<Args>(args)...,
-                                 child,
-                                 std::placeholders::_1)),
-              stop(false),
-              processing(&actor<message_type>::run, this),
-              id(actor_id) {}
-
-private:
-    core_t function;
-    storage_t messages;
-    std::condition_variable condition;
-    std::mutex mutex;
-
-    std::atomic<bool> stop;
-    std::thread processing;
-    key_t id;
-
-    virtual void run() {
-        storage_t to_do_list;
-        while (!stop.load()) {
-            {
-                std::unique_lock<std::mutex> lk(mutex);
-                condition.wait(lk, [this]() {
-                    return !messages.empty();
-                });
-
-                to_do_list.splice(to_do_list.end(), messages);
-            }
-
-            function(to_do_list);
-        }
-    }
-};
-
-template <class message_type, class mailbox_message_type>
-struct subscribed_actor : public actor<message_type> {
-    using base_t = actor<message_type>;
-    using self_t = subscribed_actor<message_type, mailbox_message_type>;
-    using mailbox_message_t = mailbox_message_type;
-
-    struct mailbox_message_storage_t {
-        std::list<mailbox_message_t> container;
-        ccl_spinlock lock;
-        std::atomic<size_t> messages_count;
-    };
-
-    using recipient_storage_t = std::map<key_t, self_t*>;
-    using mailbox_table_t = std::unordered_map<key_t, std::unique_ptr<mailbox_message_storage_t>>;
-
-    template <class Function, class... Args>
-    subscribed_actor(key_t actor_id, Function&& f, Args&&... args)
-            : base_t(this, actor_id, std::forward<Function>(f), std::forward<Args>(args)...) {}
-
-    virtual ~subscribed_actor() {}
-
-    void subscribe_on(subscribed_actor<message_type, mailbox_message_t>* act) {
-        if (!act) {
-            return;
-        }
-
-        // rememeber as recipient
-        {
-            std::unique_lock<ccl_spinlock> lock(recipients_lock);
-            recipients[act->get_id()] = act;
-        }
-        act->subscribe_on(this);
-
-        // initialize message table
-        {
-            std::unique_lock<ccl_spinlock> lock(table_lock);
-            inner_message_table[act->get_id()].reset(new mailbox_message_storage_t);
-
-            // increase subscriptions count
-            subscriptions_table_size.fetch_add(1);
-        }
-    }
-
-    template <class... message_args>
-    void put_message(key_t sender_id, size_t topic_id, message_args&&... args) {
-        typename mailbox_table_t::iterator recipient_table_it;
-        {
-            std::unique_lock<ccl_spinlock> l(table_lock);
-            recipient_table_it = inner_message_table.find(sender_id);
-            if (recipient_table_it == inner_message_table.end()) {
-                throw std::runtime_error("Unregistered recipient");
-            }
-        }
-
-        // increase total messages count before
-        mailbox_message_counter.fetch_add(1);
-
-        std::unique_ptr<mailbox_message_storage_t>& mailbox = recipient_table_it->second;
-        (void)topic_id;
-        {
-            std::unique_lock<ccl_spinlock> l(mailbox->lock);
-            mailbox->container.emplace_back(std::forward<message_args>(args)...);
-
-            // increase actual sedner message count
-            mailbox->messages_count.fetch_add(1);
-        }
-    }
-
-    size_t get_subscriptions_count() const {
-        return subscriptions_table_size.load();
-    }
-
-    size_t get_mailbox_messages_count() const {
-        return mailbox_message_counter.load();
-    }
-
-    void get_mailbox_messages(key_t sender_id,
-                              size_t topic_id,
-                              std::list<mailbox_message_t>& messages) {
-        typename mailbox_table_t::iterator recipient_table_it;
-        {
-            std::unique_lock<ccl_spinlock> l(table_lock);
-            recipient_table_it = inner_message_table.find(sender_id);
-            if (recipient_table_it == inner_message_table.end()) {
-                throw std::runtime_error("Unregistered recipient");
-            }
-        }
-
-        std::unique_ptr<mailbox_message_storage_t>& mailbox = recipient_table_it->second;
-        (void)topic_id;
-        {
-            // check on message existence from sender
-            if (mailbox->messages_count.load()) {
-                std::unique_lock<ccl_spinlock> l(mailbox->lock);
-                mailbox->container.swap(messages);
-
-                // decreae total mesages count
-                size_t sender_messages_read_count = mailbox->messages_count.exchange(0);
-                mailbox_message_counter.fetch_sub(sender_messages_read_count);
-            }
-        }
-    }
-
-private:
-    recipient_storage_t recipients;
-    ccl_spinlock recipients_lock;
-
-    mailbox_table_t inner_message_table;
-    ccl_spinlock table_lock;
-    std::atomic<size_t> subscriptions_table_size;
-
-    std::atomic<size_t> mailbox_message_counter;
-};
-} // namespace observer
-} // namespace native
diff --git a/src/common/comm/l0/context/base_scaling_ctx.hpp b/src/common/comm/l0/context/base_scaling_ctx.hpp
deleted file mode 100644
index d855c8219..000000000
--- a/src/common/comm/l0/context/base_scaling_ctx.hpp
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <memory>
-#include <stdexcept>
-#include <thread>
-#include <tuple>
-#include <type_traits>
-#include <vector>
-
-#include "common/comm/l0/devices/proxy_observer.hpp"
-#include "common/comm/l0/context/base_ctx_actor.hpp"
-
-namespace native {
-
-template <class device>
-class ccl_gpu_scaleup_proxy;
-
-namespace observer {
-
-template <class device_t, class actor_t>
-using device_thread_map = std::map<device_t*, std::unique_ptr<actor_t>>;
-
-template <class actor_t, class... devices_types>
-using multiple_device_thread_map_t = std::tuple<device_thread_map<devices_types, actor_t>...>;
-
-template <class device_t>
-using proxy_observer_ptr = typename std::add_pointer<device_t>::type;
-
-template <class device_t>
-using container_t = std::set<proxy_observer_ptr<device_t>>;
-
-template <class... device_t>
-using container_tuple_t = std::tuple<container_t<device_t>...>;
-
-template <class device_t>
-using indexed_container_t = std::map<size_t /* rank */, proxy_observer_ptr<device_t>>;
-
-template <class... device_t>
-using indexed_container_tuple_t = std::tuple<indexed_container_t<device_t>...>;
-
-// Static interface used to register proxy_observers
-template <class ctx_impl_t, class... proxy_observer_device_t>
-class base_scaling_ctx {
-public:
-    using own_t = base_scaling_ctx<ctx_impl_t, proxy_observer_device_t...>;
-
-    using device_types_t = std::tuple<proxy_observer_device_t...>;
-
-    template <ccl::device_topology_type class_id>
-    struct observables_types : container_tuple_t<proxy_observer_device_t...> {};
-
-    template <ccl::device_topology_type class_id>
-    struct indexed_observables_types : indexed_container_tuple_t<proxy_observer_device_t...> {};
-
-    template <ccl::device_topology_type... class_id>
-    using observable_topologies = std::tuple<observables_types<class_id>...>;
-
-    /* TODO use templated tepmlated container */
-    template <ccl::device_topology_type... class_id>
-    using indexed_observable_topologies = std::tuple<indexed_observables_types<class_id>...>;
-
-    template <class device_t>
-    static constexpr bool is_registered_device_t() {
-        return is_one_of<device_t, proxy_observer_device_t...>::value;
-    }
-
-    ctx_impl_t* get_this() {
-        return static_cast<ctx_impl_t*>(this);
-    }
-
-    const ctx_impl_t* get_this() const {
-        return static_cast<const ctx_impl_t*>(this);
-    }
-
-    template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class device_t>
-    void attach(device_t* obj) {
-        static_assert(std::is_base_of<proxy_observer<device_t>, device_t>::value,
-                      "Only `proxy_observer` derived class can be attached to context");
-
-        get_this()->attach_ctx_observer(
-            std::numeric_limits<size_t>::max(), /* unassigned addr at moment */
-            obj,
-            std::integral_constant<ccl::device_topology_type, class_id>{});
-    }
-
-    /* Workaround:
-     * topology constructor invoke `attach` straight toward after observer device creation
-     * But there are unassigneed rank addr in this case
-     * Rank will be assigned after indexer execution in topology constructor
-     * Need to remove `attach_ctx_observer` with  unassigned addr version and use assigning after indexer only
-     */
-
-    template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class device_t>
-    void reattach_with_addr(size_t rank, device_t* obj) {
-        static_assert(std::is_base_of<proxy_observer<device_t>, device_t>::value,
-                      "Only `proxy_observer` derived class can be attached to context");
-
-        get_this()->attach_ctx_observer(
-            rank, obj, std::integral_constant<ccl::device_topology_type, class_id>{});
-    }
-
-    template <class device_t,
-              class = typename std::enable_if<is_registered_device_t<device_t>()>::type>
-    own_t* get_ctx_selector() {
-        return this;
-    }
-
-    template <ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class device_t,
-              class... Args>
-    //class = typename std::enable_if<is_registered_device_t<device_t>()>::type>
-    void invoke_proxy(device_t* obj, Args&&... args) {
-        static_assert(is_one_of<device_t, proxy_observer_device_t...>::value, "Unsupported");
-        static_assert(std::is_base_of<proxy_observer<device_t>, device_t>::value,
-                      "Only `proxy_observer` derived class can invoke context");
-
-        get_this()->invoke_ctx_observer(
-            obj,
-            // TODO std::integral_constant<ccl::group_split_type, group_id>{},
-            std::integral_constant<ccl::device_topology_type, class_id>{},
-            std::forward<Args>(args)...);
-    }
-
-    // helpers
-    template <ccl::device_topology_type specific_type, ccl::device_topology_type... class_id>
-    static observables_types<specific_type>& get_types(
-        observable_topologies<class_id...>& tops) noexcept {
-        return ccl_tuple_get<observables_types<specific_type>>(tops);
-    }
-
-    template <class observer_device_t, ccl::device_topology_type specific_type>
-    container_t<observer_device_t>& get_container(
-        observables_types<specific_type>& types) noexcept {
-        return ccl_tuple_get<container_t<observer_device_t>>(types);
-    }
-
-    template <class observer_device_t,
-              ccl::device_topology_type specific_type,
-              ccl::device_topology_type... class_id>
-    container_t<observer_device_t>& get_types_container(
-        observable_topologies<class_id...>& tops) noexcept {
-        return get_container<observer_device_t>(get_types<specific_type>(tops));
-    }
-
-    template <ccl::device_topology_type specific_type, ccl::device_topology_type... class_id>
-    static indexed_observables_types<specific_type>& get_types(
-        indexed_observable_topologies<class_id...>& tops) noexcept {
-        return ccl_tuple_get<indexed_observables_types<specific_type>>(tops);
-    }
-
-    template <class observer_device_t, ccl::device_topology_type specific_type>
-    indexed_container_t<observer_device_t>& get_container(
-        indexed_observables_types<specific_type>& types) noexcept {
-        return ccl_tuple_get<indexed_container_t<observer_device_t>>(types);
-    }
-
-    template <class observer_device_t,
-              ccl::device_topology_type specific_type,
-              ccl::device_topology_type... class_id>
-    indexed_container_t<observer_device_t>& get_types_container(
-        indexed_observable_topologies<class_id...>& tops) noexcept {
-        return get_container<observer_device_t>(get_types<specific_type>(tops));
-    }
-};
-
-namespace detail {
-
-struct actor_visitor {
-    template <class device_t, class actor_t>
-    void operator()(device_thread_map<device_t, actor_t>& actors, actor_t* subscriber) {
-        for (auto& a : actors) {
-            a.second->subscribe_on(subscriber);
-        }
-    }
-};
-
-template <class message_type, class mailbox_message_type>
-struct actor_publisher {
-    template <class device_t, class... message_args>
-    void operator()(
-        device_thread_map<device_t, subscribed_actor<message_type, mailbox_message_type>>& actors,
-        size_t topic_tag,
-        size_t publisher_id,
-        message_args&&... args) {
-        for (auto& a : actors) {
-            a.second->put_message(publisher_id, topic_tag, std::forward<message_args>(args)...);
-        }
-    }
-};
-} // namespace detail
-} // namespace observer
-} // namespace native
diff --git a/src/common/comm/l0/context/context_barrier.hpp b/src/common/comm/l0/context/context_barrier.hpp
deleted file mode 100644
index 5590fee6b..000000000
--- a/src/common/comm/l0/context/context_barrier.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <mutex>
-#include <condition_variable>
-
-namespace native {
-struct signal_context {
-    std::mutex thread_group_mutex;
-    std::condition_variable thread_group_sync_condition;
-    bool communicator_ready = false;
-};
-} // namespace native
diff --git a/src/common/comm/l0/context/device_group_ctx.cpp b/src/common/comm/l0/context/device_group_ctx.cpp
deleted file mode 100644
index b6746911f..000000000
--- a/src/common/comm/l0/context/device_group_ctx.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include <sstream>
-
-#include "common/comm/l0/devices/devices_declaration.hpp"
-#include "common/comm/l0/context/scale/numa/numa_ctx_impl.hpp"
-#include "common/comm/l0/context/device_group_ctx.hpp"
-#include "common/comm/l0/context/device_storage.hpp"
-#include "common/comm/l0/topology/ring/device_group_ring_creator.hpp"
-#include "common/comm/l0/device_community_holder_impl.hpp"
-
-#include "common/comm/l0/scheduler/device_group_scheduler.hpp"
-
-namespace native {
-
-std::shared_ptr<device_group_context> device_group_context::create(
-    const ccl::context_comm_addr& comm_addr,
-    const ccl::device_indices_type& group_device_ids,
-    device_storage& devices) {
-    std::shared_ptr<device_group_context> ret(
-        new device_group_context(comm_addr, group_device_ids));
-
-    //TODO More intellectual topology creation required
-    //Ring
-    {
-        device_group_ring_topology top(*ret, devices);
-
-        std::stringstream ss;
-        auto matrix = top.build_p2p_capability_matrix(ss, group_device_ids);
-        ss << "\nMatrix\n" << matrix << std::endl;
-
-        if (!top.build(ss, comm_addr, group_device_ids, matrix)) {
-            LOG_ERROR(
-                "Cannot build DEVICE_GROUP_RING. Devices cannot communicate for current setup!\nBuild log:\n",
-                ss.str());
-            abort();
-        }
-        LOG_DEBUG("Device Group Context for ",
-                  comm_addr.to_string(),
-                  " build RING topology. Log:\n ",
-                  ss.str());
-
-        /*        native::detail::printer<device_group_ring_topology::type()> p;
-        ccl_tuple_for_each(ring_device_topology->get_device_storage(), p);
-        LOG_INFO("Device Group ", context_addr.to_string(), " RING topology:\n", p.to_string());
-*/
-    }
-
-    //A2A
-    {
-        /* TODO
-        auto a2a_device_topology = std::make_shared<device_community<ccl::group_split_type::a2a_device_group>>(context_addr);
-        device_group_a2a_topology top(*this, plain_gpu_comms, ring_device_topology->get_device_storage_ptr());
-        std::stringstream ss;
-        auto matrix = top.build_p2p_capability_matrix(ss, group_device_ids);
-        ss << "\nMatrix\n" << matrix << std::endl;
-        if(!top.build(ss, 0, group_device_ids, matrix))
-        {
-            LOG_ERROR("Cannot build DEVICE_GROUP_RING. Devices cannot communicate for current setup!\nBuild log:\n", ss.str());
-            abort();
-        }
-        LOG_DEBUG("Device Group Context for ", context_addr.to_string(), " build RING topology. Log:\n ", ss.str());
-        native::detail::printer<device_group_ring_topology::type()> p;
-        ccl_tuple_for_each(ring_device_topology->get_device_storage(), p);
-        LOG_INFO("Device Group ", context_addr.to_string(), " RING topology:\n", p.to_string());
-        LOG_INFO("Device Group ", context_addr.to_string(), " A2A topology:\nTODO!");
-        //remember
-        std::get<ccl::device_topology_type::a2a>(device_topology) = a2a_device_topology;
-        */
-    }
-
-    return ret;
-}
-
-device_group_context::device_group_context(const ccl::context_comm_addr& comm_addr,
-                                           const ccl::device_indices_type& group_device_ids)
-        : scaling_context_base(),
-          device_indices(group_device_ids),
-          context_addr(comm_addr) {
-    //scheduler
-    scheduler_impl.reset(new device_group_scheduler);
-}
-
-device_group_context::~device_group_context() {}
-
-const ccl::device_indices_type& device_group_context::get_group_device_indices() const {
-    return device_indices;
-}
-
-device_group_context::scaling_context_base& device_group_context::get_numa_ctx() {
-    return *this;
-}
-const device_group_context::scaling_context_base& device_group_context::get_numa_ctx() const {
-    return *this;
-}
-} // namespace native
diff --git a/src/common/comm/l0/context/device_group_ctx.hpp b/src/common/comm/l0/context/device_group_ctx.hpp
deleted file mode 100644
index f0fa9ddc0..000000000
--- a/src/common/comm/l0/context/device_group_ctx.hpp
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <condition_variable>
-#include <map>
-#include <memory>
-#include <mutex>
-
-#include "oneapi/ccl/types.hpp"
-#include "supported_topologies.hpp"
-#include "common/comm/l0/gpu_comm_attr.hpp"
-#include "common/comm/l0/context/scale/numa/numa_ctx.hpp"
-#include "common/comm/l0/device_community_holder_impl.hpp"
-
-class device_group_router;
-namespace native {
-struct device_storage;
-/*
-template<ccl::group_split_type>
-struct device_community;
-
-template<ccl::group_split_type type>
-using device_community_ptr = std::shared_ptr<device_community<type>>;
-
-template<ccl::group_split_type ...types>
-using device_community_tuple_t = std::tuple<device_community_ptr<types>...>;
-*/
-struct device_group_scheduler;
-/*
-template<ccl::group_split_type,
-         ccl::device_topology_type...>
-struct device_group_community_holder;
-*/
-struct device_group_context : numa_ctx<device_group_context, SUPPORTED_TOPOLOGY_CLASSES_DECL_LIST> {
-    using scaling_context_base =
-        numa_ctx<device_group_context, SUPPORTED_TOPOLOGY_CLASSES_DECL_LIST>;
-
-    friend class device_group_ring_topology;
-
-    static constexpr ccl::group_split_type group_id() {
-        return ccl::group_split_type::thread;
-    }
-
-    using topologies = device_group_community_holder<ccl::group_split_type::thread,
-                                                     SUPPORTED_TOPOLOGY_CLASSES_DECL_LIST>;
-
-    ccl::device_indices_type device_indices;
-    topologies device_topology;
-
-    template <ccl::device_topology_type class_id>
-    typename std::tuple_element<class_id, typename topologies::device_topologies_t>::type&
-    get_group_topology() {
-        return device_topology.get_community<class_id>();
-    }
-
-    ~device_group_context();
-
-    static std::shared_ptr<device_group_context> create(
-        const ccl::context_comm_addr& comm_addr,
-        const ccl::device_indices_type& group_device_ids,
-        device_storage& devices);
-    const ccl::device_indices_type& get_group_device_indices() const;
-
-    ccl::context_comm_addr context_addr;
-    std::unique_ptr<device_group_scheduler> scheduler_impl;
-
-    scaling_context_base& get_numa_ctx();
-    const scaling_context_base& get_numa_ctx() const;
-
-private:
-    device_group_context(const ccl::context_comm_addr& comm_addr,
-                         const ccl::device_indices_type& device_mask);
-};
-} // namespace native
diff --git a/src/common/comm/l0/context/device_storage.cpp b/src/common/comm/l0/context/device_storage.cpp
deleted file mode 100644
index de7b214bc..000000000
--- a/src/common/comm/l0/context/device_storage.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "common/comm/l0/devices/devices_declaration.hpp"
-#include "common/comm/l0/context/device_storage.hpp"
-#include "native_device_api/compiler_ccl_wrappers_dispatcher.hpp"
-namespace native {
-
-std::shared_ptr<specific_plain_device_storage> device_storage::create_devices_by_indices(
-    size_t thread_id,
-    const ccl::device_indices_type& indices) {
-    std::shared_ptr<specific_plain_device_storage> out_devices =
-        std::make_shared<specific_plain_device_storage>();
-    size_t index_in_group = 0;
-    for (const auto& idx : indices) {
-        LOG_DEBUG("Assign device by id: ", idx, " from group size: ", indices.size());
-
-        try {
-            ccl_device_driver::device_ptr runtime_device = get_runtime_device(idx);
-            if (!runtime_device) {
-                throw std::runtime_error(std::string("Cannot find device by id: ") +
-                                         ccl::to_string(idx));
-            }
-
-            // find index in real devices at first
-            device_container<ccl_gpu_comm>& real_devices =
-                ccl_tuple_get<device_container<ccl_gpu_comm>>(gpu_device_storage);
-            auto real_it = real_devices.find(runtime_device->handle);
-            if (real_it == real_devices.end()) {
-                // first time requested device, mark it as real
-                std::get<ccl_gpu_comm::type_idx()>(*out_devices)
-                    .push_back(create_gpu_device<ccl_gpu_comm>(*runtime_device, index_in_group++));
-            }
-            else {
-                // real device wrapper created already, make virtual wrapper
-                auto& real = real_it->second;
-                std::get<ccl_virtual_gpu_comm::type_idx()>(*out_devices)
-                    .push_back(create_gpu_device<ccl_virtual_gpu_comm>(
-                        real->get_device(), index_in_group++, *real));
-            }
-        }
-        catch (const std::exception& ex) {
-            LOG_ERROR("Cannot create device: ", ex.what());
-            assert(false && "device_storage::create_devices_by_indices - exception");
-            throw;
-        }
-    }
-
-    // remember in exclusive threads ownership
-    bool inserted = thread_gpu_comms.insert({ thread_id, out_devices }).second;
-    if (!inserted) {
-        abort(); // TODO consider use-case
-    }
-    return out_devices;
-}
-
-size_t device_storage::get_storage_size() const {
-    return detail::get_aggregated_size<specific_device_storage, SUPPORTED_DEVICES_DECL_LIST>(
-        gpu_device_storage); /*
-        return get_size<ccl_gpu_comm>() +
-               get_size<ccl_ipc_gpu_comm>() +
-               get_size<ccl_virtual_gpu_comm>() +
-               get_size<ccl_thread_comm<ccl_gpu_comm>>() +
-               get_size<ccl_thread_comm<ccl_virtual_gpu_comm>>() +
-               get_size<ccl_ipc_source_gpu_comm<ccl_gpu_comm>>() +
-               get_size<ccl_ipc_source_gpu_comm<ccl_virtual_gpu_comm>>() +
-               get_size<ccl_ipc_gpu_comm>();
-*/
-}
-/*
-    template<class DeviceType, class ...Types>
-    size_t get_aggregated_size() const
-    {
-        return get_size<DeviceType>() + detail::get_aggregated_size_helper<Types...>(gpu_device_storage);
-    }
-*/
-} // namespace native
diff --git a/src/common/comm/l0/context/device_storage.hpp b/src/common/comm/l0/context/device_storage.hpp
deleted file mode 100644
index e1363f884..000000000
--- a/src/common/comm/l0/context/device_storage.hpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <mutex>
-#include "common/utils/spinlock.hpp"
-#include "common/comm/l0/gpu_device_types.hpp"
-
-namespace native {
-/*
- * Class device_storage:
- * used for typed device wrapper creation during topology construction.
- * It must remember all created device wrappers and must guarantee uniqueness
- * for exclusive device wrappers (REAL devices) and their clones (VIRTUAL devices)
- * Every time when a new device is created it wrapped into REAL deice type,
- * all other request to create the same device must be wrapped into VIRTUAL devices.
- *
- * Guarantee must be applied across threads
- */
-struct device_storage {
-    size_t get_storage_size() const;
-
-    template <class device_t>
-    size_t get_size() const {
-        return ccl_tuple_get<device_container<device_t>>(gpu_device_storage).size();
-    }
-
-    // request to create (or reuse) device wrappers by 'indices' for specific thread 'thread_id'
-    // device_storage will automatically determine wrapper types inside
-    // Result is a shared vector, which is remembered in per-thread storage
-    std::shared_ptr<specific_plain_device_storage> create_devices_by_indices(
-        size_t thread_id,
-        const ccl::device_indices_type& indices);
-
-    // creation specific device type, determined from 'create_devices_by_indices'
-    template <class device_t, class... Args>
-    device_t_ptr<device_t> create_gpu_device(ccl_device& device, size_t ranks, Args&&... args) {
-        //break compiler for 'device_t' constructible check
-        static_assert(std::is_constructible<device_t,
-                                            typename std::add_lvalue_reference<ccl_device>::type,
-                                            size_t,
-                                            Args...>::value,
-                      "Object of class 'device_t' is not constructible from given arguments");
-        std::shared_ptr<device_t> gpu_instance =
-            std::make_shared<device_t>(device, ranks, std::forward<Args>(args)...);
-
-        //put in global storage: to determine device uniqueness and wrapper type
-        auto& gpus = ccl_tuple_get<device_container<device_t>>(gpu_device_storage);
-        gpus.emplace(std::piecewise_construct,
-                     std::forward_as_tuple(device.handle),
-                     std::forward_as_tuple(gpu_instance));
-
-        {
-            // put in indexed storage
-            auto acc = get_node_storage();
-            specific_indexed_device_storage& global_storage = acc.get();
-            indexed_device_container<device_t>& device_cont =
-                ccl_tuple_get<indexed_device_container<device_t>>(global_storage);
-            device_cont.insert({ ranks, gpu_instance });
-        }
-        return gpu_instance;
-    }
-
-    specific_device_storage gpu_device_storage; // wrapper type determine helper storage
-    using thread_plain_device_map =
-        std::map<size_t, std::shared_ptr<specific_plain_device_storage>>;
-    thread_plain_device_map thread_gpu_comms; // devices allocated in exclusive thread ownership
-
-    struct accessor {
-        accessor(ccl_spinlock& mutex, specific_indexed_device_storage& storage)
-                : lock(mutex),
-                  inner_data(storage) {}
-        accessor(accessor&& src) = default;
-        accessor& operator=(accessor&& src) = delete;
-
-        specific_indexed_device_storage& get() {
-            return inner_data;
-        }
-
-    private:
-        std::unique_lock<ccl_spinlock> lock;
-        specific_indexed_device_storage& inner_data;
-    };
-
-    accessor get_node_storage() {
-        return accessor(node_storage_mutex, node_rank_device_storage);
-    }
-
-private:
-    ccl_spinlock node_storage_mutex;
-    specific_indexed_device_storage node_rank_device_storage;
-};
-
-} // namespace native
diff --git a/src/common/comm/l0/context/process_group_ctx.cpp b/src/common/comm/l0/context/process_group_ctx.cpp
deleted file mode 100644
index e30ac8448..000000000
--- a/src/common/comm/l0/context/process_group_ctx.cpp
+++ /dev/null
@@ -1,815 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include <fstream>
-#include <vector>
-#include <sstream>
-#include <iterator>
-#include <set>
-#include <unistd.h>
-#include <limits.h>
-#include <gnu/libc-version.h>
-
-#include "oneapi/ccl.hpp"
-#include "common/comm/l0/devices/devices_declaration.hpp"
-
-#include "common/comm/l0/context/thread_group_ctx.hpp"
-#include "common/comm/l0/context/process_group_ctx.hpp"
-#include "common/comm/l0/device_community_holder_impl.hpp"
-#include "common/comm/l0/topology/ring/cluster_group_device_creator_impl.hpp"
-#include "common/comm/l0/topology/topology_serializer.hpp"
-#include "common/comm/l0/context/device_storage.hpp"
-#include "common/comm/l0/scheduler/thread_group_scheduler.hpp"
-#include "common/comm/l0/scheduler/allied_process_group_scheduler.hpp"
-
-#include "common/comm/host_communicator/host_communicator.hpp"
-#include "common/comm/l0/context/scale/numa/numa_ctx_impl.hpp"
-#include "common/comm/l0/context/scale/scale_up/scale_up_ctx_impl.hpp"
-#include "common/comm/l0/context/scale/scale_out/scale_out_ctx_impl.hpp"
-#include "common/comm/l0/context/scale/ipc/ipc_ctx_impl.hpp"
-
-namespace native {
-
-process_group_context::process_group_context(std::shared_ptr<ccl::host_communicator> comm)
-        : ccl_communicator(comm),
-          thread_group_ctx(new thread_group_context),
-          gpu_device_storage(new device_storage) {
-    if (!ccl_communicator) {
-        LOG_ERROR("Process context need non-empty communicator");
-        throw std::runtime_error("Process context need non-empty communicator");
-    }
-    process_idx = ccl_communicator->rank();
-    process_count = ccl_communicator->size();
-
-    //Get current hostname
-    char hostname[HOST_NAME_MAX];
-    int ret = gethostname(hostname, HOST_NAME_MAX);
-    if (ret == -1 && (errno == ENAMETOOLONG || errno == EINVAL)) {
-        assert(std::string(gnu_get_libc_version()) == "2.2" && "Cannot gethostname");
-        hostname[HOST_NAME_MAX - 1] = '\0';
-        std::cerr << "Hostname truncated: " << hostname << std::endl;
-    }
-    my_host_name = hostname;
-}
-
-process_group_context::~process_group_context() {}
-
-bool process_group_context::delegate_sync(const ccl::device_indices_type& thread_device_indices,
-                                          ccl::context_comm_addr& comm_addr) {
-    // set thread id sequencially
-    //comm_addr.thread_idx = process_device_topology.size();
-
-    // prepare device communities
-    auto& ring_container = process_device_topology[comm_addr.thread_idx]
-                               .get_community<ccl::device_topology_type::ring>();
-    (void)ring_container;
-
-    auto& a2a_container = process_device_topology[comm_addr.thread_idx]
-                              .get_community<ccl::device_topology_type::a2a>();
-    a2a_container.set_topology(
-        std::make_shared<device_community<ccl::device_topology_type::a2a>>(comm_addr));
-
-    // sync all threads at first - blocking operation
-    return thread_group_ctx->sync_barrier(thread_device_indices, comm_addr, *gpu_device_storage);
-}
-
-bool process_group_context::sync_barrier(const ccl::device_mask_t& thread_device_mask,
-                                         ccl::context_comm_addr& comm_addr) {
-    return sync_barrier(ccl_device_driver::get_device_indices(thread_device_mask), comm_addr);
-}
-
-bool process_group_context::sync_barrier(const ccl::device_indices_type& thread_device_indices,
-                                         ccl::context_comm_addr& comm_addr) {
-    // sync all threads at first - blocking operation
-    if (!delegate_sync(thread_device_indices, comm_addr)) {
-        return false;
-    }
-
-    //barrie mutex is locked by MASTER thread
-    const ccl::process_device_indices_type& thread_indices =
-        thread_group_ctx->get_thread_group_device_indices();
-
-    LOG_DEBUG("Process (",
-              process_idx,
-              "/",
-              process_count,
-              ") reached process group communicator barrier");
-
-    ccl::device_indices_type process_aggregated_device_indices =
-        std::accumulate(thread_indices.begin(),
-                        thread_indices.end(),
-                        ccl::device_indices_type(),
-                        [](ccl::device_indices_type& partial_indices,
-                           const typename ccl::process_device_indices_type::value_type& val) {
-                            partial_indices.insert(val.second.begin(), val.second.end());
-                            return partial_indices;
-                        });
-    build_cluster_affinity_table(process_aggregated_device_indices);
-
-    //iterate over allied processes(on the same host)
-    //find possible IPC device with P2P capability
-    LOG_DEBUG(
-        "Process (", process_idx, "/", process_count, ") starts hardware topologies creation");
-
-    /* TODO -S- enable it later
-    cluster_group_device_creator ally_process_topology(
-        process_idx, process_count, *this, *gpu_device_storage);
-    */
-    {
-        LOG_DEBUG("TODO - Limitation on node processes considered!!!\n"
-                  "process_idx: ",
-                  process_idx,
-                  ", process_count: ",
-                  process_count,
-                  ", cluster_device_rank_offset: ",
-                  cluster_device_rank_offset,
-                  ", cluster_device_size: ",
-                  cluster_device_size);
-        //TODO -S- Temporary solution for IPC topology
-        allied_process_group_ring_topology ally_process_topology(process_idx,
-                                                                 process_count,
-                                                                 *this,
-                                                                 *gpu_device_storage,
-                                                                 cluster_device_rank_offset,
-                                                                 cluster_device_size,
-                                                                 comm_addr);
-
-        const ccl::process_device_indices_type& node_mask = get_node_afinity_indices(get_host_id());
-
-        std::stringstream ss;
-        detail::adjacency_matrix p2p_dependency_graph =
-            ally_process_topology.build_p2p_capability_matrix(ss, node_mask);
-        ss << "\nMatrix\n" << p2p_dependency_graph << std::endl;
-
-        if (!ally_process_topology.build_all(
-                ss, thread_group_ctx->get_thread_group_device_indices(), p2p_dependency_graph)) {
-            LOG_ERROR(
-                ss.str(), "\nCannot build cluster global ring! Abort. Build Log:\n", ss.str());
-            abort();
-        }
-
-        LOG_DEBUG("Build cluster global ring successfully. Log:\n", ss.str());
-    }
-
-    {
-        //TODO Create A2A topology
-        LOG_DEBUG("Process Context Topologies A2A TODO");
-    }
-
-    // create scheduler
-    LOG_DEBUG("Create scheduler");
-    scheduler_impl.reset(new allied_process_group_scheduler(
-        process_count, comm_addr.thread_count, ccl_communicator, *gpu_device_storage));
-
-    // initialize observer contexts
-    LOG_DEBUG("Sync communicator barrier");
-    ccl_communicator->barrier({}, ccl::default_barrier_attr);
-
-    LOG_DEBUG("initialize IPC context");
-    get_ipc_ctx().initialize_ctx(ccl_communicator);
-
-    LOG_DEBUG("initialize SCALE-OUT context");
-    get_scaleout_ctx().initialize_ctx(ccl_communicator);
-
-    // dump topology
-    std::stringstream out;
-    dump_process_topologies(out);
-
-    LOG_DEBUG("Thread (MASTER): ",
-              comm_addr.thread_idx,
-              " finalized process topology creation:\n",
-              out.str());
-    return true;
-}
-
-std::shared_ptr<thread_group_context> process_group_context::get_thread_context(size_t process_id) {
-    (void)process_id;
-    return thread_group_ctx;
-}
-/*
-std::shared_ptr<process_group_context::ring_topology>& process_group_context::get_process_ring_topology(size_t process_id, size_t thread_id)
-{
-    (void)process_id;
-    auto per_thread_top = process_ring_topology.find(thread_id);
-    if(per_thread_top == process_ring_topology.end())
-    {
-        LOG_ERROR("No process topologies for ",thread_id, ".Empty topology");
-        static std::shared_ptr<process_group_context::ring_topology> empty;
-        return empty;
-    }
-    return per_thread_top->second;
-}
-*/
-
-std::shared_ptr<ccl::host_communicator> process_group_context::get_communicator() {
-    return ccl_communicator;
-}
-
-bool process_group_context::build_cluster_affinity_table(
-    const ccl::device_indices_type& process_aggregated_device_indices) {
-    LOG_DEBUG("Node: ", my_host_name, " start build affinity table for process idx: ", process_idx);
-
-    //create cluster mask affinity
-    //1) request hostname & device indices count
-    size_t send_hostname_size = my_host_name.size();
-    std::vector<size_t> receive_hostname_sizes(ccl_communicator->size());
-    std::vector<size_t> recv_counts(ccl_communicator->size(), 1);
-
-    size_t send_process_indices_count = process_aggregated_device_indices.size();
-    std::vector<size_t> receive_process_indices_sizes(ccl_communicator->size());
-    std::vector<size_t> recv_process_indices_counts(ccl_communicator->size(), 1);
-
-    constexpr size_t hostname_indices_requests_count = 2;
-    std::vector<ccl::event> requests;
-    requests.reserve(hostname_indices_requests_count);
-    {
-        ccl::stream::impl_value_t empty_stream{};
-        requests.push_back(ccl_communicator->allgatherv_impl(&send_hostname_size,
-                                                             1,
-                                                             receive_hostname_sizes.data(),
-                                                             recv_counts,
-                                                             empty_stream,
-                                                             ccl::default_allgatherv_attr,
-                                                             {}));
-        LOG_TRACE("Request hostname sizes, process (",
-                  ccl_communicator->rank(),
-                  "/",
-                  ccl_communicator->size(),
-                  ") has own hostname: ",
-                  my_host_name,
-                  ", size: ",
-                  send_hostname_size);
-
-        requests.push_back(ccl_communicator->allgatherv_impl(&send_process_indices_count,
-                                                             1,
-                                                             receive_process_indices_sizes.data(),
-                                                             recv_process_indices_counts,
-                                                             empty_stream,
-                                                             ccl::default_allgatherv_attr,
-                                                             {}));
-        LOG_TRACE("Request device indices sizes, process (",
-                  ccl_communicator->rank(),
-                  "/",
-                  ccl_communicator->size(),
-                  ") has own indices count: ",
-                  send_process_indices_count);
-    }
-
-    //wait for completion
-    for (auto& req : requests) {
-        req.wait();
-    }
-
-    size_t total_hostname_size =
-        std::accumulate(receive_hostname_sizes.begin(), receive_hostname_sizes.end(), 0);
-    LOG_DEBUG("Memory required for hostnames size: ", total_hostname_size, " bytes");
-
-    size_t total_device_indices_count = std::accumulate(
-        receive_process_indices_sizes.begin(), receive_process_indices_sizes.end(), 0);
-    LOG_DEBUG("Memory required for device indices size: ", total_device_indices_count, " count");
-
-    //TODO -S- temporary START
-    //calculate rank offset and total device count in cluster
-    {
-        auto my_rank_mask_size_it = receive_process_indices_sizes.begin();
-        std::advance(my_rank_mask_size_it, ccl_communicator->rank());
-        cluster_device_rank_offset =
-            std::accumulate(receive_process_indices_sizes.begin(), my_rank_mask_size_it, 0);
-        cluster_device_size = std::accumulate(
-            my_rank_mask_size_it, receive_process_indices_sizes.end(), cluster_device_rank_offset);
-    }
-    LOG_DEBUG("Process idx: ",
-              ccl_communicator->rank(),
-              ", device rank offset: ",
-              cluster_device_rank_offset,
-              ", total device count: ",
-              cluster_device_size);
-    //TODO -S- temporary END
-
-    //Serialize own devices path data
-    auto serialized_indices = detail::serialize::device_path_serializer::serialize_indices(
-        process_aggregated_device_indices);
-    // TODO assert(serialized_indices.size() == receive_process_indices_sizes[process_idx] && "Indices unexpected count");
-
-    decltype(serialized_indices) affinity_indices;
-    std::vector<char> hostnames;
-    auto indices_count_to_bytes_converter = [](size_t elements) -> size_t {
-        return elements * detail::serialize::device_path_serializable::device_index_size();
-    };
-
-    try {
-        requests.clear();
-        hostnames.resize(total_hostname_size);
-
-        ccl::stream::impl_value_t empty_stream{};
-        requests.push_back(ccl_communicator->allgatherv_impl((int8_t*)my_host_name.data(),
-                                                             send_hostname_size,
-                                                             (int8_t*)hostnames.data(),
-                                                             receive_hostname_sizes,
-                                                             empty_stream,
-                                                             ccl::default_allgatherv_attr,
-                                                             {}));
-        LOG_TRACE("Submit request for hostnames. Process (",
-                  ccl_communicator->rank(),
-                  "/",
-                  ccl_communicator->size(),
-                  ")"
-                  " has own hostname: ",
-                  my_host_name);
-
-        //TODO Reorder requests!
-
-        //need to convert to bytes to satisfy serialized data type
-        affinity_indices.resize(indices_count_to_bytes_converter(total_device_indices_count));
-        std::transform(receive_process_indices_sizes.begin(),
-                       receive_process_indices_sizes.end(),
-                       receive_process_indices_sizes.begin(),
-                       indices_count_to_bytes_converter);
-        requests.push_back(ccl_communicator->allgatherv_impl(
-            reinterpret_cast<const int8_t*>(serialized_indices.data()),
-            serialized_indices.size(),
-            reinterpret_cast<int8_t*>(affinity_indices.data()),
-            receive_process_indices_sizes,
-            empty_stream,
-            ccl::default_allgatherv_attr,
-            {}));
-        LOG_TRACE("Submit request for affinity masks. Process (",
-                  ccl_communicator->rank(),
-                  "/",
-                  ccl_communicator->size(),
-                  ")"
-                  " has own mask size: ",
-                  serialized_indices.size());
-    }
-    catch (std::exception& ex) {
-        LOG_ERROR("Cannot submit requests: ", ex.what());
-        LOG_DEBUG("Memory required for hostnames size: ", total_hostname_size, " bytes");
-        LOG_DEBUG(
-            "Memory required for device indices size: ", total_device_indices_count, " count");
-        abort();
-    }
-
-    //wait for completion
-    for (auto& req : requests) {
-        req.wait();
-    }
-
-    //parse hostnames
-    size_t rank_index = 0;
-    auto name_from_iterator = hostnames.begin();
-    auto affinity_mask_from_iterator = affinity_indices.begin();
-    for (auto rank_hostname_size = receive_hostname_sizes.begin();
-         rank_hostname_size != receive_hostname_sizes.end();
-         ++rank_hostname_size) {
-        //check hostnames
-        if ((size_t)std::distance(name_from_iterator, hostnames.end()) < *rank_hostname_size) {
-            LOG_ERROR("Received hostnames data is too short: ",
-                      hostnames.size(),
-                      " expected: ",
-                      std::distance(name_from_iterator, hostnames.end()) + *rank_hostname_size);
-            abort();
-        }
-
-        //get hostaname
-        std::string hostname(name_from_iterator, name_from_iterator + *rank_hostname_size);
-        //shift hostname data
-        std::advance(name_from_iterator, *rank_hostname_size);
-
-        //check affinity
-        if ((size_t)std::distance(affinity_mask_from_iterator, affinity_indices.end()) <
-            receive_process_indices_sizes[rank_index]) {
-            LOG_ERROR("Received affinity_masks data is too short: ",
-                      affinity_indices.size(),
-                      " expected at least: ",
-                      receive_process_indices_sizes[rank_index]);
-            abort();
-        }
-
-        //get affinity
-        ccl::device_indices_type rank_indices = detail::serialize::device_path_deserializer::
-            deserialize_indices<std::multiset, ccl::device_index_type>(
-                affinity_mask_from_iterator,
-                affinity_mask_from_iterator + receive_process_indices_sizes[rank_index]);
-        std::advance(affinity_mask_from_iterator, receive_process_indices_sizes[rank_index]);
-
-        {
-            std::stringstream ss;
-            for (const auto& path : rank_indices) {
-                ss << path << ", ";
-            }
-            LOG_DEBUG(
-                "Collected hostname: ", hostname, ", rank: ", rank_index, ", affinity: ", ss.str());
-        }
-
-        //fill global mask
-        set_node_afinity_indices(hostname, rank_index, rank_indices);
-        LOG_DEBUG("Global affinity mask nodes count: ", cluster_gpu_indices.size());
-        //next
-        rank_index++;
-    }
-
-    {
-        std::stringstream ss;
-        process_group_context::dump_cluster_affinity_indices(cluster_gpu_indices, ss);
-        LOG_DEBUG("Cluster device affinity indices table: ", ss.str());
-    }
-
-    return true;
-}
-
-const ccl::host_id process_group_context::get_host_id() const {
-    return my_host_name;
-}
-
-const ccl::cluster_aggregated_device_mask_t& process_group_context::get_afinity_mask() const {
-    return global_mask;
-}
-const ccl::cluster_device_indices_type& process_group_context::get_affinity_indices() const {
-    return cluster_gpu_indices;
-}
-
-const ccl::process_aggregated_device_mask_t& process_group_context::get_node_afinity_mask(
-    const ccl::host_id& host) const {
-    auto it = global_mask.find(host);
-    if (it == global_mask.end()) {
-        LOG_ERROR("Cannot get affinity mask for node: ", host);
-        static const ccl::process_aggregated_device_mask_t empty;
-        return empty;
-    }
-    return it->second;
-}
-
-const ccl::process_device_indices_type& process_group_context::get_node_afinity_indices(
-    const ccl::host_id& host) const {
-    auto it = cluster_gpu_indices.find(host);
-    if (it == cluster_gpu_indices.end()) {
-        LOG_ERROR("Cannot get affinity indices for node: ", host);
-        static const ccl::process_device_indices_type empty;
-        return empty;
-    }
-    return it->second;
-}
-
-void process_group_context::set_node_afinity_indices(const ccl::host_id& host,
-                                                     int rank_id,
-                                                     const ccl::device_indices_type& indices) {
-    /*
-    ccl::device_mask_t rank_mask = ccl_device_driver::get_device_mask(indices);
-    auto& per_host_mask = global_mask[host];
-    auto process_it = per_host_mask.find(rank_id);
-    if(process_it != per_host_mask.end())
-    {
-        LOG_DEBUG("Current host rank received");
-        CCL_ASSERT(process_it->first == process_idx, "Self consistency rank id check failed");
-        CCL_ASSERT(process_it->second == rank_mask, "Self consistency mask check failed");
-    }
-    else
-    {
-        LOG_DEBUG("Hostname: ", host, ", updated rank: ", rank_id, ", affinity: ", rank_mask.to_string());
-        per_host_mask[rank_id] = rank_mask;
-    }
-*/
-    //TODO for indices
-    auto& per_host_indices = cluster_gpu_indices[host];
-    auto process_ind_it = per_host_indices.find(rank_id);
-    if (process_ind_it != per_host_indices.end()) {
-        LOG_DEBUG("Current host rank received");
-        CCL_ASSERT(process_ind_it->first == process_idx, "Self consistency rank id check failed");
-        CCL_ASSERT(process_ind_it->second == indices, "Self consistency indices check failed");
-    }
-    else {
-        LOG_DEBUG(
-            "Hostname: ", host, ", updated rank: ", rank_id, ", affinity size: ", indices.size());
-        per_host_indices[rank_id] = indices;
-    }
-}
-
-device_storage& process_group_context::get_device_storage() {
-    CCL_ASSERT(gpu_device_storage, "Device storage must exist");
-    return *gpu_device_storage;
-}
-
-/*
-std::tuple<bool, std::string> process_group_context::check_device_mask_validity_across_allied_processes(ccl::process_aggregated_device_mask_t& allied_processes_mask)
-{
-    std::string descr;
-    //temporary indices collection
-    std::multiset<typename indices::value_type> expected_dupliated_indices;
-    //fill duplicated devices indices across allied processes(on the same host)
-    for(const auto& proc_mask : allied_processes_mask)
-    {
-        //user merge in c++17
-        indices tmp = ccl_device_driver::get_device_indices(proc_mask.second);
-        expected_dupliated_indices.insert(tmp.begin(), tmp.end());
-    }
-    //find duplicates
-    indices duplicates;
-    for(auto it = expected_dupliated_indices.begin(); it != expected_dupliated_indices.end(); ++it)
-    {
-        auto cnt = expected_dupliated_indices.count(*it);
-        if(cnt != 1) //not unique device index across processes
-        {
-            duplicates.insert(*it);
-        }
-    }
-    bool ret = true;
-    if(!duplicates.empty())
-    {
-        ret = false;
-        std::stringstream ss;
-        ss << "Duplicated device ids: ";
-        std::copy(duplicates.begin(), duplicates.end(), std::ostream_iterator<typename indices::value_type>(ss, ", "));
-        descr = ss.str();
-    }
-    return { ret, descr };
-}
-*/
-
-void process_group_context::dump_cluster_affinity_indices(
-    const ccl::cluster_device_indices_type& indices,
-    std::ostream& out) {
-    out << "Cluster nodes: " << indices.size() << "\n";
-    for (const auto& node_indices : indices) {
-        dump_node_aggregated_indices(node_indices.first, node_indices.second, out);
-        out << std::endl;
-    }
-}
-
-void process_group_context::dump_node_aggregated_mask(
-    const std::string& node_name,
-    const ccl::process_aggregated_device_mask_t& mask,
-    std::ostream& out) {
-    out << "Node: " << node_name << ", processes: " << mask.size() << "\n";
-    for (const auto& proc_mask : mask) {
-        dump_process_mask(proc_mask.first, proc_mask.second, out);
-        out << std::endl;
-    }
-}
-void process_group_context::dump_node_aggregated_indices(
-    const std::string& node_name,
-    const ccl::process_device_indices_type& indices,
-    std::ostream& out) {
-    if (!node_name.empty()) {
-        out << "Node: " << node_name << ", processes: " << indices.size() << "\n";
-    }
-    else {
-        out << "Processes: " << indices.size() << "\n";
-    }
-
-    for (const auto& proc_idxs : indices) {
-        dump_process_indices(proc_idxs.first, proc_idxs.second, out);
-        out << std::endl;
-    }
-}
-
-void process_group_context::dump_process_mask(size_t process_id,
-                                              const ccl::device_mask_t& mask,
-                                              std::ostream& out) {
-    out << "Process idx: " << process_id << ", affinity: " << mask.to_string();
-}
-
-void process_group_context::dump_process_indices(size_t process_id,
-                                                 const ccl::device_indices_type& indices,
-                                                 std::ostream& out) {
-    out << "Process idx: " << process_id << ", affinity: ";
-    for (const auto& path : indices) {
-        out << path << ", ";
-    }
-}
-
-std::string process_group_context::to_string() const {
-    auto my_processes_it = global_mask.find(my_host_name);
-    CCL_ASSERT(my_processes_it == global_mask.end(), "global mask is inconsistend!");
-
-    std::stringstream out;
-    out << "My info:\nHost: " << my_host_name << ", processes: " << my_processes_it->second.size();
-    process_group_context::dump_cluster_affinity_mask(global_mask, out);
-    return out.str();
-}
-
-void process_group_context::dump_cluster_affinity_mask(
-    const ccl::cluster_aggregated_device_mask_t& mask,
-    std::ostream& out) {
-    out << "Cluster nodes: " << mask.size() << "\n";
-    for (const auto& node_mask : mask) {
-        dump_node_aggregated_mask(node_mask.first, node_mask.second, out);
-        out << std::endl;
-    }
-}
-
-void process_group_context::dump_process_topologies(std::ostream& out) const {
-    out << "Process threads count: " << process_device_topology.size() << std::endl;
-    for (auto it = process_device_topology.begin(); it != process_device_topology.end(); ++it) {
-        const auto& top = it->second;
-        size_t thread = it->first;
-
-        out << "\nProcess Thread Group: " << thread << " topology:\n" << top.to_string();
-    }
-}
-
-std::vector<ccl::device_indices_type> process_group_context::get_ipc_device_indices() const {
-    std::stringstream ss;
-    ccl::process_device_indices_type node_mask_to_reorder = get_node_afinity_indices(get_host_id());
-    if (node_mask_to_reorder.empty()) {
-        ss << "process_group_context::get_ipc_device_indices failed: empty process affinities for hostname: "
-           << get_host_id() << ", cluster topology:\n";
-        process_group_context::dump_cluster_affinity_indices(cluster_gpu_indices, ss);
-        const std::string& err = ss.str();
-        LOG_ERROR("Error in ", err);
-        throw std::runtime_error(err);
-    }
-
-    std::vector<ccl::device_indices_type> ipc_device_indices;
-    try {
-        ipc_device_indices =
-            process_group_context::get_ipc_device_indices_for_id(process_idx, node_mask_to_reorder);
-    }
-    catch (const std::exception& ex) {
-        ss << ex.what() << ", cluster topology:\n";
-        process_group_context::dump_cluster_affinity_indices(cluster_gpu_indices, ss);
-        const std::string& err = ss.str();
-        LOG_ERROR("Error in ", err);
-        throw;
-    }
-    return ipc_device_indices;
-}
-
-std::vector<ccl::device_indices_type> process_group_context::get_ipc_device_indices_for_id(
-    size_t process_idx,
-    ccl::process_device_indices_type node_indices) {
-    std::stringstream ss;
-    auto my_process_it = node_indices.find(process_idx);
-    if (my_process_it == node_indices.end()) {
-        ss << "No process id: " << process_idx << " in node affinities: ";
-        process_group_context::dump_node_aggregated_indices("", node_indices, ss);
-        const std::string& err = ss.str();
-        LOG_ERROR(err);
-        throw std::runtime_error(err);
-    }
-
-    node_indices.erase(my_process_it); //self indices erase, other are ipc
-
-    std::vector<ccl::device_indices_type> ipc_device_indices;
-    for (const auto& mask : node_indices) {
-        ipc_device_indices.push_back(mask.second);
-    }
-    return ipc_device_indices;
-}
-
-void process_group_context::collect_cluster_colored_plain_graphs(
-    const detail::colored_plain_graph_list& send_graph,
-    detail::global_sorted_colored_plain_graphs& out_global_graphs) {
-    using namespace detail::serialize;
-
-    LOG_DEBUG("Collect cluster colored plain graphs, process initiator: ",
-              process_idx,
-              ", graphs count: ",
-              detail::to_string(send_graph));
-
-    // serialize current process graph list into bytes
-    device_path_serializable::raw_data_t my_serialized_graph =
-        device_path_serializer::serialize_indices(send_graph);
-
-    size_t send_count = my_serialized_graph.size();
-    std::vector<size_t> recv_counts_process_graph_sizes(ccl_communicator->size());
-    {
-        // collect graph lists size from cluster
-        std::vector<size_t> recv_counts(ccl_communicator->size(), 1);
-
-        LOG_DEBUG("Send graph lists size by process index: ",
-                  process_idx,
-                  ", serialized size: ",
-                  send_count);
-        ccl::stream::impl_value_t empty_stream{};
-        ccl_communicator
-            ->allgatherv_impl(&send_count,
-                              1,
-                              recv_counts_process_graph_sizes.data(),
-                              recv_counts,
-                              empty_stream,
-                              ccl::default_allgatherv_attr,
-                              {})
-            .wait();
-    }
-
-    size_t global_graph_data_size = std::accumulate(
-        recv_counts_process_graph_sizes.begin(), recv_counts_process_graph_sizes.end(), 0);
-
-    // collect cluster graph lists
-    device_path_serializable::raw_data_t recv_cluster_graphs;
-    try {
-        LOG_DEBUG(
-            "Send graph list by process index: ", process_idx, ", serialized size: ", send_count);
-
-        recv_cluster_graphs.resize(global_graph_data_size);
-        ccl::stream::impl_value_t empty_stream{};
-        ccl_communicator
-            ->allgatherv_impl(reinterpret_cast<int8_t*>(my_serialized_graph.data()),
-                              send_count,
-                              reinterpret_cast<int8_t*>(recv_cluster_graphs.data()),
-                              recv_counts_process_graph_sizes,
-                              empty_stream,
-                              ccl::default_allgatherv_attr,
-                              {})
-            .wait();
-    }
-    catch (const std::bad_alloc& ex) {
-        CCL_THROW_WITH_ERROR("Memory required for global_graph_data_size size: ",
-                             global_graph_data_size,
-                             " bytes\nException: ",
-                             ex.what());
-    }
-    catch (const std::exception& ex) {
-        CCL_THROW_WITH_ERROR("Cannot submit global-serialized-graph requests: ", ex.what());
-    }
-
-    size_t deserialized_bytes = 0;
-    size_t offset_bytes = 0;
-    size_t process_num = 0;
-
-    LOG_DEBUG("Deserialize recv_cluster_graphs");
-    try {
-        for (process_num = 0; process_num < static_cast<size_t>(ccl_communicator->size());
-             process_num++) {
-            detail::colored_plain_graph_list graph =
-                device_path_deserializer::deserialize_colored_graph_list_indices(
-                    recv_cluster_graphs, deserialized_bytes, offset_bytes);
-            LOG_DEBUG("Recevice process index: ",
-                      process_num,
-                      ", deserialized bytes: ",
-                      deserialized_bytes,
-                      ", by offset: ",
-                      offset_bytes,
-                      ", got partial graph: ",
-                      detail::to_string(graph));
-
-            offset_bytes += deserialized_bytes;
-            out_global_graphs.emplace(process_num, std::move(graph));
-        }
-    }
-    catch (const std::bad_alloc& ex) {
-        CCL_THROW_WITH_ERROR("Cannot deserialize recv_cluster_graphs for process num:",
-                             process_num,
-                             ", deserialized raw bytes: ",
-                             deserialized_bytes,
-                             ", processed raw bytes: ",
-                             offset_bytes,
-                             " \nException: ",
-                             ex.what());
-    }
-    catch (const std::exception& ex) {
-        CCL_THROW_WITH_ERROR("Cannot deserialize recv_cluster_graphs for process num:",
-                             process_num,
-                             ", deserialized raw bytes: ",
-                             deserialized_bytes,
-                             ", processed raw bytes: ",
-                             offset_bytes,
-                             " \nException: ",
-                             ex.what());
-    }
-
-    LOG_DEBUG("Global colored_graph deserialized on process id: ",
-              process_idx,
-              ". Graphs:\n",
-              detail::to_string(out_global_graphs));
-}
-
-process_group_context::numa_context_base& process_group_context::get_numa_ctx() {
-    return *this;
-}
-const process_group_context::numa_context_base& process_group_context::get_numa_ctx() const {
-    return *this;
-}
-process_group_context::scaleup_context_base& process_group_context::get_scaleup_ctx() {
-    return *this;
-}
-const process_group_context::scaleup_context_base& process_group_context::get_scaleup_ctx() const {
-    return *this;
-}
-process_group_context::scaleout_context_base& process_group_context::get_scaleout_ctx() {
-    return *this;
-}
-const process_group_context::scaleout_context_base& process_group_context::get_scaleout_ctx()
-    const {
-    return *this;
-}
-
-process_group_context::ipc_context_base& process_group_context::get_ipc_ctx() {
-    return *this;
-}
-const process_group_context::ipc_context_base& process_group_context::get_ipc_ctx() const {
-    return *this;
-}
-} // namespace native
diff --git a/src/common/comm/l0/context/process_group_ctx.hpp b/src/common/comm/l0/context/process_group_ctx.hpp
deleted file mode 100644
index 15791f7b2..000000000
--- a/src/common/comm/l0/context/process_group_ctx.hpp
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/context/thread_group_ctx.hpp"
-#include "common/comm/l0/context/scale/ipc/ipc_ctx.hpp"
-#include "common/comm/l0/context/scale/numa/numa_ctx.hpp"
-#include "common/comm/l0/context/scale/scale_up/scale_up_ctx.hpp"
-#include "common/comm/l0/context/scale/scale_out/scale_out_ctx.hpp"
-
-#include "common/comm/l0/context/scale/scaling_context_dispatcher.hpp"
-#include "common/comm/l0/topology/topology_declarations.hpp"
-namespace ccl {
-class host_communicator;
-}
-
-namespace native {
-struct device_storage;
-
-struct allied_process_group_scheduler;
-
-//TODO separate class on two: context & process device requestor
-struct process_group_context
-        : scaling_ctx_dispatcher<
-              numa_ctx<process_group_context, SUPPORTED_TOPOLOGY_CLASSES_DECL_LIST>,
-              scale_up_ctx<process_group_context, SUPPORTED_TOPOLOGY_CLASSES_DECL_LIST>,
-              scale_out_ctx<process_group_context, SUPPORTED_TOPOLOGY_CLASSES_DECL_LIST>,
-              ipc_ctx<process_group_context, SUPPORTED_TOPOLOGY_CLASSES_DECL_LIST>> {
-    using numa_context_base = numa_ctx<process_group_context, SUPPORTED_TOPOLOGY_CLASSES_DECL_LIST>;
-    using scaleup_context_base =
-        scale_up_ctx<process_group_context, SUPPORTED_TOPOLOGY_CLASSES_DECL_LIST>;
-    using scaleout_context_base =
-        scale_out_ctx<process_group_context, SUPPORTED_TOPOLOGY_CLASSES_DECL_LIST>;
-    using ipc_context_base = ipc_ctx<process_group_context, SUPPORTED_TOPOLOGY_CLASSES_DECL_LIST>;
-
-    friend class device_group_ring_topology;
-    friend class thread_group_ring_topology;
-    friend class cluster_group_device_creator;
-    friend class allied_process_group_ring_topology;
-
-    static constexpr ccl::group_split_type group_id() {
-        return ccl::group_split_type::cluster;
-    }
-
-    using topologies = device_group_community_holder<ccl::group_split_type::cluster,
-                                                     SUPPORTED_TOPOLOGY_CLASSES_DECL_LIST>;
-    using topologies_storage = std::map<size_t, topologies>;
-
-    process_group_context(std::shared_ptr<ccl::host_communicator> communicator);
-    virtual //TODO use stub
-        ~process_group_context();
-
-    bool sync_barrier(const ccl::device_indices_type& thread_device_indices,
-                      ccl::context_comm_addr& comm_addr);
-    bool sync_barrier(const ccl::device_mask_t& thread_device_mask,
-                      ccl::context_comm_addr& comm_addr);
-
-    std::shared_ptr<thread_group_context> get_thread_context(size_t process_id);
-
-    template <ccl::device_topology_type class_id>
-    typename std::tuple_element<class_id, typename topologies::device_topologies_t>::type&
-    get_process_topology(size_t process_id, size_t thread_id) {
-        auto it = process_device_topology.find(thread_id);
-        if (it == process_device_topology.end()) {
-            LOG_ERROR("Cannot find device group for process: ",
-                      process_id,
-                      ", thread: ",
-                      thread_id,
-                      ". Empty topology");
-            static
-                typename std::tuple_element<class_id,
-                                            typename topologies::device_topologies_t>::type empty;
-            return empty;
-        }
-        return it->second.get_community<class_id>();
-    }
-
-    const ccl::cluster_aggregated_device_mask_t& get_afinity_mask() const;
-    const ccl::cluster_device_indices_type& get_affinity_indices() const;
-
-    const ccl::process_aggregated_device_mask_t& get_node_afinity_mask(
-        const ccl::host_id& host) const;
-    const ccl::process_device_indices_type& get_node_afinity_indices(
-        const ccl::host_id& host) const;
-
-    void set_node_afinity_indices(const ccl::host_id& host,
-                                  int rank_id,
-                                  const ccl::device_indices_type& indices);
-
-    const ccl::host_id get_host_id() const;
-
-    std::string to_string() const;
-    device_storage& get_device_storage();
-    std::vector<ccl::device_indices_type> get_ipc_device_indices() const;
-    static std::vector<ccl::device_indices_type> get_ipc_device_indices_for_id(
-        size_t process_idx,
-        ccl::process_device_indices_type node_indices);
-
-    static void dump_cluster_affinity_mask(const ccl::cluster_aggregated_device_mask_t& mask,
-                                           std::ostream& out);
-    static void dump_node_aggregated_mask(const std::string& node_name,
-                                          const ccl::process_aggregated_device_mask_t& mask,
-                                          std::ostream& out);
-    static void dump_process_mask(size_t process_id,
-                                  const ccl::device_mask_t& mask,
-                                  std::ostream& out);
-
-    static void dump_cluster_affinity_indices(const ccl::cluster_device_indices_type& mask,
-                                              std::ostream& out);
-    static void dump_node_aggregated_indices(const std::string& node_name,
-                                             const ccl::process_device_indices_type& mask,
-                                             std::ostream& out);
-    static void dump_process_indices(size_t process_id,
-                                     const ccl::device_indices_type& mask,
-                                     std::ostream& out);
-
-    void dump_process_topologies(std::ostream& out) const;
-    std::unique_ptr<allied_process_group_scheduler> scheduler_impl;
-
-    numa_context_base& get_numa_ctx();
-    const numa_context_base& get_numa_ctx() const;
-    scaleup_context_base& get_scaleup_ctx();
-    const scaleup_context_base& get_scaleup_ctx() const;
-    scaleout_context_base& get_scaleout_ctx();
-    const scaleout_context_base& get_scaleout_ctx() const;
-    ipc_context_base& get_ipc_ctx();
-    const ipc_context_base& get_ipc_ctx() const;
-
-    virtual /*TODO use stub*/
-        void
-        collect_cluster_colored_plain_graphs(
-            const detail::colored_plain_graph_list& send_graph,
-            detail::global_sorted_colored_plain_graphs& received_graphs);
-
-private:
-    bool delegate_sync(const ccl::device_indices_type& thread_device_indices,
-                       ccl::context_comm_addr& comm_addr);
-    bool build_cluster_affinity_table(
-        const ccl::device_indices_type& process_aggregated_device_indices);
-
-    std::shared_ptr<ccl::host_communicator> get_communicator();
-
-    std::shared_ptr<ccl::host_communicator> ccl_communicator;
-    std::shared_ptr<thread_group_context> thread_group_ctx;
-    ccl::host_id my_host_name;
-    ccl::cluster_aggregated_device_mask_t global_mask;
-    ccl::cluster_device_indices_type cluster_gpu_indices;
-
-    //TODO -S- temporary START
-    size_t cluster_device_rank_offset;
-    size_t cluster_device_size;
-    //TODO -S- temporary END
-
-    std::unique_ptr<device_storage> gpu_device_storage;
-    topologies_storage process_device_topology;
-
-    size_t process_idx; //cached
-    size_t process_count; //cached
-};
-} // namespace native
diff --git a/src/common/comm/l0/context/scale/base/base_session.cpp b/src/common/comm/l0/context/scale/base/base_session.cpp
deleted file mode 100644
index f19ba40ca..000000000
--- a/src/common/comm/l0/context/scale/base/base_session.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "oneapi/ccl/native_device_api/l0/base_impl.hpp"
-#include "oneapi/ccl/native_device_api/l0/primitives.hpp"
-#include "oneapi/ccl/native_device_api/l0/primitives_impl.hpp"
-
-#include "common/comm/l0/context/scale/base/base_session.hpp"
-
-namespace native {
-namespace observer {
-
-void context_descr::init_host_dev_fields() {
-    host_mem_producer = nullptr;
-    host_mem_producer_counter = nullptr;
-    host_consumed_bytes = 0;
-    host_expected_bytes = 0;
-
-    dev_mem_consumer = nullptr;
-    dev_mem_consumer_counter = nullptr;
-    device_produced_bytes = 0;
-}
-
-void context_descr::init(size_t staged_buffer_elem_count,
-                         size_t observer_domain_index,
-                         size_t observer_domain_count,
-                         std::shared_ptr<ccl_context>& context,
-                         ccl_device& device) {
-    // set all fields by 0
-    init_host_dev_fields();
-
-    /* HOST */
-    // create staged mem in host context (Host memory allocation descriptor)
-    ze_host_mem_alloc_desc_t host_descr = ccl_context::get_default_host_alloc_desc();
-    host_descr.flags = ZE_HOST_MEM_ALLOC_FLAG_BIAS_UNCACHED;
-
-    // host mem buf
-    host_mem_producer = context->template alloc_memory<uint8_t>(
-        staged_buffer_elem_count * ccl::get_datatype_size(kernel_params.get_datatype()),
-        /*TODO use page size*/ ccl::get_datatype_size(kernel_params.get_datatype()),
-        host_descr);
-
-    // create staged mem counter in host context (host mem buf counter)
-    host_mem_producer_counter = context->template alloc_memory<counter_t>(
-        1, /*TODO use page size*/ sizeof(counter_t), host_descr);
-
-    host_expected_bytes =
-        staged_buffer_elem_count * ccl::get_datatype_size(kernel_params.get_datatype());
-
-    /* DEVICE */
-    ze_device_mem_alloc_desc_t mem_descr = ccl_device::get_default_mem_alloc_desc();
-
-    // create total aggregated memory in device context
-    mem_descr.flags = 0;
-    dev_mem_consumer = device.template alloc_memory_ptr<uint8_t>(
-        (staged_buffer_elem_count * observer_domain_count) *
-            ccl::get_datatype_size(kernel_params.get_datatype()),
-        ccl::get_datatype_size(kernel_params.get_datatype()),
-        context,
-        mem_descr);
-
-    // create offset in device context
-    mem_descr.flags = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED;
-    producer_aggregated_memory_offset =
-        device.template alloc_memory_ptr<counter_t>(1, sizeof(counter_t), context, mem_descr);
-
-    // create aggregated counter in device context
-    dev_mem_consumer_counter =
-        device.template alloc_memory_ptr<counter_t>(1, sizeof(counter_t), context, mem_descr);
-
-    /* COUNTERS */
-    reset_counters(observer_domain_index, observer_domain_count);
-}
-
-void context_descr::reset_counters(size_t observer_domain_index, size_t observer_domain_count) {
-    counter_t filled_counter_value = 0;
-
-    host_mem_producer_counter->enqueue_write_sync(&filled_counter_value, 1);
-
-    filled_counter_value = observer_domain_index * host_mem_producer->count();
-
-    producer_aggregated_memory_offset->enqueue_write_sync(&filled_counter_value, 1);
-
-    filled_counter_value = 0;
-    dev_mem_consumer_counter->enqueue_write_sync(&filled_counter_value, 1);
-}
-
-} // namespace observer
-} // namespace native
diff --git a/src/common/comm/l0/context/scale/base/base_session.hpp b/src/common/comm/l0/context/scale/base/base_session.hpp
deleted file mode 100644
index fea9590b7..000000000
--- a/src/common/comm/l0/context/scale/base/base_session.hpp
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <functional>
-#include <string>
-#include <vector>
-
-#include "oneapi/ccl.hpp"
-#include "oneapi/ccl/native_device_api/l0/device.hpp"
-#include "oneapi/ccl/native_device_api/l0/context.hpp"
-
-#include "coll/algorithms/algorithms_enum.hpp"
-#include "common/comm/l0/modules/supported_modules.hpp"
-#include "coll/coll_param.hpp"
-
-namespace native {
-namespace observer {
-using counter_t = uint64_t;
-
-struct producer_description {
-    size_t rank;
-    size_t comm_size;
-    counter_t staged_buffer_elem_count;
-
-    std::shared_ptr<ccl_context> context;
-    ccl_device& device;
-    ccl_device::device_cmd_list immediate_list; //TODO make persisten
-};
-
-struct context_descr {
-    context_descr(const coll_param_gpu& kernel_params) : kernel_params(kernel_params) {}
-
-    using host_mem_ptr_t = ccl_context::host_memory_ptr<uint8_t>;
-    using host_mem_ptr_cntr_t = ccl_context::host_memory_ptr<counter_t>;
-    using dev_mem_ptr_t = ccl_device::device_memory_ptr<uint8_t>;
-    using dev_mem_ptr_cntr_t = ccl_device::device_memory_ptr<counter_t>;
-
-    // produced by kernel
-    host_mem_ptr_t host_mem_producer;
-    host_mem_ptr_cntr_t host_mem_producer_counter;
-    size_t host_consumed_bytes;
-    size_t host_expected_bytes;
-
-    // consumed by kernel
-    dev_mem_ptr_t dev_mem_consumer;
-    dev_mem_ptr_cntr_t dev_mem_consumer_counter;
-    size_t device_produced_bytes;
-
-    // (TODO consider using 'recv_buff' from collective entry)
-    // to reduce copy iterations
-    // TODO: rename
-    dev_mem_ptr_cntr_t producer_aggregated_memory_offset;
-
-    void init_host_dev_fields();
-
-    void init(size_t staged_buffer_elem_count,
-              size_t observer_domain_index,
-              size_t observer_domain_count,
-              std::shared_ptr<ccl_context>& context,
-              ccl_device& device);
-
-    void reset_counters(size_t observer_domain_index, size_t observer_domain_count);
-
-private:
-    // TODO: can we guarantee that this object is not destroyed before invoke_params and
-    // use const& here?
-    coll_param_gpu kernel_params;
-};
-
-template <ccl_coll_type coll_type>
-struct invoke_params {
-    static constexpr ccl_coll_type get_coll_type() {
-        return coll_type;
-    }
-
-    invoke_params(producer_description&& in_producer_params, const coll_param_gpu& kernel_params)
-            : in_params(std::move(in_producer_params)),
-              kernel_params(kernel_params),
-              out_params(kernel_params),
-              valid(false) {}
-
-    void set_out_params(const context_descr& src) {
-        out_params = src;
-        valid = true;
-    }
-
-    bool is_valid() const {
-        return valid;
-    }
-
-    const producer_description& get_producer_params() const {
-        return in_params;
-    }
-
-    producer_description& get_producer_params() {
-        return in_params;
-    }
-
-    const coll_param_gpu& get_kernel_params() const {
-        return kernel_params;
-    }
-
-    const context_descr& get_ctx_params() const {
-        if (!is_valid()) {
-            throw std::runtime_error("observer invocation params are not ready");
-        }
-        return out_params;
-    }
-
-private:
-    producer_description in_params;
-    // TODO: can we guarantee that this object is not destroyed before l0 entry and
-    // use const& here?
-    coll_param_gpu kernel_params;
-    context_descr out_params;
-    bool valid;
-};
-
-struct session_key {
-    using hash_core_t = size_t;
-
-    friend std::ostream& operator<<(std::ostream& out, const session_key& key) {
-        out << key.to_string();
-        return out;
-    }
-
-    template <class T>
-    session_key(const T* src) : hash(std::hash<const T*>{}(src)) {}
-
-    bool operator<(const session_key& other) const noexcept {
-        return hash < other.hash;
-    }
-
-    std::string to_string() const {
-        return std::to_string(hash);
-    }
-
-private:
-    hash_core_t hash;
-};
-
-struct session_notification {
-    session_notification(void* addr, size_t size_bytes)
-            : host_src_ptr(addr),
-              src_size_bytes(size_bytes) {}
-    void* host_src_ptr;
-    size_t src_size_bytes;
-};
-
-} // namespace observer
-} // namespace native
diff --git a/src/common/comm/l0/context/scale/base/base_session_table.hpp b/src/common/comm/l0/context/scale/base/base_session_table.hpp
deleted file mode 100644
index 574127381..000000000
--- a/src/common/comm/l0/context/scale/base/base_session_table.hpp
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <atomic>
-#include <map>
-#include <memory>
-
-#include "common/comm/l0/context/scale/base/base_session.hpp"
-#include "common/comm/l0/modules/supported_modules.hpp"
-
-namespace native {
-namespace observer {
-
-// session owner, not thread-safe
-template <class session_interface>
-struct session_table {
-    using session_key_t = session_key;
-    using session_interface_t = session_interface;
-    using session_interface_ptr_t = std::shared_ptr<session_interface_t>;
-
-    template <template <ccl::device_topology_type, class...> class specific_session,
-              ccl::device_topology_type class_id,
-              class invoke_params_type>
-    session_interface_ptr_t create_session(const session_key_t& key,
-                                           invoke_params_type& params,
-                                           size_t observer_domain_index,
-                                           size_t observer_domain_count) {
-        using specific_session_impl = specific_session<class_id, invoke_params_type>;
-
-        static_assert(std::is_base_of<session_interface_t, specific_session_impl>::value,
-                      "Relationship IS-A `specific_session` for `session_interface_t` failed");
-
-        auto sess = std::make_shared<specific_session_impl>(params.get_producer_params(),
-                                                            params.get_kernel_params(),
-                                                            observer_domain_index,
-                                                            observer_domain_count,
-                                                            key);
-
-        params.set_out_params(sess->get_ctx_descr());
-        sessions.emplace(key, sess);
-
-        return sess;
-    }
-
-    size_t get_unique_tag() {
-        static std::atomic<size_t> tag_counter{ 1 };
-        return tag_counter.fetch_add(1);
-    }
-
-    std::string to_string() const {
-        std::stringstream ss;
-        ss << "sessions count: " << sessions.size() << std::endl;
-        for (const auto& val : sessions) {
-            ss << "[" << val.first << ", " << reinterpret_cast<void*>(val.second.get()) << "]\n"
-               << val.second->to_string() << std::endl;
-        }
-        return ss.str();
-    }
-
-    std::map<session_key_t, session_interface_ptr_t> sessions{};
-};
-} // namespace observer
-} //namespace native
diff --git a/src/common/comm/l0/context/scale/ipc/ipc_ctx.hpp b/src/common/comm/l0/context/scale/ipc/ipc_ctx.hpp
deleted file mode 100644
index fcee3dee6..000000000
--- a/src/common/comm/l0/context/scale/ipc/ipc_ctx.hpp
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include <atomic>
-#include <condition_variable>
-#include <list>
-#include <mutex>
-#include <thread>
-#include <vector>
-#include "common/comm/l0/context/base_scaling_ctx.hpp"
-#include "common/comm/l0/context/scale/ipc/ipc_session_key.hpp"
-#include "common/comm/l0/context/scale/ipc/ipc_ctx_session.hpp"
-
-namespace ccl {
-class host_communicator;
-}
-
-namespace native {
-
-class ccl_gpu_comm;
-class ccl_virtual_gpu_comm;
-
-class ccl_ipc_gpu_comm;
-
-template <class device>
-class ccl_ipc_source_gpu_comm;
-
-struct session_table;
-class session;
-
-template <class Impl, ccl::device_topology_type... types>
-class ipc_ctx : public observer::base_scaling_ctx<ipc_ctx<Impl, types...>,
-                                                  ccl_ipc_source_gpu_comm<ccl_gpu_comm>,
-                                                  ccl_ipc_source_gpu_comm<ccl_virtual_gpu_comm>,
-                                                  ccl_ipc_gpu_comm> {
-public:
-    static_assert(sizeof...(types), "types must be not 0");
-    using context_impl = Impl;
-
-    template <class device_t>
-    using observer_t = ccl_ipc_source_gpu_comm<device_t>;
-
-    using scaling_ctx_base_t = observer::base_scaling_ctx<ipc_ctx<Impl, types...>,
-                                                          observer_t<ccl_gpu_comm>,
-                                                          observer_t<ccl_virtual_gpu_comm>,
-                                                          ccl_ipc_gpu_comm>;
-
-    using observable_ipc_topologies =
-        typename scaling_ctx_base_t::template observable_topologies<types...>;
-
-    using indexed_observable_ipc_topologies =
-        typename scaling_ctx_base_t::template indexed_observable_topologies<types...>;
-
-    observable_ipc_topologies observables;
-    indexed_observable_ipc_topologies indexed_observables;
-
-    ipc_ctx() {}
-
-    ~ipc_ctx() {
-        stop.store(true);
-        delivery_condition.notify_all();
-
-        for (auto& thread : listener_thread_map) {
-            thread.second->join();
-        }
-    }
-
-    void initialize_ctx(std::shared_ptr<ccl::host_communicator> communicator);
-
-    // session data
-    template <class IPC_source_device_t>
-    struct ipc_src_session_data {
-        std::map<IPC_source_device_t*, std::shared_ptr<session_table>> source_sessions;
-    };
-
-    using session_table_t = std::tuple<ipc_src_session_data<observer_t<ccl_gpu_comm>>,
-                                       ipc_src_session_data<observer_t<ccl_virtual_gpu_comm>>>;
-    std::map<ccl_coll_type, session_table_t> collective_sessions;
-
-    //observer subject interface implementations
-    template <class device_t, ccl::device_topology_type topology_type>
-    void attach_ctx_observer(size_t rank_addr,
-                             observer_t<device_t>* observer_ptr,
-                             std::integral_constant<ccl::device_topology_type, topology_type> val) {
-        register_observer_impl<topology_type>(rank_addr, observer_ptr);
-    }
-
-    template <ccl::device_topology_type topology_type>
-    void attach_ctx_observer(size_t rank_addr,
-                             ccl_ipc_gpu_comm* observer_ptr,
-                             std::integral_constant<ccl::device_topology_type, topology_type> val) {
-        register_observer_impl<topology_type>(rank_addr, observer_ptr);
-    }
-
-    template <class device_t, ccl::device_topology_type class_id, class ipc_invoke_params_t>
-    void invoke_ctx_observer(observer_t<device_t>* observer_ptr,
-                             std::integral_constant<ccl::device_topology_type, class_id> val,
-                             const ipc_session_key& session_key,
-                             ipc_invoke_params_t&& param) {
-        // sanity - check registered proxy
-        observer::container_t<observer_t<device_t>>& container =
-            scaling_ctx_base_t::template get_types_container<observer_t<device_t>, class_id>(
-                observables);
-
-        auto it = container.find(observer_ptr);
-        if (it == container.end()) {
-            throw std::runtime_error(std::string("Observer is not registered: ") +
-                                     observer_ptr->to_string() +
-                                     " total count: " + std::to_string(container.size()));
-        }
-
-        //Try to find existing session owner for coll type
-        auto coll_session_table_it = collective_sessions.find(ipc_invoke_params_t::get_coll_type());
-        if (coll_session_table_it == collective_sessions.end()) {
-            std::stringstream ss;
-            for (const auto& val : collective_sessions) {
-                ss << ccl_coll_type_to_str(val.first) << ", ";
-            }
-            LOG_ERROR("session_key: ",
-                      session_key.to_string(),
-                      ", cannot find collective session table for key: ",
-                      ccl_coll_type_to_str(ipc_invoke_params_t::get_coll_type()),
-                      ". Available keys: ",
-                      ss.str());
-            abort();
-        }
-
-        auto& sessions_table = ccl_tuple_get<ipc_src_session_data<observer_t<device_t>>>(
-            coll_session_table_it->second);
-        auto session_table_it = sessions_table.source_sessions.find(observer_ptr);
-        if (session_table_it == sessions_table.source_sessions.end()) {
-            std::stringstream ss;
-            ss << "sessions count: " << sessions_table.source_sessions.size() << std::endl;
-            for (const auto& val : sessions_table.source_sessions) {
-                ss << val.first->to_string() << ", " << val.second->to_string() << std::endl;
-            }
-            LOG_ERROR("session_key: ",
-                      session_key.to_string(),
-                      ", cannot find source session for device: ",
-                      observer_ptr->to_string(),
-                      ". Available keys: ",
-                      ss.str());
-            abort();
-        }
-
-        std::shared_ptr<session_table> table = session_table_it->second;
-        if (!table) {
-            LOG_ERROR("session_key: ", session_key.to_string(), ", session table is empty. Abort");
-            abort();
-        }
-
-        // TODO: WA: destroy all sessions that were before
-        // (only one session is always active)
-        // without this WA, we hang in kernels when reusing sessions
-        // because other sessions have the same key accidentally.
-        // It will works for GPU cache enabled but invalid without cache
-        table->sessions.clear();
-
-        std::shared_ptr<session> sess;
-        auto session_it = table->sessions.find(session_key);
-        if (session_it == table->sessions.end()) {
-            LOG_DEBUG("create new session session_key: ",
-                      session_key.to_string(),
-                      ", current sessions count: ",
-                      table->sessions.size());
-            const auto& comm_addr =
-                observer_ptr->template get_comm_data<ccl::group_split_type::cluster,
-                                                     ccl::device_topology_type::ring>();
-
-            size_t rank_peer_addr = comm_addr.rank;
-
-            std::string peer_addr = create_ipc_addr_for_rank(rank_peer_addr);
-            sess = table->create_session<class_id>(
-                session_key, observer_ptr, peer_addr, std::move(param), comm_addr.rank);
-        }
-        else {
-            //renew existing
-            sess = session_it->second;
-            LOG_DEBUG("session reuse: session_key: ",
-                      session_key.to_string(),
-                      ", current sessions count: ",
-                      table->sessions.size());
-        }
-
-        append_session_for_processing(session_key, sess);
-    }
-
-    void send_stop();
-
-private:
-    std::string create_ipc_addr_for_rank(size_t rank) const;
-    template <ccl::device_topology_type topology_type, class device_t>
-    void register_observer_impl(size_t rank_addr, observer_t<device_t>* observer_ptr);
-
-    template <ccl::device_topology_type topology_type>
-    void register_observer_impl(size_t rank_addr, ccl_ipc_gpu_comm* observer_ptr);
-
-    std::atomic<bool> stop;
-    std::mutex delivery_mutex;
-    std::condition_variable delivery_condition;
-    std::list<std::shared_ptr<session>> processing_queue;
-
-    std::map<ccl_ipc_gpu_comm*, std::unique_ptr<std::thread>> listener_thread_map;
-
-    void listener(ccl_ipc_gpu_comm* listener_device);
-
-    void append_session_for_processing(const ipc_session_key& session_key,
-                                       std::shared_ptr<session> sess);
-};
-} // namespace native
diff --git a/src/common/comm/l0/context/scale/ipc/ipc_ctx_impl.hpp b/src/common/comm/l0/context/scale/ipc/ipc_ctx_impl.hpp
deleted file mode 100644
index 434ad7024..000000000
--- a/src/common/comm/l0/context/scale/ipc/ipc_ctx_impl.hpp
+++ /dev/null
@@ -1,275 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/context/scale/ipc/ipc_ctx.hpp"
-#include "common/utils/tuple.hpp"
-
-#include "common/comm/l0/context/scale/ipc/ipc_ctx_session.hpp"
-#include "common/log/log.hpp"
-#include "common/comm/host_communicator/host_communicator.hpp"
-#include "common/comm/l0/devices/communication_structs/ipc_client.hpp"
-#include "common/comm/l0/devices/communication_structs/ipc_server.hpp"
-#include "common/comm/l0/devices/communication_structs/ipc_connection.hpp"
-
-namespace native {
-
-#define TEMPLATE_DECL_ARG class Impl, ccl::device_topology_type... types
-#define TEMPLATE_DEF_ARG  Impl, types...
-
-/*
-template <TEMPLATE_DECL_ARG>
-ipc_ctx<TEMPLATE_DEF_ARG>::~ipc_ctx() {
-
-    send_stop();
-    delivery_thread.join();
-}
-*/
-template <TEMPLATE_DECL_ARG>
-void ipc_ctx<TEMPLATE_DEF_ARG>::initialize_ctx(
-    std::shared_ptr<ccl::host_communicator> communicator) {
-    (void)communicator;
-    //send_stop();
-    stop.store(false);
-    LOG_DEBUG("IPC context Initialized for mpi rank: (",
-              std::to_string(communicator->rank()),
-              "/",
-              std::to_string(communicator->size()),
-              ")");
-}
-
-template <TEMPLATE_DECL_ARG>
-template <ccl::device_topology_type class_id, class device_t>
-void ipc_ctx<TEMPLATE_DEF_ARG>::register_observer_impl(size_t rank_addr,
-                                                       observer_t<device_t>* observer_ptr) {
-    LOG_DEBUG(
-        "device rank addr: ", std::to_string(rank_addr), ", device: ", observer_ptr->to_string());
-    observer::container_t<observer_t<device_t>>& container =
-        scaling_ctx_base_t::template get_types_container<observer_t<device_t>, class_id>(
-            observables);
-    auto cont_it = container.find(observer_ptr);
-    if (cont_it == container.end()) {
-        container.insert(observer_ptr);
-
-        // prepare IPC session tables
-        for (size_t i = static_cast<size_t>(ccl_coll_allgatherv);
-             i < static_cast<size_t>(ccl_coll_internal);
-             i++) {
-            ccl_coll_type type = static_cast<ccl_coll_type>(i);
-
-            auto& tuple_sessions = collective_sessions[type];
-            auto& sessions_table =
-                ccl_tuple_get<ipc_src_session_data<observer_t<device_t>>>(tuple_sessions);
-            sessions_table.source_sessions.emplace(
-                observer_ptr, std::make_shared<session_table>(session_table{}));
-        }
-
-        if (rank_addr == std::numeric_limits<size_t>::max()) {
-            return; //nothing to do more
-        }
-    }
-
-    //reassign with index
-    assert(rank_addr != std::numeric_limits<size_t>::max() &&
-           "Reassign with assigned address failed");
-
-    observer::indexed_container_t<observer_t<device_t>>& indexed_container =
-        scaling_ctx_base_t::template get_types_container<observer_t<device_t>, class_id>(
-            indexed_observables);
-
-    auto indexed_it = indexed_container.find(rank_addr);
-    if (indexed_it != indexed_container.end()) {
-        // collect troubleshooting info
-        std::stringstream ss;
-        for (const auto& indexed_dev : indexed_container) {
-            ss << "rank: " << indexed_dev.first << ", dev: " << indexed_dev.second->to_string()
-               << "\n";
-        }
-        throw std::runtime_error(std::string(__PRETTY_FUNCTION__) +
-                                 "- Cannot reassing rank: " + std::to_string(rank_addr) +
-                                 " for device:\n" + observer_ptr->to_string() +
-                                 "\nBecause it registered already:\n" + ss.str());
-    }
-
-    indexed_container.emplace(rank_addr, observer_ptr);
-}
-
-template <TEMPLATE_DECL_ARG>
-template <ccl::device_topology_type class_id>
-void ipc_ctx<TEMPLATE_DEF_ARG>::register_observer_impl(size_t rank_addr,
-                                                       ccl_ipc_gpu_comm* observer_ptr) {
-    LOG_DEBUG("DST device rank addr: ",
-              std::to_string(rank_addr),
-              ", DST device: ",
-              observer_ptr->to_string());
-    observer::container_t<ccl_ipc_gpu_comm>& container =
-        scaling_ctx_base_t::template get_types_container<ccl_ipc_gpu_comm, class_id>(observables);
-    auto cont_it = container.find(observer_ptr);
-    if (cont_it == container.end()) {
-        container.insert(observer_ptr);
-
-        if (rank_addr == std::numeric_limits<size_t>::max()) {
-            return; //nothing to do more
-        }
-    }
-
-    //reassign with index
-    assert(rank_addr != std::numeric_limits<size_t>::max() &&
-           "Reassign with assigned address failed");
-
-    observer::indexed_container_t<ccl_ipc_gpu_comm>& indexed_container =
-        scaling_ctx_base_t::template get_types_container<ccl_ipc_gpu_comm, class_id>(
-            indexed_observables);
-
-    auto indexed_it = indexed_container.find(rank_addr);
-    if (indexed_it != indexed_container.end()) {
-        // collect troubleshooting info
-        std::stringstream ss;
-        for (const auto& indexed_dev : indexed_container) {
-            ss << "rank: " << indexed_dev.first << ", dev: " << indexed_dev.second->to_string()
-               << "\n";
-        }
-        throw std::runtime_error(std::string(__PRETTY_FUNCTION__) +
-                                 "- Cannot reassing rank: " + std::to_string(rank_addr) +
-                                 " for device:\n" + observer_ptr->to_string() +
-                                 "\nBecause it registered already:\n" + ss.str());
-    }
-
-    indexed_container.emplace(rank_addr, observer_ptr);
-
-    //Start IPC server for each DST device for listening incoming conections from SRC devices
-    std::string addr = create_ipc_addr_for_rank(rank_addr);
-    LOG_DEBUG("Start IPC listener for device:\n", observer_ptr->to_string(), "\nAddr: ", addr);
-    try {
-        observer_ptr->start(addr);
-        auto it = listener_thread_map.find(observer_ptr);
-        if (it != listener_thread_map.end()) {
-            throw std::runtime_error(std::string("IPC listener already exist for device: ") +
-                                     observer_ptr->to_string());
-        }
-
-        listener_thread_map.emplace(
-            observer_ptr,
-            new std::thread(&ipc_ctx<TEMPLATE_DEF_ARG>::listener, this, observer_ptr));
-        LOG_DEBUG("Listener thread started on addr: ", addr);
-    }
-    catch (const std::exception& ex) {
-        LOG_ERROR("Cannot start IPC listener on: ",
-                  addr,
-                  " for device: ",
-                  observer_ptr->to_string(),
-                  ", error: ",
-                  ex.what());
-        throw;
-    }
-}
-
-template <TEMPLATE_DECL_ARG>
-std::string ipc_ctx<TEMPLATE_DEF_ARG>::create_ipc_addr_for_rank(size_t rank) const {
-    std::string ret;
-
-    //TODO make unique for KVS group
-    ret += "IPC_[";
-    ret += std::to_string(rank);
-    ret += "]";
-    return ret;
-}
-
-template <TEMPLATE_DECL_ARG>
-void ipc_ctx<TEMPLATE_DEF_ARG>::append_session_for_processing(const ipc_session_key& session_key,
-                                                              std::shared_ptr<session> sess) {
-    LOG_DEBUG("session_key: ", session_key.to_string(), ", selected session: ", sess->to_string());
-    {
-        std::lock_guard<std::mutex> lk(delivery_mutex);
-        processing_queue.push_back(sess);
-        delivery_condition.notify_one();
-    }
-}
-
-template <TEMPLATE_DECL_ARG>
-void ipc_ctx<TEMPLATE_DEF_ARG>::send_stop() {
-    stop.store(true);
-    delivery_condition.notify_all();
-}
-
-template <TEMPLATE_DECL_ARG>
-void ipc_ctx<TEMPLATE_DEF_ARG>::listener(ccl_ipc_gpu_comm* listener_device) {
-    LOG_DEBUG("Start IPC context listener worker, Listener device: ", listener_device->to_string());
-
-    // TODO ring only, peer-to-peer case: one SRC connects to one DST
-    std::unique_ptr<net::ipc_rx_connection> incoming_connection;
-    while (!incoming_connection) {
-        try {
-            auto incoming = listener_device->process_connection();
-            if (incoming) {
-                LOG_DEBUG("Got connection on device: ", listener_device->to_string());
-                incoming_connection = std::move(incoming);
-            }
-        }
-        catch (const std::exception& ex) {
-            LOG_DEBUG("Stop requested at serving connection stage");
-            return;
-        }
-
-        if (stop.load()) {
-            LOG_DEBUG("Stop requested at serving connection stage");
-            return;
-        }
-    }
-
-    // processing incoming data from connected clients
-    LOG_DEBUG("Start IPC context processing worker, Listener device: ",
-              listener_device->to_string());
-    while (!stop.load()) {
-        //TODO choose std::list
-        decltype(processing_queue) sessions_to_execute;
-        {
-            std::unique_lock<std::mutex> lk(delivery_mutex);
-            delivery_condition.wait(lk, [this]() {
-                return !processing_queue.empty() || stop.load();
-            });
-
-            sessions_to_execute.splice(sessions_to_execute.end(), processing_queue);
-        }
-
-        LOG_DEBUG("Sessions for processing: ",
-                  sessions_to_execute.size(),
-                  " stop flag status: ",
-                  stop.load());
-        for (auto sess_it = sessions_to_execute.begin();
-             sess_it != sessions_to_execute.end() and !stop.load();) {
-            shared_session_ptr_t sess = *sess_it;
-
-            // try restore IPC handles
-            LOG_DEBUG("process session: ", sess->to_string());
-            if (!sess->process(listener_device, incoming_connection.get())) {
-                ++sess_it;
-                continue;
-            }
-
-            // bind restored IPC handles to kernel
-            sess->visit(listener_device, listener_device->get_registered_modules());
-            LOG_DEBUG("session processed: ", sess->to_string());
-
-            // find next session
-            sess_it = sessions_to_execute.erase(sess_it);
-        }
-    }
-}
-
-#undef TEMPLATE_DECL_ARG
-#undef TEMPLATE_DEF_ARG
-
-} // namespace native
diff --git a/src/common/comm/l0/context/scale/ipc/ipc_ctx_session.cpp b/src/common/comm/l0/context/scale/ipc/ipc_ctx_session.cpp
deleted file mode 100644
index baa3b0539..000000000
--- a/src/common/comm/l0/context/scale/ipc/ipc_ctx_session.cpp
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include <sstream>
-#include "common/comm/l0/context/scale/ipc/ipc_ctx_session.hpp"
-#include "common/comm/l0/context/scale/ipc/ipc_ctx_utils.hpp"
-#include "common/log/log.hpp"
-#include "common/comm/host_communicator/host_communicator.hpp"
-
-#include "common/comm/l0/devices/ccl_ipc_gpu_comm.hpp"
-
-#include "common/comm/l0/devices/communication_structs/ipc_client.hpp"
-#include "common/comm/l0/devices/communication_structs/ipc_connection.hpp"
-
-namespace native {
-
-session::session(origin_ipc_memory_container&& ipc_src_memory_handles,
-                 size_t source_ipc_device_rank)
-        : source_device_rank(source_ipc_device_rank),
-          source_ipc_memory_storage(std::move(ipc_src_memory_handles)) {
-    LOG_DEBUG("Got IPC handles: ", source_ipc_memory_storage.size());
-    for (const auto& h : source_ipc_memory_storage) {
-        LOG_DEBUG("handle: ", native::to_string(h.get()));
-    }
-
-    //to match recevied messag
-    send_tag = session_table::get_unique_tag();
-    finished.store(false);
-}
-
-session::~session() {}
-
-void session::start(net::ipc_client* client, const std::string& addr) {
-    if (!client) {
-        LOG_ERROR("Empty client for addr: ", addr);
-        abort();
-    }
-
-    if (finished.load()) {
-        //No need ask handles again
-        LOG_DEBUG("session: ", reinterpret_cast<void*>(this), ", finished already");
-        return;
-    }
-
-    //create connection and send data
-    std::shared_ptr<net::ipc_tx_connection> tx_connection = client->create_connection(addr);
-
-    // send tag and device rank additionally
-    std::array<uint8_t, sizeof(source_device_rank) + sizeof(send_tag)> payload{};
-    *reinterpret_cast<size_t*>(payload.data()) = source_device_rank;
-    *reinterpret_cast<size_t*>(payload.data() + sizeof(source_device_rank)) = send_tag;
-    source_ipc_raw_data = tx_connection->send_ipc_memory_ext(
-        source_ipc_memory_storage, payload.data(), payload.size());
-
-    //TODO RING only: peer-to-peer
-    data_to_recover.raw_data.resize(source_ipc_raw_data.size());
-    LOG_DEBUG("Rank: ",
-              source_device_rank,
-              ", prepared IPC handles exhange bytes for receive:",
-              data_to_recover.raw_data.size());
-}
-
-bool session::process(const ccl_ipc_gpu_comm* indexed_ipc_dst_devices,
-                      const net::ipc_rx_connection* incoming_connection) {
-    size_t existing_recovered_ipc_size = data_to_recover.ipc_memory_storage.size();
-    LOG_DEBUG("session: ",
-              reinterpret_cast<void*>(this),
-              ", recovered ipc storage size: ",
-              existing_recovered_ipc_size);
-    if (existing_recovered_ipc_size and finished.load()) {
-        return true;
-    }
-
-    //wait data
-    size_t received_rank = 0;
-    size_t received_tag = 0;
-    size_t handles_data_offset = sizeof(received_rank) + sizeof(received_tag);
-    std::vector<std::shared_ptr<native::ccl_device::device_ipc_memory_handle>> handles =
-        incoming_connection->receive_ipc_memory_ext(data_to_recover.raw_data, handles_data_offset);
-
-    //TODO get tag
-    received_rank = *reinterpret_cast<size_t*>(data_to_recover.raw_data.data());
-    received_tag =
-        *reinterpret_cast<size_t*>(data_to_recover.raw_data.data() + sizeof(source_device_rank));
-    (void)received_tag;
-
-    std::shared_ptr<ccl_context> ctx;
-
-    //restore handles
-    size_t num_handles = 0;
-    for (auto& recv_ip_handle : handles) {
-        std::shared_ptr<ccl_device> owner_device = recv_ip_handle->get_owner().lock();
-        LOG_DEBUG("Found IPC owner comm device: ",
-                  indexed_ipc_dst_devices->to_string(),
-                  ",\nIPC handle:\n",
-                  native::to_string(recv_ip_handle->get()));
-
-        try {
-            // restore IPC memory object from comm device
-            auto restored = owner_device->get_ipc_memory(std::move(recv_ip_handle), ctx);
-            data_to_recover.ipc_memory_storage[indexed_ipc_dst_devices].push_back(
-                std::move(restored));
-            LOG_DEBUG("IPC handle by index: ", num_handles, " restored");
-        }
-        catch (const std::exception& ex) {
-            LOG_ERROR("Cannot recover IPC handle by index: ", num_handles, ", error:\n", ex.what());
-            throw;
-        }
-        num_handles++;
-    }
-
-    // handles received
-    finished.store(true);
-    return true;
-}
-
-std::string session::to_string() const {
-    std::stringstream ss;
-    ss << "tag: " << send_tag << ", src_dev_rank: " << source_device_rank
-       << ", src_raw_size: " << source_ipc_raw_data.size()
-       << ", handles cnt: " << source_ipc_memory_storage.size()
-       << ", data_recover: " << data_to_recover.ipc_memory_storage.size()
-       << ", is finished: " << finished.load();
-
-    return ss.str();
-}
-
-size_t session::get_send_tag() const {
-    return send_tag;
-}
-
-void session_table::start_session(std::shared_ptr<session> sess,
-                                  net::ipc_client* client,
-                                  const std::string& peer_addr) {
-    sess->start(client, peer_addr);
-}
-
-size_t session_table::get_unique_tag() {
-    static std::atomic<size_t> tag_counter{ 1 };
-    return tag_counter.fetch_add(1);
-}
-
-std::string session_table::to_string() const {
-    std::stringstream ss;
-    ss << "sessions count: " << sessions.size() << std::endl;
-    for (const auto& val : sessions) {
-        ss << "[" << val.first << ", " << reinterpret_cast<void*>(val.second.get()) << "]\n"
-           << val.second->to_string() << std::endl;
-    }
-    return ss.str();
-}
-} // namespace native
diff --git a/src/common/comm/l0/context/scale/ipc/ipc_ctx_session.hpp b/src/common/comm/l0/context/scale/ipc/ipc_ctx_session.hpp
deleted file mode 100644
index 73e015f8b..000000000
--- a/src/common/comm/l0/context/scale/ipc/ipc_ctx_session.hpp
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <atomic>
-#include <map>
-#include <memory>
-#include "coll/coll_param.hpp"
-#include "common/comm/l0/context/scale/ipc/ipc_ctx_utils.hpp"
-#include "common/comm/l0/context/scale/ipc/ipc_session_key.hpp"
-#include "common/comm/l0/modules/supported_modules.hpp"
-
-namespace ccl {
-class host_communicator;
-}
-
-namespace net {
-class ipc_client;
-class ipc_rx_connection;
-} // namespace net
-
-namespace native {
-class ccl_ipc_gpu_comm;
-
-/* Low levels session
- * contains raw data for net operations
- */
-class session {
-public:
-    using raw_data_t = std::vector<uint8_t>;
-    using origin_ipc_memory_container = std::vector<ccl_device::device_ipc_memory_handle>;
-
-    session(origin_ipc_memory_container&& ipc_src_momory_handles, size_t source_ipc_device_rank);
-    virtual ~session();
-
-    session(const session& src) = delete;
-    session& operator=(const session& src) = delete;
-
-    struct recovered_handles_storage {
-        using restored_ipc_memory_container = std::vector<ccl_device::device_ipc_memory>;
-
-        raw_data_t raw_data;
-
-        std::map<const ccl_ipc_gpu_comm*, restored_ipc_memory_container> ipc_memory_storage;
-    };
-
-    void start(net::ipc_client* client, const std::string& addr);
-
-    bool process(const ccl_ipc_gpu_comm* indexed_ipc_dst_devices,
-                 const net::ipc_rx_connection* incoming_connection);
-
-    size_t get_send_tag() const;
-    std::string to_string() const;
-
-    virtual void visit(
-        const ccl_ipc_gpu_comm* source,
-        native::supported_device_modules<ipc_dst_device_coll_module>& ipc_modules) = 0;
-
-protected:
-    size_t source_device_rank{};
-    raw_data_t source_ipc_raw_data;
-    origin_ipc_memory_container source_ipc_memory_storage;
-    recovered_handles_storage data_to_recover;
-
-    size_t send_tag{};
-    std::atomic<bool> finished;
-};
-
-using shared_session_ptr_t = std::shared_ptr<session>;
-
-/* High level session
- * Contains collective communication data
- */
-template <ccl_coll_type coll_type, ccl::device_topology_type class_id>
-struct typed_ipc_session : public session {
-    typed_ipc_session(origin_ipc_memory_container&& ipc_src_memory_handles,
-                      size_t source_ipc_device_rank,
-                      const coll_param_gpu& kernel_params)
-            : session(std::move(ipc_src_memory_handles), source_ipc_device_rank),
-              kernel_params(kernel_params) {}
-
-    void visit(const ccl_ipc_gpu_comm* source,
-               native::supported_device_modules<ipc_dst_device_coll_module>& ipc_modules) override {
-        //get appropriate module
-        using module_t =
-            ipc_dst_device_coll_module<coll_type, ccl::group_split_type::cluster, class_id>;
-        std::shared_ptr<module_t>& module_ptr = std::get<::utils::enum_to_underlying(class_id)>(
-            std::get<::utils::enum_to_underlying(ccl::group_split_type::cluster)>(
-                std::get<coll_type>(ipc_modules)));
-        assert(module_ptr);
-
-        // get appropriate kernel
-        auto& kernel =
-            module_ptr->template get_class<typename module_t::main_class>().get(kernel_params);
-
-        // get recovered ipc handles
-        auto data_it = data_to_recover.ipc_memory_storage.find(source);
-        if (data_it == data_to_recover.ipc_memory_storage.end()) {
-            abort();
-        }
-
-        // bind data
-        const auto& ipc_handles = data_it->second;
-        kernel.bind_data(ipc_handles);
-    }
-
-    coll_param_gpu kernel_params;
-};
-
-// session owner
-// TODO not thread-safe
-struct session_table {
-    using session_key_t = ipc_session_key;
-
-    template <ccl::device_topology_type class_id, class ipc_invoke_params_type>
-    std::shared_ptr<session> create_session(const session_key_t& key,
-                                            net::ipc_client* client,
-                                            const std::string& peer_addr,
-                                            ipc_invoke_params_type&& params,
-                                            size_t source_device_rank) {
-        using specific_session =
-            typed_ipc_session<ipc_invoke_params_type::get_coll_type(), class_id>;
-        auto sess = std::make_shared<specific_session>(
-            std::move(params.handles), source_device_rank, params.get_kernel_params());
-        sessions.emplace(key, sess);
-
-        start_session(sess, client, peer_addr);
-        return sess;
-    }
-
-    std::string to_string() const;
-    std::map<session_key_t, shared_session_ptr_t> sessions{};
-
-    static size_t get_unique_tag();
-
-private:
-    void start_session(std::shared_ptr<session> sess,
-                       net::ipc_client* client,
-                       const std::string& peer_addr);
-};
-
-} // namespace native
diff --git a/src/common/comm/l0/context/scale/ipc/ipc_ctx_utils.cpp b/src/common/comm/l0/context/scale/ipc/ipc_ctx_utils.cpp
deleted file mode 100644
index 0233e7faf..000000000
--- a/src/common/comm/l0/context/scale/ipc/ipc_ctx_utils.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "common/log/log.hpp"
-#include "common/comm/l0/context/scale/ipc/ipc_ctx_utils.hpp"
-#include "common/comm/l0/devices/devices_declaration.hpp"
-
-namespace native {
-namespace utils {
-size_t serialize_ipc_handles(const std::vector<ccl_device::device_ipc_memory_handle>& in_ipc_memory,
-                             std::vector<uint8_t>& out_raw_data,
-                             size_t out_raw_data_initial_offset_bytes) {
-    // serialize data for native allgather algo
-    out_raw_data.clear();
-    constexpr size_t handle_size = ccl_device::device_ipc_memory_handle::get_size_for_serialize();
-
-    size_t send_bytes =
-        handle_size * in_ipc_memory.size() +
-        out_raw_data_initial_offset_bytes; //ipc_data + out_raw_data_initial_offset_bytes
-    out_raw_data.resize(send_bytes);
-
-    // fill send_buf
-    size_t serialize_offset = out_raw_data_initial_offset_bytes;
-    for (const auto& ipc_handle : in_ipc_memory) {
-        serialize_offset += ipc_handle.serialize(out_raw_data, serialize_offset);
-    }
-
-    CCL_ASSERT(serialize_offset == send_bytes,
-               "Expected data to send and actually serialized are differ");
-
-    return send_bytes;
-}
-} // namespace utils
-} // namespace native
diff --git a/src/common/comm/l0/context/scale/ipc/ipc_ctx_utils.hpp b/src/common/comm/l0/context/scale/ipc/ipc_ctx_utils.hpp
deleted file mode 100644
index e97c3d2f9..000000000
--- a/src/common/comm/l0/context/scale/ipc/ipc_ctx_utils.hpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <vector>
-#include "oneapi/ccl/native_device_api/l0/device.hpp"
-
-namespace native {
-namespace utils {
-size_t serialize_ipc_handles(const std::vector<ccl_device::device_ipc_memory_handle>& in_ipc_memory,
-                             std::vector<uint8_t>& out_raw_data,
-                             size_t out_raw_data_initial_offset_bytes);
-/*
-    size_t deserialize_ipc_handles(const std::vector<ccl_device::device_ipc_memory_handle>& in_ipc_memory,
-                                 std::vector<uint8_t>& out_raw_data,
-                                 size_t out_raw_data_initial_offset_bytes);*/
-} // namespace utils
-} // namespace native
diff --git a/src/common/comm/l0/context/scale/ipc/ipc_session_key.cpp b/src/common/comm/l0/context/scale/ipc/ipc_session_key.cpp
deleted file mode 100644
index 6acabdfa8..000000000
--- a/src/common/comm/l0/context/scale/ipc/ipc_session_key.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "common/comm/l0/context/scale/ipc/ipc_session_key.hpp"
-
-namespace native {
-
-bool ipc_session_key::operator<(const ipc_session_key& other) const noexcept {
-    return hash < other.hash;
-}
-
-std::string ipc_session_key::to_string() const {
-    return std::to_string(hash);
-}
-
-} // namespace native
diff --git a/src/common/comm/l0/context/scale/ipc/ipc_session_key.hpp b/src/common/comm/l0/context/scale/ipc/ipc_session_key.hpp
deleted file mode 100644
index 42e36548e..000000000
--- a/src/common/comm/l0/context/scale/ipc/ipc_session_key.hpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <functional>
-#include <string>
-#include <vector>
-
-#include "oneapi/ccl/native_device_api/l0/device.hpp"
-#include "coll/algorithms/algorithms_enum.hpp"
-#include "coll/coll_param.hpp"
-
-namespace native {
-
-template <ccl_coll_type type>
-struct ipc_invoke_params {
-    ipc_invoke_params(std::vector<ccl_device::device_ipc_memory_handle>&& h,
-                      const coll_param_gpu& params)
-            : handles(std::move(h)),
-              params{ params } {}
-
-    static constexpr ccl_coll_type get_coll_type() {
-        return type;
-    }
-
-    const coll_param_gpu& get_kernel_params() const {
-        return params;
-    }
-
-    std::vector<ccl_device::device_ipc_memory_handle> handles;
-    // TODO: can we guarantee that this object is not destroyed before l0 entry and
-    // use const& here?
-    coll_param_gpu params;
-};
-
-struct ipc_session_key {
-    using hash_core_t = size_t;
-
-    friend std::ostream& operator<<(std::ostream& out, const ipc_session_key& key) {
-        out << key.to_string();
-        return out;
-    }
-
-    template <class T>
-    ipc_session_key(const T* src) : hash(std::hash<const T*>{}(src)) {}
-
-    bool operator<(const ipc_session_key& other) const noexcept;
-
-    std::string to_string() const;
-
-private:
-    hash_core_t hash;
-};
-} // namespace native
diff --git a/src/common/comm/l0/context/scale/numa/numa_ctx.hpp b/src/common/comm/l0/context/scale/numa/numa_ctx.hpp
deleted file mode 100644
index 7a1b30ae0..000000000
--- a/src/common/comm/l0/context/scale/numa/numa_ctx.hpp
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/context/base_scaling_ctx.hpp"
-#include "common/comm/l0/context/scale/base/base_session.hpp"
-#include "common/comm/l0/context/scale/base/base_session_table.hpp"
-#include "common/comm/l0/context/scale/numa/numa_session.hpp"
-
-namespace native {
-
-class ccl_gpu_comm;
-class ccl_virtual_gpu_comm;
-
-template <class device>
-class ccl_numa_proxy;
-
-#define NUMA_CTX_DEVICE_PROXY_TYPES(observer_type) \
-    observer_type<ccl_gpu_comm>, observer_type<ccl_virtual_gpu_comm>
-
-template <class Impl, ccl::device_topology_type... types>
-class numa_ctx : public observer::base_scaling_ctx<numa_ctx<Impl, types...>,
-                                                   NUMA_CTX_DEVICE_PROXY_TYPES(ccl_numa_proxy)> {
-public:
-    static_assert(sizeof...(types), "types must be not 0");
-    using context_impl = Impl;
-
-    template <class device_t>
-    using observer_t = ccl_numa_proxy<device_t>;
-
-    using scaling_ctx_base_t = observer::base_scaling_ctx<numa_ctx<Impl, types...>,
-                                                          NUMA_CTX_DEVICE_PROXY_TYPES(observer_t)>;
-
-    using session_t = observer::numa_session_iface; //TODO: numa_session
-    using session_ptr_t = std::shared_ptr<session_t>;
-    using base_session_table_t = observer::session_table<session_t>;
-    using base_session_table_ptr_t = std::shared_ptr<base_session_table_t>;
-
-    using numa_actor = observer::subscribed_actor<session_ptr_t, observer::session_notification>;
-
-    using observable_scale_up_topologies =
-        typename scaling_ctx_base_t::template observable_topologies<types...>;
-
-    using indexed_observable_topologies =
-        typename scaling_ctx_base_t::template indexed_observable_topologies<types...>;
-
-    observable_scale_up_topologies observables;
-    indexed_observable_topologies indexed_observables;
-
-    // session data
-    template <class NUMA_source_device_t, ccl_coll_type coll_type>
-    struct device_session_data {
-        std::map<NUMA_source_device_t*, base_session_table_ptr_t> source_sessions;
-    };
-
-    //TODO make table PER thread!!!
-    template <ccl_coll_type coll_type, class... devices_types>
-    using session_table_t = std::tuple<device_session_data<devices_types, coll_type>...>;
-
-    template <ccl_coll_type... coll_type>
-    using session_table_typed_storage_t =
-        std::tuple<session_table_t<coll_type, NUMA_CTX_DEVICE_PROXY_TYPES(observer_t)>...>;
-
-    struct session_table_initializer {
-        template <ccl_coll_type coll_type, class device_t>
-        void operator()(session_table_t<coll_type, NUMA_CTX_DEVICE_PROXY_TYPES(observer_t)>& table,
-                        observer_t<device_t>* observer_ptr) {
-            auto& sessions_table =
-                ccl_tuple_get<device_session_data<observer_t<device_t>, coll_type>>(table);
-            sessions_table.source_sessions.emplace(
-                observer_ptr, std::make_shared<base_session_table_t>(base_session_table_t{}));
-        }
-    };
-
-    session_table_typed_storage_t<CCL_COLL_LIST> collective_sessions;
-
-    //observer subject interface implementations
-    template <class device_t, ccl::device_topology_type topology_type>
-    void attach_ctx_observer(size_t rank_addr,
-                             observer_t<device_t>* observer_ptr,
-                             std::integral_constant<ccl::device_topology_type, topology_type> val) {
-        register_observer_impl<topology_type>(rank_addr, observer_ptr);
-    }
-
-    template <class device_t, ccl::device_topology_type class_id, class numa_invoke_params_t>
-    void invoke_ctx_observer(observer_t<device_t>* observer_ptr,
-                             std::integral_constant<ccl::device_topology_type, class_id> val,
-                             const observer::session_key& sess_key,
-                             numa_invoke_params_t& param) {
-        throw std::runtime_error("TODO - NUMA use-case is not implemented");
-
-        // sanity - check registered proxy
-        observer::container_t<observer_t<device_t>>& container =
-            scaling_ctx_base_t::template get_types_container<observer_t<device_t>, class_id>(
-                observables);
-
-        auto it = container.find(observer_ptr);
-        if (it == container.end()) {
-            throw std::runtime_error(std::string("Observer is not registered: ") +
-                                     observer_ptr->to_string() +
-                                     " total count: " + std::to_string(container.size()));
-        }
-        size_t registered_index = std::distance(container.begin(), it);
-
-        static constexpr ccl_coll_type coll_type = numa_invoke_params_t::get_coll_type();
-        //Try to find existing session owner for coll type
-        auto& sessions_table = ccl_tuple_get<device_session_data<observer_t<device_t>, coll_type>>(
-            std::get<coll_type>(collective_sessions));
-
-        // In general way sessions_table.source_sessions.find(observer_ptr) has multithreading access,
-        // But it has write access only in wire up-phase, when observers are inserted from topology construction
-        // Multithreading access here is served by model "multiple-readers - no writers"
-        // and can be used without mutex protection
-        auto session_table_it = sessions_table.source_sessions.find(observer_ptr);
-        if (session_table_it == sessions_table.source_sessions.end()) {
-            std::stringstream ss;
-            ss << "sessions count: " << sessions_table.source_sessions.size() << std::endl;
-            for (const auto& val : sessions_table.source_sessions) {
-                ss << val.first->to_string() << ", " << val.second->to_string() << std::endl;
-            }
-            LOG_ERROR("session_key: ",
-                      sess_key.to_string(),
-                      ", cannot find source session for device: ",
-                      observer_ptr->to_string(),
-                      ". Available keys: ",
-                      ss.str());
-            abort();
-        }
-
-        base_session_table_ptr_t table = session_table_it->second;
-        if (!table) {
-            LOG_ERROR("session_key: ", sess_key.to_string(), ", session table is empty. Abort");
-            abort();
-        }
-
-        session_ptr_t sess;
-        LOG_DEBUG("session_key: ",
-                  sess_key.to_string(),
-                  ", current sessions count: ",
-                  table->sessions.size());
-        auto session_it = table->sessions.find(sess_key);
-        if (session_it == table->sessions.end()) {
-            //create new session
-            sess = table->template create_session<observer::numa_session, class_id>(
-                sess_key, param, registered_index, registered_devices_count);
-        }
-        else {
-            //renew existing
-            sess = session_it->second;
-            sess->prepare(
-                registered_index, registered_devices_count, reinterpret_cast<void*>(&param));
-
-            //param.reset_staged_counters(registered_index, container.size());
-        }
-
-        // notify actor-owner
-        const auto& thread_map =
-            ccl_tuple_get<observer::device_thread_map<observer_t<device_t>, numa_actor>>(
-                numa_workers);
-        auto actor_it = thread_map.find(observer_ptr);
-        if (actor_it == thread_map.end()) {
-            LOG_ERROR("Unregistered observer: ",
-                      observer_ptr->to_string(),
-                      ", thread_map size: ",
-                      thread_map.size(),
-                      " . Abort");
-            abort();
-        }
-
-        actor_it->second->start_job(sess);
-    }
-
-private:
-    template <ccl::device_topology_type topology_type, class device_t>
-    void register_observer_impl(size_t rank_addr, observer_t<device_t>* observer_ptr);
-
-    using specific_device_tuple_thread_map_t =
-        observer::multiple_device_thread_map_t<numa_actor, NUMA_CTX_DEVICE_PROXY_TYPES(observer_t)>;
-    specific_device_tuple_thread_map_t numa_workers;
-
-    template <class device_t>
-    void worker(observer_t<device_t>* device,
-                numa_actor* actor_ptr,
-                typename numa_actor::storage_t& todo_list);
-    size_t registered_devices_count{};
-};
-} // namespace native
diff --git a/src/common/comm/l0/context/scale/numa/numa_ctx_impl.hpp b/src/common/comm/l0/context/scale/numa/numa_ctx_impl.hpp
deleted file mode 100644
index 9c4deb598..000000000
--- a/src/common/comm/l0/context/scale/numa/numa_ctx_impl.hpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/context/scale/numa/numa_ctx.hpp"
-#include "common/utils/tuple.hpp"
-#include "common/log/log.hpp"
-
-namespace native {
-
-#define TEMPLATE_DECL_ARG class Impl, ccl::device_topology_type... types
-#define TEMPLATE_DEF_ARG  Impl, types...
-
-template <TEMPLATE_DECL_ARG>
-template <ccl::device_topology_type class_id, class device_t>
-void numa_ctx<TEMPLATE_DEF_ARG>::register_observer_impl(size_t rank_addr,
-                                                        observer_t<device_t>* observer_ptr) {
-    LOG_DEBUG(
-        "device rank addr: ", std::to_string(rank_addr), ", device: ", observer_ptr->to_string());
-    observer::container_t<observer_t<device_t>>& container =
-        scaling_ctx_base_t::template get_types_container<observer_t<device_t>, class_id>(
-            observables);
-    auto cont_it = container.find(observer_ptr);
-    if (cont_it == container.end()) {
-        container.insert(observer_ptr);
-
-        // remember total count
-        registered_devices_count++;
-
-        // prepare session tables
-        session_table_initializer init;
-        ccl_tuple_for_each_args(collective_sessions, init, observer_ptr);
-
-        if (rank_addr == std::numeric_limits<size_t>::max()) {
-            return; //nothing to do more
-        }
-    }
-
-    //reassign with index
-    assert(rank_addr != std::numeric_limits<size_t>::max() &&
-           "Reassign with assigned address failed");
-
-    observer::indexed_container_t<observer_t<device_t>>& indexed_container =
-        scaling_ctx_base_t::template get_types_container<observer_t<device_t>, class_id>(
-            indexed_observables);
-
-    auto indexed_it = indexed_container.find(rank_addr);
-    if (indexed_it != indexed_container.end()) {
-        // collect troubleshooting info
-        std::stringstream ss;
-        for (const auto& indexed_dev : indexed_container) {
-            ss << "rank: " << indexed_dev.first << ", dev: " << indexed_dev.second->to_string()
-               << "\n";
-        }
-        throw std::runtime_error(std::string(__PRETTY_FUNCTION__) +
-                                 "- Cannot reassing rank: " + std::to_string(rank_addr) +
-                                 " for NUMA device:\n" + observer_ptr->to_string() +
-                                 "\nBecause it registered already:\n" + ss.str());
-    }
-
-    indexed_container.emplace(rank_addr, observer_ptr);
-
-    // start NUMA worker
-    auto& thread_map =
-        ccl_tuple_get<observer::device_thread_map<observer_t<device_t>, numa_actor>>(numa_workers);
-    {
-        std::unique_ptr<numa_actor> new_actor{ new numa_actor(
-            rank_addr, &numa_ctx<TEMPLATE_DEF_ARG>::worker<device_t>, this, observer_ptr) };
-        observer::detail::actor_visitor visitor;
-        ccl_tuple_for_each_args(numa_workers, visitor, new_actor.get());
-        thread_map[observer_ptr] = std::move(new_actor);
-    }
-}
-
-template <TEMPLATE_DECL_ARG>
-template <class device_t>
-void numa_ctx<TEMPLATE_DEF_ARG>::worker(observer_t<device_t>* listener_device,
-                                        numa_actor* actor_ptr,
-                                        typename numa_actor::storage_t& todo_list) {
-    size_t total_actors_count = actor_ptr->get_subscriptions_count();
-
-    LOG_DEBUG("Start NUMA context worker, Listener device: ",
-              listener_device->to_string(),
-              ",\nactor_id: ",
-              actor_ptr->get_id(),
-              ", total_actors_count: ",
-              total_actors_count,
-              ",\ntotal devices: ",
-              registered_devices_count,
-              ",\ntodo list size: ",
-              todo_list.size());
-
-    for (auto sess_it = todo_list.begin(); sess_it != todo_list.end();) {
-        void* partial_chunk = nullptr;
-        size_t partial_chunk_size = 0;
-
-        // get own device partial chunk data
-        (*sess_it)->produce_data(&partial_chunk, partial_chunk_size);
-        if (partial_chunk_size > 0) {
-            // notify other actor for data_ready
-            observer::detail::actor_publisher<session_ptr_t, observer::session_notification>
-                visitor;
-            ccl_tuple_for_each_args(numa_workers,
-                                    visitor,
-                                    (*sess_it)->get_send_tag(),
-                                    actor_ptr->get_id(),
-                                    partial_chunk,
-                                    partial_chunk_size);
-        }
-
-        // consume data_ready from other actor: starting from myself(!)
-        bool session_finished = false;
-        for (size_t actor_index = actor_ptr->get_id();
-             actor_index < actor_ptr->get_id() + total_actors_count;
-             actor_index++) {
-            std::list<typename numa_actor::mailbox_message_t> messages;
-            actor_ptr->get_mailbox_messages(
-                actor_index % total_actors_count, (*sess_it)->get_send_tag(), messages);
-
-            for (auto mess_it = messages.begin(); mess_it != messages.end(); ++mess_it) {
-                (*sess_it)->consume_data(
-                    0 /*TODO !!!! */, mess_it->host_src_ptr, mess_it->src_size_bytes);
-                session_finished = (*sess_it)->is_consumed();
-                assert(not(session_finished && std::next(mess_it, 1) != messages.end()) &&
-                       "Session are filled too early");
-            }
-        }
-
-        if (session_finished) {
-            sess_it = todo_list.erase(sess_it);
-            //TODO invoke BCast !!!
-        }
-        else {
-            ++sess_it;
-        }
-    }
-}
-#undef TEMPLATE_DECL_ARG
-#undef TEMPLATE_DEF_ARG
-
-} // namespace native
diff --git a/src/common/comm/l0/context/scale/numa/numa_session.hpp b/src/common/comm/l0/context/scale/numa/numa_session.hpp
deleted file mode 100644
index d7f6f799a..000000000
--- a/src/common/comm/l0/context/scale/numa/numa_session.hpp
+++ /dev/null
@@ -1,187 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include "common/comm/l0/context/scale/base/base_session.hpp"
-
-namespace native {
-namespace observer {
-
-class numa_session_iface {
-public:
-    numa_session_iface(session_key key) : sess_key(key) {}
-    virtual ~numa_session_iface() = default;
-
-    size_t get_send_tag() const {
-        return send_tag;
-    }
-
-    const session_key& get_session_key() const {
-        return sess_key;
-    }
-
-    std::string to_string() const {
-        std::stringstream ss;
-        ss << "session key identifier: " << get_session_key();
-        return ss.str();
-    }
-
-    virtual void prepare(size_t observer_domain_index,
-                         size_t observer_domain_count,
-                         void* type_erased_param) = 0;
-
-    virtual void produce_data(void** out_chunk, size_t& out_chunk_size) = 0;
-    virtual void consume_data(size_t observer_domain_index,
-                              void* in_chunk,
-                              size_t in_chunk_size) = 0;
-    virtual bool is_consumed() noexcept = 0;
-    virtual bool is_produced() noexcept = 0;
-
-private:
-    size_t send_tag{};
-    session_key sess_key;
-};
-
-/* High level session
- * Contains collective communication data
- */
-template <ccl::device_topology_type class_id, class session_invoke_params>
-struct numa_session : public numa_session_iface {
-    using invoke_params_t = session_invoke_params;
-    using session_key_t = session_key;
-
-    numa_session(producer_description& in_param,
-                 const coll_param_gpu& kernel_params,
-                 size_t observer_domain_index,
-                 size_t observer_domain_count,
-                 const session_key_t& key)
-            : numa_session_iface(key),
-              kernel_params(kernel_params),
-              ctx_descr(kernel_params),
-              copy_immediate_list(std::move(in_param.immediate_list)) {
-        ctx_descr.init(in_param.staged_buffer_elem_count,
-                       observer_domain_index,
-                       observer_domain_count,
-                       in_param.context,
-                       in_param.device);
-    }
-
-    context_descr& get_ctx_descr() {
-        return ctx_descr;
-    }
-
-    const coll_param_gpu& get_kernel_params() const {
-        return kernel_params;
-    }
-
-    void prepare(size_t observer_domain_index,
-                 size_t observer_domain_count,
-                 void* type_erased_param) override {
-        auto* out_param = static_cast<invoke_params_t*>(type_erased_param);
-        ctx_descr.reset_counters(observer_domain_index, observer_domain_count);
-
-        out_param->set_out_params(ctx_descr);
-    }
-
-    void produce_data(void** out_chunk, size_t& out_chunk_size) override {
-        size_t old_consumed = get_ctx_descr().host_consumed_bytes;
-        uint64_t total_produced = *get_ctx_descr().host_mem_producer_counter->get();
-
-        size_t to_consume = total_produced - old_consumed;
-        if (to_consume) {
-            //fence
-            LOG_TRACE(to_string(),
-                      " - bytes produced: ",
-                      total_produced,
-                      ", previously bytes consumed: ",
-                      old_consumed);
-            std::atomic_thread_fence(std::memory_order::memory_order_seq_cst); // TODO: why?
-
-            // do not read data here!
-            *out_chunk =
-                static_cast<void*>(get_ctx_descr().host_mem_producer->get() + old_consumed);
-
-            // update host_consumed_bytes
-            get_ctx_descr().host_consumed_bytes += to_consume;
-        }
-
-        // TODO: set logging here
-        out_chunk_size = to_consume;
-    }
-
-    void consume_data(size_t observer_domain_index, void* in_chunk, size_t in_chunk_size) override {
-        /* TODO create event
-         * ze_event_handle_t mem_event {};
-         */
-
-        auto device_consumer_ready_bytes = get_ctx_descr().dev_mem_consumer_counter->get();
-        auto device_produced_bytes = get_ctx_descr().device_produced_bytes;
-
-        // TODO: set logging here
-
-        // copy buffer from host to device
-        ze_result_t res = zeCommandListAppendMemoryCopy(
-            copy_immediate_list.get(),
-            static_cast<void*>(get_ctx_descr().dev_mem_consumer->get() + device_produced_bytes),
-            in_chunk,
-            in_chunk_size,
-            /*mem_event*/ nullptr,
-            0,
-            nullptr);
-        if (res != ZE_RESULT_SUCCESS) {
-            throw std::runtime_error(
-                std::string(
-                    "cannot append copy NUMA host to device memory for partial result, error: ") +
-                native::to_string(res));
-        }
-        device_produced_bytes += in_chunk_size;
-        get_ctx_descr().device_produced_bytes = device_produced_bytes;
-
-        // TODO: set logging here
-        // copy size from host to device
-        res = zeCommandListAppendMemoryCopy(copy_immediate_list.get(),
-                                            device_consumer_ready_bytes,
-                                            &device_produced_bytes,
-                                            sizeof(device_produced_bytes),
-                                            nullptr,
-                                            0,
-                                            /*&mem_event*/ nullptr);
-        if (res != ZE_RESULT_SUCCESS) {
-            throw std::runtime_error(
-                std::string(
-                    "cannot append copy NUMA host to device memory for ready bytes, error: ") +
-                native::to_string(res));
-        }
-    }
-
-    bool is_consumed() noexcept override {
-        return (get_ctx_descr().device_produced_bytes *
-                ccl::get_datatype_size(get_kernel_params().get_datatype())) ==
-               get_ctx_descr().host_consumed_bytes;
-    }
-
-    bool is_produced() noexcept override {
-        return get_ctx_descr().host_expected_bytes == get_ctx_descr().host_consumed_bytes;
-    }
-
-private:
-    coll_param_gpu kernel_params;
-    context_descr ctx_descr;
-    ccl_device::device_cmd_list copy_immediate_list;
-};
-
-} // namespace observer
-} // namespace native
diff --git a/src/common/comm/l0/context/scale/scale_out/scale_out_ctx.hpp b/src/common/comm/l0/context/scale/scale_out/scale_out_ctx.hpp
deleted file mode 100644
index 7d1302c33..000000000
--- a/src/common/comm/l0/context/scale/scale_out/scale_out_ctx.hpp
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/context/base_scaling_ctx.hpp"
-#include "common/comm/l0/context/scale/base/base_session.hpp"
-#include "common/comm/l0/context/scale/scale_out/scale_out_session.hpp"
-#include "common/comm/l0/context/scale/base/base_session_table.hpp"
-
-namespace ccl {
-class host_communicator;
-}
-
-namespace native {
-
-class ccl_gpu_comm;
-class ccl_virtual_gpu_comm;
-
-template <class device>
-class ccl_scaleout_proxy;
-
-template <class device>
-class ccl_gpu_scaleup_proxy;
-
-template <class device>
-class ccl_numa_proxy;
-
-#define SCALE_OUT_CTX_DEVICE_PROXY_TYPES(observer_type) \
-    observer_type<ccl_gpu_comm>, observer_type<ccl_virtual_gpu_comm>, \
-        observer_type<ccl_numa_proxy<ccl_gpu_comm>>, \
-        observer_type<ccl_numa_proxy<ccl_virtual_gpu_comm>>, \
-        observer_type<ccl_gpu_scaleup_proxy<ccl_gpu_comm>>, \
-        observer_type<ccl_gpu_scaleup_proxy<ccl_virtual_gpu_comm>>, \
-        observer_type<ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_gpu_comm>>>, \
-        observer_type<ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_virtual_gpu_comm>>>
-
-template <class Impl, ccl::device_topology_type... types>
-class scale_out_ctx
-        : public observer::base_scaling_ctx<scale_out_ctx<Impl, types...>,
-                                            SCALE_OUT_CTX_DEVICE_PROXY_TYPES(ccl_scaleout_proxy)> {
-public:
-    using context_impl = Impl;
-
-    template <class device_t>
-    using observer_t = ccl_scaleout_proxy<device_t>;
-
-    using scaling_ctx_base_t =
-        observer::base_scaling_ctx<scale_out_ctx<Impl, types...>,
-                                   SCALE_OUT_CTX_DEVICE_PROXY_TYPES(observer_t)>;
-
-    using session_t = observer::scale_out_session_iface;
-    using session_ptr_t = std::shared_ptr<session_t>;
-    using spec_session_table_t = observer::session_table<session_t>;
-    using spec_session_table_ptr_t = std::shared_ptr<spec_session_table_t>;
-
-    using scaleout_actor = observer::actor<session_ptr_t>;
-
-    using observable_scale_up_topologies =
-        typename scaling_ctx_base_t::template observable_topologies<types...>;
-    using indexed_observable_topologies =
-        typename scaling_ctx_base_t::template indexed_observable_topologies<types...>;
-
-    observable_scale_up_topologies observables;
-    indexed_observable_topologies indexed_observables;
-
-    // session data
-    template <class scaleout_source_device_t, ccl_coll_type coll_type>
-    struct device_session_data {
-        std::map<scaleout_source_device_t*, spec_session_table_ptr_t> source_sessions;
-    };
-
-    //TODO make table PER thread!!!
-    template <ccl_coll_type coll_type, class... devices_types>
-    using session_table_t = std::tuple<device_session_data<devices_types, coll_type>...>;
-
-    template <ccl_coll_type... coll_type>
-    using session_table_typed_storage_t =
-        std::tuple<session_table_t<coll_type, SCALE_OUT_CTX_DEVICE_PROXY_TYPES(observer_t)>...>;
-
-    struct session_table_initializer {
-        template <ccl_coll_type coll_type, class device_t>
-        void operator()(
-            session_table_t<coll_type, SCALE_OUT_CTX_DEVICE_PROXY_TYPES(observer_t)>& table,
-            observer_t<device_t>* observer_ptr) {
-            auto& sessions_table =
-                ccl_tuple_get<device_session_data<observer_t<device_t>, coll_type>>(table);
-            sessions_table.source_sessions.emplace(
-                observer_ptr, std::make_shared<spec_session_table_t>(spec_session_table_t{}));
-        }
-    };
-
-    session_table_typed_storage_t<CCL_COLL_LIST> collective_sessions;
-
-    void initialize_ctx(std::shared_ptr<ccl::host_communicator> communicator);
-
-    //observer subject interface implementations
-    template <class device_t, ccl::device_topology_type topology_type>
-    void attach_ctx_observer(size_t rank_addr,
-                             observer_t<device_t>* observer_ptr,
-                             std::integral_constant<ccl::device_topology_type, topology_type> val) {
-        register_observer_impl<topology_type>(rank_addr, observer_ptr);
-    }
-
-    template <class device_t, ccl::device_topology_type class_id, class invoke_params_t>
-    void invoke_ctx_observer(observer_t<device_t>* observer_ptr,
-                             std::integral_constant<ccl::device_topology_type, class_id> val,
-                             const observer::session_key& sess_key,
-                             invoke_params_t& param) {
-        // sanity - check registered proxy
-        observer::container_t<observer_t<device_t>>& container =
-            scaling_ctx_base_t::template get_types_container<observer_t<device_t>, class_id>(
-                observables);
-
-        auto it = container.find(observer_ptr);
-        if (it == container.end()) {
-            throw std::runtime_error(std::string("ScaleOut Observer is not registered: ") +
-                                     observer_ptr->to_string() +
-                                     " total count: " + std::to_string(container.size()));
-        }
-        size_t registered_index = std::distance(container.begin(), it);
-
-        static constexpr ccl_coll_type coll_type = invoke_params_t::get_coll_type();
-        //Try to find existing session owner for coll type
-        auto& sessions_table = ccl_tuple_get<device_session_data<observer_t<device_t>, coll_type>>(
-            std::get<coll_type>(collective_sessions));
-        auto session_table_it = sessions_table.source_sessions.find(observer_ptr);
-        if (session_table_it == sessions_table.source_sessions.end()) {
-            std::stringstream ss;
-            ss << "sessions count: " << sessions_table.source_sessions.size() << std::endl;
-            for (const auto& val : sessions_table.source_sessions) {
-                ss << val.first->to_string() << ", " << val.second->to_string() << std::endl;
-            }
-            LOG_ERROR("session_key: ",
-                      sess_key.to_string(),
-                      ", cannot find source session for device: ",
-                      observer_ptr->to_string(),
-                      ". Available keys: ",
-                      ss.str());
-            abort();
-        }
-
-        auto table = session_table_it->second;
-        if (!table) {
-            LOG_ERROR("session_key: ", sess_key.to_string(), ", session table is empty. Abort");
-            abort();
-        }
-
-        session_ptr_t sess;
-        LOG_DEBUG("session_key: ",
-                  sess_key.to_string(),
-                  ", current sessions count: ",
-                  table->sessions.size());
-        auto session_it = table->sessions.find(sess_key);
-        if (session_it == table->sessions.end()) {
-            //create new session
-            sess = table->template create_session<observer::scale_out_session, class_id>(
-                sess_key, param, registered_index, registered_devices_count);
-        }
-        else {
-            //renew existing
-            sess = session_it->second;
-            sess->prepare(
-                registered_index, registered_devices_count, reinterpret_cast<void*>(&param));
-
-            //param.reset_counters(registered_index, container.size());
-        }
-
-        // notify actor-owner
-        const auto& thread_map =
-            ccl_tuple_get<observer::device_thread_map<observer_t<device_t>, scaleout_actor>>(
-                scaleout_workers);
-        auto actor_it = thread_map.find(observer_ptr);
-        if (actor_it == thread_map.end()) {
-            LOG_ERROR("Unregistered observer: ",
-                      observer_ptr->to_string(),
-                      ", thread_map size: ",
-                      thread_map.size(),
-                      " . Abort");
-            abort();
-        }
-
-        actor_it->second->start_job(sess);
-    }
-
-private:
-    template <ccl::device_topology_type class_id, class device_t>
-    void register_observer_impl(size_t rank_addr, observer_t<device_t>* observer_ptr);
-
-    using specific_devices_tuple_thread_map =
-        observer::multiple_device_thread_map_t<scaleout_actor,
-                                               SCALE_OUT_CTX_DEVICE_PROXY_TYPES(observer_t)>;
-    specific_devices_tuple_thread_map scaleout_workers;
-
-    template <class device_t>
-    void worker(observer_t<device_t>* device,
-                scaleout_actor* actor_ptr,
-                typename scaleout_actor::storage_t& todo_list);
-    size_t registered_devices_count{};
-
-    std::shared_ptr<ccl::host_communicator> process_communicator;
-};
-} // namespace native
diff --git a/src/common/comm/l0/context/scale/scale_out/scale_out_ctx_impl.hpp b/src/common/comm/l0/context/scale/scale_out/scale_out_ctx_impl.hpp
deleted file mode 100644
index 6cbcbb5ab..000000000
--- a/src/common/comm/l0/context/scale/scale_out/scale_out_ctx_impl.hpp
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/context/scale/scale_out/scale_out_ctx.hpp"
-#include "common/log/log.hpp"
-#include "common/comm/host_communicator/host_communicator.hpp"
-
-namespace native {
-
-#define TEMPLATE_DECL_ARG class Impl, ccl::device_topology_type... types
-#define TEMPLATE_DEF_ARG  Impl, types...
-
-template <TEMPLATE_DECL_ARG>
-void scale_out_ctx<TEMPLATE_DEF_ARG>::initialize_ctx(
-    std::shared_ptr<ccl::host_communicator> communicator) {
-    process_communicator = communicator;
-
-    LOG_DEBUG("SCALE-OUT context Initialized for mpi rank: (",
-              std::to_string(communicator->rank()),
-              "/",
-              std::to_string(communicator->size()),
-              ")");
-}
-
-// observer_ptr interface implementations
-template <TEMPLATE_DECL_ARG>
-template <ccl::device_topology_type class_id, class device_t>
-void scale_out_ctx<TEMPLATE_DEF_ARG>::register_observer_impl(size_t rank_addr,
-                                                             observer_t<device_t>* observer_ptr) {
-    LOG_DEBUG("scaleout device rank addr: ",
-              std::to_string(rank_addr),
-              ", device: ",
-              observer_ptr->to_string());
-    observer::container_t<observer_t<device_t>>& container =
-        scaling_ctx_base_t::template get_types_container<observer_t<device_t>, class_id>(
-            observables);
-    auto cont_it = container.find(observer_ptr);
-    if (cont_it == container.end()) {
-        container.insert(observer_ptr);
-        // remember total count
-        registered_devices_count++;
-
-        // prepare session tables
-        session_table_initializer init;
-        ccl_tuple_for_each_args(collective_sessions, init, observer_ptr);
-
-        if (rank_addr == std::numeric_limits<size_t>::max()) {
-            return; //nothing to do more
-        }
-    }
-
-    //reassign with index
-    assert(rank_addr != std::numeric_limits<size_t>::max() &&
-           "Reassign with assigned address failed");
-
-    observer::indexed_container_t<observer_t<device_t>>& indexed_container =
-        scaling_ctx_base_t::template get_types_container<observer_t<device_t>, class_id>(
-            indexed_observables);
-
-    auto indexed_it = indexed_container.find(rank_addr);
-    if (indexed_it != indexed_container.end()) {
-        // collect troubleshooting info
-        std::stringstream ss;
-        for (const auto& indexed_dev : indexed_container) {
-            ss << "rank: " << indexed_dev.first << ", dev: " << indexed_dev.second->to_string()
-               << "\n";
-        }
-        throw std::runtime_error(std::string(__PRETTY_FUNCTION__) +
-                                 "- Cannot reassing rank: " + std::to_string(rank_addr) +
-                                 " for SCALEOUT device:\n" + observer_ptr->to_string() +
-                                 "\nBecause it registered already:\n" + ss.str());
-    }
-
-    indexed_container.emplace(rank_addr, observer_ptr);
-
-    // start SCALEOUT worker
-    auto& thread_map =
-        ccl_tuple_get<observer::device_thread_map<observer_t<device_t>, scaleout_actor>>(
-            scaleout_workers);
-    {
-        std::unique_ptr<scaleout_actor> new_actor{ new scaleout_actor(
-            rank_addr, &scale_out_ctx<TEMPLATE_DEF_ARG>::worker<device_t>, this, observer_ptr) };
-        thread_map[observer_ptr] = std::move(new_actor);
-    }
-}
-
-template <TEMPLATE_DECL_ARG>
-template <class device_t>
-void scale_out_ctx<TEMPLATE_DEF_ARG>::worker(observer_t<device_t>* listener_device,
-                                             scaleout_actor* actor_ptr,
-                                             typename scaleout_actor::storage_t& todo_list) {
-    LOG_DEBUG("Start SCALEOUT context worker, Listener device: ",
-              listener_device->to_string(),
-              ",\nactor_id: ",
-              actor_ptr->get_id(),
-              ",\ntodo list size: ",
-              todo_list.size());
-
-    // invoke CPU collective on data chunk
-    for (auto sess_it = todo_list.begin(); sess_it != todo_list.end();) {
-        session_ptr_t sess = *sess_it;
-
-        sess->produce_data(process_communicator);
-        ++sess_it;
-    }
-
-    // check CPU collective accomplishment
-    for (auto sess_it = todo_list.begin(); sess_it != todo_list.end();) {
-        (*sess_it)->consume_data(0 /*TODO !!!! */, process_communicator);
-        if ((*sess_it)->is_consumed()) {
-            sess_it = todo_list.erase(sess_it);
-        }
-        else {
-            ++sess_it;
-        }
-    }
-}
-
-#undef TEMPLATE_DECL_ARG
-#undef TEMPLATE_DEF_ARG
-} // namespace native
diff --git a/src/common/comm/l0/context/scale/scale_out/scale_out_session.cpp b/src/common/comm/l0/context/scale/scale_out/scale_out_session.cpp
deleted file mode 100644
index 5f27b6845..000000000
--- a/src/common/comm/l0/context/scale/scale_out/scale_out_session.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include <sstream>
-
-#include "common/comm/l0/context/scale/scale_out/scale_out_session.hpp"
-#include "common/log/log.hpp"
-#include "common/comm/host_communicator/host_communicator.hpp"
-
-namespace native {
-namespace observer {
-
-std::string scale_out_session_iface::to_string() const {
-    std::stringstream ss;
-    ss << "sess: " << reinterpret_cast<const void*>(this);
-    return ss.str();
-}
-
-size_t scale_out_session_iface::get_send_tag() const {
-    return send_tag;
-}
-
-void ccl_worker_adapter::submit_coll_work(std::shared_ptr<ccl::host_communicator>& comm,
-                                          const session_notification& in,
-                                          session_notification_handle& out,
-                                          const coll_param_gpu& kernel_params) {
-    // allreduce
-    if (kernel_params.get_coll_type() == ccl_coll_allreduce) {
-        out.output_buffer.resize(in.src_size_bytes);
-        ccl::stream::impl_value_t empty_stream{};
-
-        // notice: not thread-safe
-        out.op_handle = comm->allreduce_impl(in.host_src_ptr,
-                                             out.output_buffer.data(),
-                                             in.src_size_bytes,
-                                             kernel_params.get_datatype(),
-                                             kernel_params.get_reduction(),
-                                             empty_stream,
-                                             ccl::default_allreduce_attr,
-                                             {});
-        out.op_handle_ready = true;
-    }
-}
-
-} // namespace observer
-} // namespace native
diff --git a/src/common/comm/l0/context/scale/scale_out/scale_out_session.hpp b/src/common/comm/l0/context/scale/scale_out/scale_out_session.hpp
deleted file mode 100644
index 43716108c..000000000
--- a/src/common/comm/l0/context/scale/scale_out/scale_out_session.hpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include "oneapi/ccl/types.hpp"
-#include "oneapi/ccl/type_traits.hpp"
-#include "oneapi/ccl/types_policy.hpp"
-
-#include "oneapi/ccl/event.hpp"
-#include "common/comm/l0/context/scale/base/base_session.hpp"
-#include "common/comm/l0/context/scale/numa/numa_session.hpp"
-
-namespace ccl {
-class host_communicator;
-}
-
-namespace native {
-namespace observer {
-
-class scale_out_session_iface {
-public:
-    scale_out_session_iface() = default;
-    virtual ~scale_out_session_iface() = default;
-
-    size_t get_send_tag() const;
-    std::string to_string() const;
-
-    virtual void prepare(size_t observer_domain_index,
-                         size_t observer_domain_count,
-                         void* type_erased_param) = 0;
-    virtual void produce_data(std::shared_ptr<ccl::host_communicator>& comm) = 0;
-    virtual void consume_data(size_t observer_domain_index,
-                              std::shared_ptr<ccl::host_communicator>& comm) = 0;
-    virtual bool is_consumed() noexcept = 0;
-    virtual bool is_produced() noexcept = 0;
-
-private:
-    size_t send_tag{};
-};
-
-struct session_notification_handle {
-    using notification_handle_t = ccl::event;
-    //using notification_handle_ptr_t = std::unique_ptr<notification_handle_t>;
-
-    //TODO use custom allocator instead vector
-    std::vector<uint8_t> output_buffer;
-    notification_handle_t op_handle;
-    //TODO
-    // because notification_handle_t class interface do not provide distinction
-    // between canceled and finished use special flag is denoted extended state of op_handle.
-    // USE event_impl pointer instead! Fix host_communicator to return event_impl!
-    bool op_handle_ready;
-};
-
-struct ccl_worker_adapter {
-    static void submit_coll_work(std::shared_ptr<ccl::host_communicator>& comm,
-                                 const session_notification& in,
-                                 session_notification_handle& out,
-                                 const coll_param_gpu& kernel_params);
-};
-
-template <ccl::device_topology_type class_id, class session_invoke_params>
-struct scale_out_session : public scale_out_session_iface {
-    using base_t = scale_out_session_iface;
-    using invoke_params_t = session_invoke_params;
-    using session_key_t = session_key;
-
-    scale_out_session(producer_description& in_param,
-                      const coll_param_gpu& in_kernel_params,
-                      size_t observer_domain_index,
-                      size_t observer_domain_count,
-                      const session_key_t& key)
-            : base_t(),
-              proxy_session(in_param,
-                            in_kernel_params,
-                            observer_domain_index,
-                            observer_domain_count,
-                            key) {
-        //TODO use `session_invoke_params` information to calculate possible `pending_notifications` reserve
-        // based on chunk size
-        pending_notifications.reserve(16);
-    }
-
-    context_descr& get_ctx_descr() {
-        return proxy_session.get_ctx_descr();
-    }
-
-    void prepare(size_t observer_domain_index,
-                 size_t observer_domain_count,
-                 void* type_erased_param) override {
-        proxy_session.prepare(observer_domain_index, observer_domain_count, type_erased_param);
-
-        auto* out_param = static_cast<invoke_params_t*>(type_erased_param);
-
-        // allocate cpu gw staging slots
-        pending_notifications.clear();
-
-        (void)out_param;
-    }
-
-    void produce_data(std::shared_ptr<ccl::host_communicator>& comm) override {
-        void* partial_chunk = nullptr;
-        size_t partial_chunk_size = 0;
-
-        // get own device partial chunk data
-        proxy_session.produce_data(&partial_chunk, partial_chunk_size);
-        if (partial_chunk_size > 0) {
-            // notify other scaleout actors in other processes about partial my result
-            session_notification notif(partial_chunk, partial_chunk_size);
-            session_notification_handle handle;
-
-            ccl_worker_adapter::submit_coll_work(
-                comm, notif, handle, proxy_session.get_kernel_params());
-
-            pending_notifications.push_back(std::move(handle));
-        }
-    }
-
-    void consume_data(size_t observer_domain_index,
-                      std::shared_ptr<ccl::host_communicator>& comm) override {
-        for (auto it = pending_notifications.begin(); it != pending_notifications.end(); ++it) {
-            if (it->op_handle_ready) { // notice: not thread-safe
-
-                if (it->op_handle.test()) {
-                    proxy_session.consume_data(
-                        observer_domain_index,
-                        it->output_buffer.data(),
-                        it->output_buffer.size() *
-                            ccl::get_datatype_size(
-                                proxy_session.get_kernel_params().get_datatype()));
-
-                    // notice: not thread-safe
-                    it->op_handle_ready = false;
-                }
-                else {
-                    //TODO collectives on CPU side are processing sequencially
-                    // if the first handle is not ready yet, then skip following handles
-                    break;
-                }
-            }
-        }
-    }
-
-    bool is_consumed() noexcept override {
-        return proxy_session.is_consumed();
-    }
-
-    bool is_produced() noexcept override {
-        return proxy_session.is_produced();
-    }
-
-private:
-    void notify_data();
-    numa_session<class_id, invoke_params_t> proxy_session;
-    std::vector<session_notification_handle> pending_notifications;
-};
-} // namespace observer
-} // namespace native
diff --git a/src/common/comm/l0/context/scale/scale_up/scale_up_ctx.hpp b/src/common/comm/l0/context/scale/scale_up/scale_up_ctx.hpp
deleted file mode 100644
index 63d540dc5..000000000
--- a/src/common/comm/l0/context/scale/scale_up/scale_up_ctx.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/context/base_scaling_ctx.hpp"
-
-namespace native {
-
-class ccl_gpu_comm;
-class ccl_virtual_gpu_comm;
-
-template <class device>
-class ccl_gpu_scaleup_proxy;
-
-template <class device>
-class ccl_numa_proxy;
-
-template <class Impl, ccl::device_topology_type... types>
-class scale_up_ctx : public observer::base_scaling_ctx<
-                         scale_up_ctx<Impl, types...>,
-                         ccl_gpu_scaleup_proxy<ccl_gpu_comm>,
-                         ccl_gpu_scaleup_proxy<ccl_virtual_gpu_comm>,
-                         ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_gpu_comm>>,
-                         ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_virtual_gpu_comm>>> {
-public:
-    using context_impl = Impl;
-
-    template <class device_t>
-    using observer_t = ccl_gpu_scaleup_proxy<device_t>;
-
-    using scaling_ctx_base_t =
-        observer::base_scaling_ctx<scale_up_ctx<Impl, types...>,
-                                   observer_t<ccl_gpu_comm>,
-                                   observer_t<ccl_virtual_gpu_comm>,
-                                   observer_t<ccl_numa_proxy<ccl_gpu_comm>>,
-                                   observer_t<ccl_numa_proxy<ccl_virtual_gpu_comm>>>;
-    using observable_scale_up_topologies =
-        typename scaling_ctx_base_t::template observable_topologies<types...>;
-
-    observable_scale_up_topologies observables;
-
-    //observer subject interface implementations
-    template <class device_t, ccl::device_topology_type topology_type>
-    void attach_ctx_observer(size_t rank_addr,
-                             observer_t<device_t>* observer_ptr,
-                             std::integral_constant<ccl::device_topology_type, topology_type> val) {
-        register_observer_impl<topology_type>(rank_addr, observer_ptr);
-    }
-
-    void invoke_ctx_observer(
-        observer_t<ccl_gpu_comm>* observer_ptr,
-        std::integral_constant<ccl::device_topology_type, ccl::device_topology_type::ring> val);
-    void invoke_ctx_observer(
-        observer_t<ccl_virtual_gpu_comm>* observer_ptr,
-        std::integral_constant<ccl::device_topology_type, ccl::device_topology_type::ring> val);
-    void invoke_ctx_observer(
-        observer_t<ccl_gpu_comm>* observer_ptr,
-        std::integral_constant<ccl::device_topology_type, ccl::device_topology_type::a2a> val);
-    void invoke_ctx_observer(
-        observer_t<ccl_virtual_gpu_comm>* observer_ptr,
-        std::integral_constant<ccl::device_topology_type, ccl::device_topology_type::a2a> val);
-
-private:
-    template <ccl::device_topology_type topology_type, class device_t>
-    void register_observer_impl(size_t rank_addr, observer_t<device_t>* observer_ptr);
-};
-} // namespace native
diff --git a/src/common/comm/l0/context/scale/scale_up/scale_up_ctx_impl.hpp b/src/common/comm/l0/context/scale/scale_up/scale_up_ctx_impl.hpp
deleted file mode 100644
index 096aa722e..000000000
--- a/src/common/comm/l0/context/scale/scale_up/scale_up_ctx_impl.hpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/context/scale/scale_up/scale_up_ctx.hpp"
-
-namespace native {
-#define TEMPLATE_DECL_ARG class Impl, ccl::device_topology_type... types
-#define TEMPLATE_DEF_ARG  Impl, types...
-
-// observer_ptr interface implementations
-template <TEMPLATE_DECL_ARG>
-template <ccl::device_topology_type topology_type, class device_t>
-void scale_up_ctx<TEMPLATE_DEF_ARG>::register_observer_impl(size_t rank_addr,
-                                                            observer_t<device_t>* observer_ptr) {
-    observer::container_t<observer_t<device_t>>& container =
-        scaling_ctx_base_t::template get_types_container<observer_t<device_t>, topology_type>(
-            observables);
-    container.insert(observer_ptr);
-}
-
-template <TEMPLATE_DECL_ARG>
-void scale_up_ctx<TEMPLATE_DEF_ARG>::invoke_ctx_observer(
-    observer_t<ccl_gpu_comm>* observer_ptr,
-    std::integral_constant<ccl::device_topology_type, ccl::device_topology_type::ring> val) {
-    observer::container_t<observer_t<ccl_gpu_comm>>& container =
-        scaling_ctx_base_t::template get_types_container<observer_t<ccl_gpu_comm>,
-                                                         ccl::device_topology_type::ring>(
-            observables);
-    auto it = container.find(observer_ptr);
-    if (it == container.end()) {
-        throw std::runtime_error(std::string("invalid proxy: ") + observer_ptr->to_string());
-    }
-
-    throw std::runtime_error(std::string("Valid proxy: ") + observer_ptr->to_string());
-}
-
-template <TEMPLATE_DECL_ARG>
-void scale_up_ctx<TEMPLATE_DEF_ARG>::invoke_ctx_observer(
-    observer_t<ccl_virtual_gpu_comm>* observer_ptr,
-    std::integral_constant<ccl::device_topology_type, ccl::device_topology_type::ring> val) {
-    throw std::runtime_error(std::string("Valid proxy: ") + observer_ptr->to_string());
-}
-
-template <TEMPLATE_DECL_ARG>
-void scale_up_ctx<TEMPLATE_DEF_ARG>::invoke_ctx_observer(
-    observer_t<ccl_gpu_comm>* observer_ptr,
-    std::integral_constant<ccl::device_topology_type, ccl::device_topology_type::a2a> val) {
-    throw std::runtime_error(std::string("Valid proxy: ") + observer_ptr->to_string());
-}
-
-template <TEMPLATE_DECL_ARG>
-void scale_up_ctx<TEMPLATE_DEF_ARG>::invoke_ctx_observer(
-    observer_t<ccl_virtual_gpu_comm>* observer_ptr,
-    std::integral_constant<ccl::device_topology_type, ccl::device_topology_type::a2a> val) {
-    throw std::runtime_error(std::string("Valid proxy: ") + observer_ptr->to_string());
-}
-#undef TEMPLATE_DECL_ARG
-#undef TEMPLATE_DEF_ARG
-} // namespace native
diff --git a/src/common/comm/l0/context/scale/scaling_context_dispatcher.hpp b/src/common/comm/l0/context/scale/scaling_context_dispatcher.hpp
deleted file mode 100644
index 88aa3ecb4..000000000
--- a/src/common/comm/l0/context/scale/scaling_context_dispatcher.hpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <tuple>
-#include <memory>
-#include <stdexcept>
-#include <vector>
-#include <type_traits>
-#include "common/comm/l0/devices/proxy_observer.hpp"
-
-namespace native {
-
-template <class Device, class... Others>
-struct ctx_resolver {};
-
-template <class Device, class T, class... Ctx>
-struct ctx_resolver<Device, T, Ctx...> {
-    using type = typename std::conditional<T::template is_registered_device_t<Device>(),
-                                           T,
-                                           typename ctx_resolver<Device, Ctx...>::type>::type;
-};
-
-struct null_ctx_t {};
-
-template <class Device, class T>
-struct ctx_resolver<Device, T> {
-    using type = typename std::
-        conditional<T::template is_registered_device_t<Device>(), T, null_ctx_t>::type;
-};
-
-template <class... Contexts>
-struct scaling_ctx_dispatcher : public Contexts... {
-    template <class device_t>
-    typename std::add_pointer<typename ctx_resolver<device_t, Contexts...>::type>::type
-    dispatch_context() {
-        using resolved_ctx_t = typename ctx_resolver<device_t, Contexts...>::type;
-
-        static_assert(not std::is_same<resolved_ctx_t, null_ctx_t>::value,
-                      "Not found scaling context type for requested `device_t`");
-
-        using scaling_ctx_pointer_t = typename std::add_pointer<resolved_ctx_t>::type;
-        return static_cast<scaling_ctx_pointer_t>(this);
-    }
-};
-
-} //namespace native
diff --git a/src/common/comm/l0/context/scaleup_ctx_types.hpp b/src/common/comm/l0/context/scaleup_ctx_types.hpp
deleted file mode 100644
index e5496b6eb..000000000
--- a/src/common/comm/l0/context/scaleup_ctx_types.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "base_scaling_ctx.hpp"
-
-namespace native {
-
-class ccl_gpu_comm;
-class ccl_virtual_gpu_comm;
-
-template <class ctx_impl_t>
-using scale_up_ctx_specific = base_scaling_ctx<ctx_impl_t, ccl_gpu_comm, ccl_virtual_gpu_comm>;
-} // namespace native
diff --git a/src/common/comm/l0/context/thread_group_ctx.cpp b/src/common/comm/l0/context/thread_group_ctx.cpp
deleted file mode 100644
index 24b42ee5a..000000000
--- a/src/common/comm/l0/context/thread_group_ctx.cpp
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "common/comm/l0/context/thread_group_ctx.hpp"
-#include "common/comm/l0/device_community_holder_impl.hpp"
-#include "common/comm/l0/topology/ring/thread_group_ring_creator.hpp"
-#include "common/comm/l0/context/device_storage.hpp"
-
-#include "common/comm/l0/scheduler/thread_group_scheduler.hpp"
-#include "common/comm/l0/context/scale/numa/numa_ctx_impl.hpp"
-
-namespace native {
-
-thread_group_context::~thread_group_context() {}
-
-bool thread_group_context::sync_barrier(const ccl::device_indices_type& device_indices_t,
-                                        ccl::context_comm_addr& comm_addr,
-                                        device_storage& devices) {
-    std::shared_ptr<specific_plain_device_storage> thread_device_list;
-
-    // Collect per thread data
-
-    //comm_addr.thread_idx = thread_device_group_ctx.size();
-    aggregate_device_indices(comm_addr.thread_idx, device_indices_t);
-
-    {
-        //check on group creation final condition
-        device_group_ctx_ptr group_ctx =
-            device_group_context::create(comm_addr, device_indices_t, devices);
-        if (false == thread_device_group_ctx.insert({ comm_addr.thread_idx, group_ctx }).second) {
-            LOG_ERROR("cannot register devices group ctx for thread idx: ", comm_addr.thread_idx);
-            abort();
-        }
-    }
-
-    LOG_DEBUG("Thread ", comm_addr.to_string(), " reached thread group communicator barrier");
-
-    // prepare device communities
-    auto& ring_container = thread_device_topology[comm_addr.thread_idx]
-                               .get_community<ccl::device_topology_type::ring>();
-    (void)ring_container;
-
-    auto& a2a_container = thread_device_topology[comm_addr.thread_idx]
-                              .get_community<ccl::device_topology_type::a2a>();
-    a2a_container.set_topology(
-        std::make_shared<device_community<ccl::device_topology_type::a2a>>(comm_addr));
-
-    if (thread_device_group_ctx.size() != comm_addr.thread_count) {
-        // not all threads are registered yet - wait for all
-        LOG_DEBUG("Thread ", comm_addr.to_string(), " waits on barrier");
-        return false; //slave thread
-    }
-
-    //Current thread finalize communicator creation
-    LOG_DEBUG("Thread ", comm_addr.to_string(), " starts hardware topologies creation");
-    {
-        std::stringstream ss;
-        thread_group_ring_topology top(*this, devices);
-        auto matrix = top.build_p2p_capability_matrix(ss, per_thread_indices);
-        if (!top.build(ss, comm_addr, per_thread_indices, matrix)) {
-            LOG_ERROR("Cannot build THREAD_GROUP_RING. Build log:\n", ss.str());
-        }
-
-        LOG_DEBUG("Topologies RING created successfully. Log:\n", ss.str());
-    }
-
-    {
-        //TODO Create A2A topology
-        LOG_DEBUG("Thread Context Topologies A2A TODO");
-    }
-
-    {
-        std::stringstream out;
-        dump_thread_topologies(out);
-        LOG_DEBUG("Thread (MASTER): ",
-                  comm_addr.to_string(),
-                  " finalized thread topology creation\n",
-                  out.str());
-    }
-    // create scheduler in final step
-    scheduler_impl.reset(new thread_group_scheduler(comm_addr.thread_count));
-    LOG_DEBUG("Final thread ", comm_addr.to_string(), " ready to communicate");
-
-    return true; //master thread
-}
-
-void thread_group_context::aggregate_device_indices(size_t thread_id,
-                                                    const ccl::device_indices_type& new_indices) {
-    per_thread_indices.insert({ thread_id, new_indices });
-}
-
-const ccl::process_device_indices_type& thread_group_context::get_thread_group_device_indices()
-    const {
-    return per_thread_indices;
-}
-
-const ccl::device_indices_type& thread_group_context::get_device_group_indices(
-    size_t thread_id) const {
-    auto it = per_thread_indices.find(thread_id);
-    if (it == per_thread_indices.end()) {
-        LOG_ERROR("Cannot find device group for thread: ", thread_id, ". Empty indices");
-        static const ccl::device_indices_type empty;
-        return empty;
-    }
-    return it->second;
-}
-
-thread_group_context::device_group_ctx_ptr thread_group_context::get_device_group_ctx(
-    size_t thread_id) {
-    auto it = thread_device_group_ctx.find(thread_id);
-    if (it == thread_device_group_ctx.end()) {
-        LOG_ERROR("Cannot find device group for thread: ", thread_id, ". Empty context");
-        return {};
-    }
-    return it->second;
-}
-
-void thread_group_context::dump_thread_topologies(std::ostream& out) const {
-    out << "Threads count: " << thread_device_topology.size() << std::endl;
-    for (auto it = thread_device_topology.begin(); it != thread_device_topology.end(); ++it) {
-        const auto& top = it->second;
-        size_t thread = it->first;
-
-        out << "\nThread Group: " << thread << " topology:\n" << top.to_string();
-    }
-}
-
-thread_group_context::scaling_context_base& thread_group_context::get_numa_ctx() {
-    return *this;
-}
-const thread_group_context::scaling_context_base& thread_group_context::get_numa_ctx() const {
-    return *this;
-}
-} // namespace native
diff --git a/src/common/comm/l0/context/thread_group_ctx.hpp b/src/common/comm/l0/context/thread_group_ctx.hpp
deleted file mode 100644
index 3e80a235f..000000000
--- a/src/common/comm/l0/context/thread_group_ctx.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/context/device_group_ctx.hpp"
-#include "common/log/log.hpp"
-
-#include "common/comm/l0/context/scale/numa/numa_ctx.hpp"
-
-namespace native {
-struct device_storage;
-struct thread_group_scheduler;
-
-struct thread_group_context : numa_ctx<thread_group_context, SUPPORTED_TOPOLOGY_CLASSES_DECL_LIST> {
-    using scaling_context_base =
-        numa_ctx<thread_group_context, SUPPORTED_TOPOLOGY_CLASSES_DECL_LIST>;
-
-    friend class device_group_ring_topology;
-    friend class thread_group_ring_topology;
-    friend class allied_process_group_ring_topology;
-
-    static constexpr ccl::group_split_type group_id() {
-        return ccl::group_split_type::process;
-    }
-
-    using topologies = device_group_community_holder<ccl::group_split_type::process,
-                                                     SUPPORTED_TOPOLOGY_CLASSES_DECL_LIST>;
-    using topologies_storage = std::map<size_t, topologies>;
-    using device_group_ctx_ptr = std::shared_ptr<device_group_context>;
-    using device_group_ctx_storage = std::map<size_t, device_group_ctx_ptr>;
-
-    ~thread_group_context();
-    bool sync_barrier(const ccl::device_indices_type& thread_device_mask,
-                      ccl::context_comm_addr& comm_addr,
-                      device_storage& devices);
-
-    const ccl::process_device_indices_type& get_thread_group_device_indices() const;
-    const ccl::device_indices_type& get_device_group_indices(size_t thread_id) const;
-
-    template <ccl::device_topology_type class_id>
-    typename std::tuple_element<class_id, typename topologies::device_topologies_t>::type&
-    get_thread_topology(size_t thread_id) {
-        auto it = thread_device_topology.find(thread_id);
-        if (it == thread_device_topology.end()) {
-            LOG_ERROR("Cannot find device group for thread: ", thread_id, ". Empty topology");
-            static
-                typename std::tuple_element<class_id,
-                                            typename topologies::device_topologies_t>::type empty;
-            return empty;
-        }
-        return it->second.get_community<class_id>();
-    }
-
-    device_group_ctx_ptr get_device_group_ctx(size_t thread_id);
-
-    std::unique_ptr<thread_group_scheduler> scheduler_impl;
-
-    void dump_thread_topologies(std::ostream& out) const;
-
-    scaling_context_base& get_numa_ctx();
-    const scaling_context_base& get_numa_ctx() const;
-
-private:
-    ccl::process_device_indices_type per_thread_indices;
-    device_group_ctx_storage thread_device_group_ctx;
-    topologies_storage thread_device_topology;
-
-    void aggregate_device_indices(size_t thread_id, const ccl::device_indices_type& new_indices);
-};
-} // namespace native
diff --git a/src/common/comm/l0/context_comm_addr.cpp b/src/common/comm/l0/context_comm_addr.cpp
deleted file mode 100644
index 5a8f6f17b..000000000
--- a/src/common/comm/l0/context_comm_addr.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include <sstream>
-
-#include "common/comm/l0/context_comm_addr.hpp"
-namespace ccl {
-
-std::string context_comm_addr::to_string() const {
-    std::stringstream ss;
-    ss << "thread(" << thread_idx << "/" << thread_count << "), rank(" << comm_rank << "/"
-       << comm_size << ")";
-    return ss.str();
-}
-} // namespace ccl
diff --git a/src/common/comm/l0/device_community.hpp b/src/common/comm/l0/device_community.hpp
deleted file mode 100644
index ba992b0cd..000000000
--- a/src/common/comm/l0/device_community.hpp
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "oneapi/ccl/types.hpp"
-#include "oneapi/ccl/comm_split_attr_ids.hpp"
-#include "common/comm/l0/device_group_routing_schema.hpp"
-#include "common/comm/l0/devices/devices_declaration.hpp"
-#include "common/comm/l0/gpu_device_types.hpp"
-#include "common/comm/l0/gpu_comm_attr.hpp"
-#include "common/comm/l0/device_community_utils.hpp"
-
-namespace native {
-
-template <ccl::device_topology_type schema_id>
-struct device_community {
-    device_community(const ccl::context_comm_addr& comm_addr)
-            : community_addr(comm_addr),
-              devices(new specific_indexed_device_storage()) {}
-
-    device_rank_table_t registered_device_id;
-
-    specific_indexed_device_storage& get_device_storage() {
-        auto& ptr = get_impl();
-        if (!ptr) {
-            abort();
-        }
-        return *ptr;
-    }
-
-    template <class device_t>
-    indexed_device_container<device_t>& get_devices() {
-        static native::indexed_device_container<device_t> empty;
-
-        return devices ? std::get<device_t::type_idx()>(*devices) : empty;
-    }
-
-    template <class device_t>
-    size_t get_device_count() const {
-        return devices ? std::get<device_t::type_idx()>(*devices).size() : 0;
-    }
-
-    template <ccl::group_split_type group_id, class... DeviceTypes>
-    void bind_device_by_id(const ccl::device_index_type& device_id,
-                           ccl::context_comm_addr& registered_addr,
-                           device_variant_t<DeviceTypes...>& out_binder,
-                           size_t preferred_rank) {
-        if (!get_impl()) {
-            std::string err_str;
-            {
-                std::stringstream str;
-                ccl_logger::format(str,
-                                   "Cannot initialize comm_addr for device id: ",
-                                   device_id,
-                                   " on topology: ",
-                                   ::to_string(group_id),
-                                   ", class: ",
-                                   ::to_string(schema_id),
-                                   ", empty device storage has got from context");
-                err_str = str.str();
-            }
-            LOG_ERROR(err_str);
-            throw std::runtime_error(err_str);
-        }
-
-        if (registered_addr.comm_rank != 0 or registered_addr.comm_size != 0) {
-            std::string err_str;
-            {
-                std::stringstream str;
-                ccl_logger::format(str,
-                                   "Cannot register_device_by_id in topology for device id: ",
-                                   device_id,
-                                   " on topology: ",
-                                   ::to_string(group_id),
-                                   ", class: ",
-                                   ::to_string(schema_id),
-                                   ", because topology registered already, comm addr:",
-                                   registered_addr.to_string());
-                err_str = str.str();
-            }
-            LOG_ERROR(err_str);
-            throw std::runtime_error(err_str);
-        }
-
-        // find device in topology and obtain its rank/sie
-        detail::rank_binder<group_id, schema_id> initializer(
-            device_id, get_device_storage(), registered_device_id, preferred_rank);
-        ccl_tuple_for_each(out_binder, initializer);
-
-        // copy shared data from community addr
-        registered_addr = community_addr;
-
-        // get individual rank from initializer
-        registered_addr.comm_rank = initializer.get_assigned_rank();
-    }
-
-    const ccl::context_comm_addr& get_comm_addr() const noexcept {
-        return community_addr;
-    }
-
-    template <ccl::group_split_type group_id>
-    std::string to_string() const {
-        std::stringstream result;
-        result << "Topology: " << ::to_string(schema_id) << "\n";
-        native::detail::printer<group_id, schema_id> p;
-        if (devices) {
-            ccl_tuple_for_each(*devices, p);
-            result << p.to_string();
-        }
-        else {
-            result << "EMPTY";
-        }
-        return result.str();
-    }
-
-private:
-    ccl::context_comm_addr community_addr;
-    std::unique_ptr<specific_indexed_device_storage>& get_impl() {
-        return devices;
-    }
-
-    std::unique_ptr<specific_indexed_device_storage> devices;
-};
-} // namespace native
diff --git a/src/common/comm/l0/device_community_holder.hpp b/src/common/comm/l0/device_community_holder.hpp
deleted file mode 100644
index 0d973af92..000000000
--- a/src/common/comm/l0/device_community_holder.hpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <memory>
-#include <tuple>
-#include <vector>
-
-#include "oneapi/ccl/types.hpp"
-#include "common/comm/l0/device_community.hpp"
-
-namespace native {
-
-template <ccl::device_topology_type class_id>
-struct device_community_container {
-    using element_type = std::shared_ptr<device_community<class_id>>;
-    using container_type = element_type;
-
-    container_type storage;
-
-    element_type get_topology() {
-        return storage;
-    }
-    void set_topology(element_type item) {
-        storage = item;
-    }
-
-    template <ccl::group_split_type group_id, class... DeviceTypes>
-    void bind_device_by_id(const ccl::device_index_type& device_id,
-                           ccl::context_comm_addr& registered_addr,
-                           device_variant_t<DeviceTypes...>& out_binder,
-                           size_t preferred_rank = std::numeric_limits<size_t>::max());
-};
-
-template <>
-struct device_community_container<ccl::device_topology_type::ring> {
-    using element_type = std::shared_ptr<device_community<ccl::device_topology_type::ring>>;
-
-    using container_type = std::vector<element_type>;
-    container_type closed_rings;
-    container_type torn_apart_rings;
-
-    const element_type get_topology(size_t ring_index = 0) const {
-        return closed_rings.at(ring_index);
-    }
-
-    element_type get_topology(size_t ring_index) {
-        return closed_rings.at(ring_index);
-    }
-    void set_topology(element_type item) {
-        closed_rings.push_back(std::move(item));
-    }
-
-    const element_type get_additiona_topology(size_t ring_index) const {
-        return torn_apart_rings.at(ring_index);
-    }
-    element_type get_additiona_topology(size_t ring_index) {
-        return torn_apart_rings.at(ring_index);
-    }
-
-    void set_additiona_topology(element_type item) {
-        torn_apart_rings.push_back(std::move(item));
-    }
-
-    template <ccl::group_split_type group_id, class... DeviceTypes>
-    void bind_device_by_id(const ccl::device_index_type& device_id,
-                           ccl::context_comm_addr& registered_addr,
-                           device_variant_t<DeviceTypes...>& out_binder,
-                           size_t preferred_rank = std::numeric_limits<size_t>::max());
-};
-
-template <ccl::group_split_type group_id, ccl::device_topology_type... class_id>
-class device_group_community_holder {
-public:
-    using device_topologies_t = std::tuple<device_community_container<class_id>...>;
-    using self_t = device_group_community_holder<group_id, class_id...>;
-
-    template <ccl::device_topology_type requested_id>
-    const device_community_container<requested_id>& get_community() const;
-
-    template <ccl::device_topology_type requested_id>
-    device_community_container<requested_id>& get_community();
-
-    template <ccl::device_topology_type requested_id, class... DeviceTypes>
-    void bind_device_by_id(const ccl::device_index_type& device_id,
-                           ccl::context_comm_addr& registered_addr,
-                           device_variant_t<DeviceTypes...>& out_binder,
-                           size_t preferred_rank = std::numeric_limits<size_t>::max());
-    std::string to_string() const;
-
-private:
-    device_topologies_t typed_communities;
-};
-} // namespace native
diff --git a/src/common/comm/l0/device_community_holder_impl.hpp b/src/common/comm/l0/device_community_holder_impl.hpp
deleted file mode 100644
index 8525fe4d7..000000000
--- a/src/common/comm/l0/device_community_holder_impl.hpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include "common/comm/l0/device_community_holder.hpp"
-#include "common/comm/l0/device_community_holder_utils.hpp"
-
-namespace native {
-#define TEMPLATE_DECL_ARG ccl::group_split_type group_id, ccl::device_topology_type... class_id
-#define TEMPLATE_DEF_ARG  group_id, class_id...
-
-// community impl
-template <ccl::device_topology_type class_id>
-template <ccl::group_split_type group_id, class... DeviceTypes>
-void device_community_container<class_id>::bind_device_by_id(
-    const ccl::device_index_type& device_id,
-    ccl::context_comm_addr& registered_addr,
-    device_variant_t<DeviceTypes...>& out_device_binder,
-    size_t preferred_rank) {
-    storage->template bind_device_by_id<group_id>(
-        device_id, registered_addr, out_device_binder, preferred_rank);
-}
-
-template <ccl::group_split_type group_id, class... DeviceTypes>
-void device_community_container<ccl::device_topology_type::ring>::bind_device_by_id(
-    const ccl::device_index_type& device_id,
-    ccl::context_comm_addr& registered_addr,
-    device_variant_t<DeviceTypes...>& out_device_binder,
-    size_t preferred_rank) {
-    for (auto it = closed_rings.begin(); it != closed_rings.end(); ++it) {
-        (*it)->template bind_device_by_id<group_id>(
-            device_id, registered_addr, out_device_binder, preferred_rank);
-    }
-
-    for (auto it = torn_apart_rings.begin(); it != torn_apart_rings.end(); ++it) {
-        (*it)->template bind_device_by_id<group_id>(
-            device_id, registered_addr, out_device_binder, preferred_rank);
-    }
-}
-
-// implementation
-template <TEMPLATE_DECL_ARG>
-template <ccl::device_topology_type requested_id>
-const device_community_container<requested_id>&
-device_group_community_holder<TEMPLATE_DEF_ARG>::get_community() const {
-    return std::get<requested_id>(typed_communities);
-}
-
-template <TEMPLATE_DECL_ARG>
-template <ccl::device_topology_type requested_id>
-device_community_container<requested_id>&
-device_group_community_holder<TEMPLATE_DEF_ARG>::get_community() {
-    return const_cast<device_community_container<requested_id>&>(
-        static_cast<const self_t*>(this)->get_community<requested_id>());
-}
-
-template <TEMPLATE_DECL_ARG>
-std::string device_group_community_holder<TEMPLATE_DEF_ARG>::to_string() const {
-    std::stringstream ss;
-    detail::device_community_container_print_helper<group_id> p(ss);
-    ccl_tuple_for_each(typed_communities, p);
-    return ss.str();
-}
-
-template <TEMPLATE_DECL_ARG>
-template <ccl::device_topology_type requested_class_id, class... DeviceTypes>
-void device_group_community_holder<TEMPLATE_DEF_ARG>::bind_device_by_id(
-    const ccl::device_index_type& device_id,
-    ccl::context_comm_addr& registered_addr,
-    device_variant_t<DeviceTypes...>& out_binder,
-    size_t preferred_rank) {
-    device_community_container<requested_class_id>& container =
-        this->template get_community<requested_class_id>();
-    container->bind_device_by_id(device_id, registered_addr, out_binder, preferred_rank);
-}
-
-#undef TEMPLATE_DECL_ARG
-#undef TEMPLATE_DEF_ARG
-} // namespace native
diff --git a/src/common/comm/l0/device_community_holder_utils.hpp b/src/common/comm/l0/device_community_holder_utils.hpp
deleted file mode 100644
index db9f2a007..000000000
--- a/src/common/comm/l0/device_community_holder_utils.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include "common/comm/l0/device_community_holder.hpp"
-
-namespace native {
-
-/**
- *
- * Declarations
- *
- */
-namespace detail {
-/**
- * class for pretty topology printing
- */
-template <ccl::group_split_type group_id>
-struct device_community_container_print_helper {
-    device_community_container_print_helper(std::ostream& out);
-
-    template <ccl::device_topology_type class_id>
-    void operator()(const device_community_container<class_id>& topology_container);
-
-    // 'ring' requires overloading
-    void operator()(
-        const device_community_container<ccl::device_topology_type::ring>& topology_container);
-
-private:
-    std::ostream& output;
-};
-} // namespace detail
-
-/**
- *
- * Definitions
- *
- */
-namespace detail {
-
-/**
- * class for pretty topology printing definition
- */
-
-template <ccl::group_split_type group_id>
-device_community_container_print_helper<group_id>::device_community_container_print_helper(
-    std::ostream& out)
-        : output(out) {}
-
-template <ccl::group_split_type group_id>
-template <ccl::device_topology_type class_id>
-void device_community_container_print_helper<group_id>::operator()(
-    const device_community_container<class_id>& topology_container) {
-    output << ::to_string(class_id) << "\n\t"
-           << topology_container.storage->template to_string<group_id>();
-}
-
-template <ccl::group_split_type group_id>
-void device_community_container_print_helper<group_id>::operator()(
-    const device_community_container<ccl::device_topology_type::ring>& topology_container) {
-    output << ::to_string(ccl::device_topology_type::ring)
-           << "\n\t, closed rings: " << topology_container.closed_rings.size() << std::endl;
-    for (size_t i = 0; i < topology_container.closed_rings.size(); i++) {
-        output << "\t\t" << topology_container.closed_rings[i]->template to_string<group_id>();
-    }
-
-    output << "\n\t, torn-apart rings: " << topology_container.torn_apart_rings.size() << std::endl;
-    for (size_t i = 0; i < topology_container.torn_apart_rings.size(); i++) {
-        output << "\t\t" << topology_container.torn_apart_rings[i]->template to_string<group_id>();
-    }
-}
-} // namespace detail
-} // namespace native
diff --git a/src/common/comm/l0/device_community_utils.hpp b/src/common/comm/l0/device_community_utils.hpp
deleted file mode 100644
index 730ad77a4..000000000
--- a/src/common/comm/l0/device_community_utils.hpp
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-namespace native {
-
-using device_rank_table_t = std::multimap<ccl::device_index_type, size_t /**external rank**/>;
-
-namespace detail {
-
-/**
- *
- */
-template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-struct rank_binder {
-    rank_binder(const ccl::device_index_type& in_device_idx,
-                const native::specific_indexed_device_storage& in_dev_storage,
-                device_rank_table_t& out_registered_ids,
-                size_t preferred_rank = std::numeric_limits<size_t>::max());
-
-    template <class device_t>
-    void operator()(native::device_t_ptr<device_t>& container);
-
-    int get_assigned_rank() const;
-    int get_assigned_size() const;
-
-private:
-    ccl::device_index_type device_id;
-    const native::specific_indexed_device_storage& in_device_storage;
-    device_rank_table_t& registered_device_id;
-    int rank = 0;
-    int size = 0;
-    bool find = false;
-    size_t enumerator = 0;
-    size_t preferred_rank_id;
-};
-
-/**
- *
- */
-
-template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-rank_binder<group_id, class_id>::rank_binder(
-    const ccl::device_index_type& in_device_idx,
-    const native::specific_indexed_device_storage& in_dev_storage,
-    device_rank_table_t& registered_ids,
-    size_t preferred_rank)
-        : device_id(in_device_idx),
-          in_device_storage(in_dev_storage),
-          registered_device_id(registered_ids),
-          preferred_rank_id(preferred_rank) {}
-
-template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-template <class device_t>
-void rank_binder<group_id, class_id>::operator()(native::device_t_ptr<device_t>& comm_device) {
-    if (find) {
-        return;
-    }
-
-    const native::indexed_device_container<device_t>& in_typed_container =
-        ccl_tuple_get<native::indexed_device_container<device_t>>(in_device_storage);
-
-    for (const auto& dev : in_typed_container) {
-        ccl_device& device = dev.second->get_device();
-        const ccl::device_index_type& find_id = device.get_device_path();
-        if (find_id == device_id) {
-            if (enumerator == registered_device_id.count(device_id)) {
-                find = true;
-
-                // set rank for device: automatically or user-preferred
-                if (preferred_rank_id == std::numeric_limits<size_t>::max()) {
-                    // automatically from logical topology
-                    rank = dev.first;
-                }
-                else {
-                    //use user defined rank
-                    rank = preferred_rank_id;
-                }
-                registered_device_id.insert({ device_id, rank });
-
-                //bind device
-                comm_device = dev.second;
-            }
-            enumerator++;
-        }
-
-        if (find) {
-            return;
-        }
-    }
-}
-
-template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-int rank_binder<group_id, class_id>::get_assigned_rank() const {
-    if (!find) {
-        throw std::runtime_error(
-            std::string(__FUNCTION__) +
-            "rank_binder doesn't assign rank for device: " + ccl::to_string(device_id));
-    }
-    return rank;
-}
-} // namespace detail
-} // namespace native
diff --git a/src/common/comm/l0/device_containers.hpp b/src/common/comm/l0/device_containers.hpp
deleted file mode 100644
index bf712866b..000000000
--- a/src/common/comm/l0/device_containers.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <map>
-#include <tuple>
-#include <vector>
-
-#include "oneapi/ccl/native_device_api/export_api.hpp"
-#include "oneapi/ccl/native_device_api/l0/declarations.hpp"
-#include "common/utils/tuple.hpp"
-#include "common/utils/utils.hpp"
-#include "common/comm/l0/device_types.hpp"
-
-namespace native {
-
-template <class device_t>
-using device_container = std::multimap<ccl_device::handle_t, device_t_ptr<device_t>>;
-
-template <class... device_t>
-using device_storage_t = std::tuple<device_container<device_t>...>;
-
-// non-enumeration devices
-template <class device_t>
-struct plain_device_container : std::vector<device_t_ptr<device_t>> {};
-template <class... device_t>
-using plain_device_storage = std::tuple<plain_device_container<device_t>...>;
-
-// enumeration devices
-template <class device_t>
-struct indexed_device_container : std::map<size_t, device_t_ptr<device_t>> {};
-/* TODO no need actually at now
-template<>  //thread safe for concurrent
-struct indexed_device_container<ccl_thread_comm<ccl_gpu_comm>> : std::map<size_t, device_t_ptr<ccl_thread_comm<ccl_gpu_comm>> > {};
-template<>  //thread safe for concurrent
-struct indexed_device_container<ccl_thread_comm<ccl_virtual_gpu_comm>> : std::map<size_t, device_t_ptr<ccl_thread_comm<ccl_virtual_gpu_comm>> > {};
-*/
-template <class... device_t>
-using indexed_device_storage = std::tuple<indexed_device_container<device_t>...>;
-
-template <class... device_t>
-using device_variant_t = std::tuple<device_t_ptr<device_t>...>; //TOSO use std::variant
-
-namespace detail {
-//TODO - use traits
-template <class device_t, class... total_devices_t>
-inline size_t get_size(const native::device_storage_t<total_devices_t...>& gpu_device_storage) {
-    return ccl_tuple_get<native::device_container<device_t>>(gpu_device_storage).size();
-}
-
-template <class device_t, class... total_devices_t>
-inline size_t get_size(const native::plain_device_storage<total_devices_t...>& gpu_device_storage) {
-    return ccl_tuple_get<native::plain_device_container<device_t>>(gpu_device_storage).size();
-}
-
-template <class device_t, class... total_devices_t>
-inline size_t get_size(
-    const native::indexed_device_storage<total_devices_t...>& gpu_device_storage) {
-    return ccl_tuple_get<native::indexed_device_container<device_t>>(gpu_device_storage).size();
-}
-
-template <class Container>
-inline size_t get_aggregated_size(const Container& gpu_device_storage) {
-    return 0;
-}
-
-template <class Container, class DeviceType, class... Types>
-inline size_t get_aggregated_size(const Container& gpu_device_storage) {
-    return get_size<DeviceType>(gpu_device_storage) +
-           get_aggregated_size<Container, Types...>(gpu_device_storage);
-}
-
-} // namespace detail
-} // namespace native
diff --git a/src/common/comm/l0/device_containers_utils.hpp b/src/common/comm/l0/device_containers_utils.hpp
deleted file mode 100644
index 8ee74e83a..000000000
--- a/src/common/comm/l0/device_containers_utils.hpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <sstream>
-
-#include "oneapi/ccl/types.hpp"
-#include "oneapi/ccl/comm_split_attr_ids.hpp"
-#include "common/comm/l0/device_containers.hpp"
-
-namespace native {
-
-namespace detail {
-/*
-struct splice_devices
-{
-    splice_devices(std::shared_ptr<specific_plain_device_storage>& out_process_devices) :
-        total_process_devices(out_process_devices)
-    {
-    }
-
-    template<class device_t>
-    void operator() (plain_device_container<device_t>& in_container)
-    {
-        //naive merge
-        auto& output_container = std::get<device_t::type_idx()>(*total_process_devices);
-        output_container.insert(output_container.end(), in_container.begin(), in_container.end());
-    }
-    std::shared_ptr<specific_plain_device_storage>& total_process_devices;
-};
-*/
-
-template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-struct printer {
-    template <class device_t>
-    void operator()(const native::indexed_device_container<device_t>& container) {
-        for (const auto& dev : container) {
-            device_rank_descr.insert({ dev.first, dev.second->to_string() });
-        }
-    }
-
-    template <class device_t>
-    void operator()(const native::plain_device_container<device_t>& container) {
-        for (const auto& dev : container) {
-            device_rank_descr.insert(
-                { dev->template get_comm_data<group_id, class_id>().rank, dev->to_string() });
-        }
-    }
-
-    template <class device_t>
-    void operator()(const native::device_t_ptr<device_t>& dev) {
-        if (dev) {
-            device_rank_descr.insert(
-                { dev->template get_comm_data<group_id, class_id>().rank, dev->to_string() });
-        }
-    }
-    /*
-    template<class device_t>
-    void operator() (const native::indexed_device_container<native::ccl_thread_comm<device_t>>& container)
-    {
-        const auto& impl = container.get_read();
-        const auto& inner_map = std::get<0>(impl).get();
-        for(const auto& dev : inner_map)
-        {
-            device_rank_descr.insert({dev.first, dev.second->to_string()});
-        }
-    }
-
-    void operator() (const native::indexed_device_container<native::ccl_thread_comm<native::ccl_virtual_gpu_comm>>& container)
-    {
-        auto ret = container.locker();
-        const auto& impl = container.impl(ret);
-        for(const auto& dev : impl)
-        {
-            device_rank_descr.insert({dev.first, dev.second->to_string()});
-        }
-    }*/
-
-    std::string to_string() const {
-        std::stringstream ss;
-        for (auto val : device_rank_descr) {
-            ss << "idx: " << val.first << "\n" << val.second << std::endl;
-        }
-        return ss.str();
-    }
-    std::map<size_t, std::string> device_rank_descr;
-};
-} // namespace detail
-
-} // namespace native
diff --git a/src/common/comm/l0/device_group_routing_schema.hpp b/src/common/comm/l0/device_group_routing_schema.hpp
deleted file mode 100644
index ddf036c5b..000000000
--- a/src/common/comm/l0/device_group_routing_schema.hpp
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <cassert>
-#include <memory>
-#include <sstream>
-#include "oneapi/ccl/types.hpp"
-#include "common/utils/enums.hpp"
-#include "common/utils/tuple.hpp"
-#include "supported_topologies.hpp"
-
-template <ccl::group_split_type schema_id, ccl::device_topology_type class_id>
-struct topology_addr {
-    using comm_value_t = size_t;
-    /*using type_idx_t = typename std::underlying_type<ccl::group_split_type>::type;
-    using type_idx_t = typename std::underlying_type<ccl::device_topology_type>::type;
-    static constexpr type_idx_t type_idx()
-    {
-        return static_cast<type_idx_t>(schema_id);
-    }*/
-
-    topology_addr(comm_value_t new_rank, comm_value_t new_size) : rank(new_rank), size(new_size) {}
-
-    std::string to_string() const {
-        std::stringstream ss;
-        ss << ::to_string(class_id) << ": " << rank << "/" << size;
-        return ss.str();
-    }
-
-    comm_value_t rank;
-    comm_value_t size;
-};
-
-template <ccl::group_split_type schema_id, ccl::device_topology_type class_id>
-using topology_addr_ptr = std::unique_ptr<topology_addr<schema_id, class_id>>;
-
-template <ccl::group_split_type group_id, ccl::device_topology_type... class_ids>
-using topology_addr_pointers_tuple_t = std::tuple<topology_addr_ptr<group_id, class_ids>...>;
-
-namespace detail {
-struct topology_printer {
-    template <ccl::group_split_type type, ccl::device_topology_type... class_ids>
-    void operator()(const topology_addr_pointers_tuple_t<type, class_ids...>& topology) {
-        detail::topology_printer p;
-        ccl_tuple_for_each(topology, p);
-        result << ::to_string(type) << "\n\t{ ";
-        result << p.result.str() << " }";
-        result << std::endl;
-    }
-
-    template <ccl::group_split_type type, ccl::device_topology_type class_id>
-    void operator()(const topology_addr_ptr<type, class_id>& topology) {
-        if (topology) {
-            result << topology->to_string();
-        }
-        else {
-            result << to_string(class_id) << ": EMPTY";
-        }
-        result << ", ";
-    }
-
-    std::stringstream result;
-};
-} // namespace detail
-
-struct aggregated_topology_addr {
-    template <ccl::group_split_type schema_id,
-              ccl::device_topology_type class_id,
-              class... SchemaArgs>
-    bool insert(SchemaArgs&&... args) {
-        if (std::get<utils::enum_to_underlying(class_id)>(
-                std::get<utils::enum_to_underlying(schema_id)>(web))) {
-            assert(false && "Topology is registered already");
-            return false;
-        }
-        auto& schema_ptr = std::get<utils::enum_to_underlying(class_id)>(
-            std::get<utils::enum_to_underlying(schema_id)>(web));
-        schema_ptr.reset(new topology_addr<schema_id, class_id>(std::forward<SchemaArgs>(args)...));
-        return true;
-    }
-
-    template <ccl::group_split_type schema_id, ccl::device_topology_type class_id>
-    const topology_addr<schema_id, class_id>& get() const {
-        const auto& schema_ptr = std::get<utils::enum_to_underlying(class_id)>(
-            std::get<utils::enum_to_underlying(schema_id)>(web));
-        if (!schema_ptr) {
-            assert(false && "Topology is not registered");
-            throw std::runtime_error("Invalid communication topology");
-        }
-        return *schema_ptr;
-    }
-
-    template <ccl::group_split_type schema_id, ccl::device_topology_type class_id>
-    std::string to_string() const {
-        detail::topology_printer p;
-        p(std::get<utils::enum_to_underlying(schema_id)>(web));
-        return p.result.str();
-    }
-
-    template <ccl::group_split_type schema_id, ccl::device_topology_type class_id>
-    bool is_registered() const {
-        return std::get<utils::enum_to_underlying(class_id)>(
-            std::get<utils::enum_to_underlying(schema_id)>(web));
-    }
-
-    std::string to_string() const {
-        detail::topology_printer p;
-        ccl_tuple_for_each(web, p);
-        return p.result.str();
-    }
-
-    template <ccl::group_split_type... types>
-    using topology_addr_storage_t =
-        std::tuple<topology_addr_pointers_tuple_t<types, SUPPORTED_TOPOLOGY_CLASSES_DECL_LIST>...>;
-
-    using aggregated_topology_addr_storage_t =
-        topology_addr_storage_t<SUPPORTED_HW_TOPOLOGIES_DECL_LIST>;
-
-    aggregated_topology_addr_storage_t web;
-};
diff --git a/src/common/comm/l0/device_types.hpp b/src/common/comm/l0/device_types.hpp
deleted file mode 100644
index 6da85d8b4..000000000
--- a/src/common/comm/l0/device_types.hpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <memory>
-#include <string>
-#include <type_traits>
-#include "common/utils/enums.hpp"
-
-namespace native {
-enum class gpu_types : size_t {
-    REAL_GPU,
-    VIRTUAL_GPU,
-    CONCURRENT_GPU,
-    CONCURRENT_REAL_GPU = CONCURRENT_GPU,
-    CONCURRENT_VIRT_GPU,
-
-    IPC_GPU,
-    IPC_SOURCE_REAL_GPU = IPC_GPU,
-    IPC_SOURCE_VIRT_GPU,
-    IPC_DESTINATION_GPU,
-
-    SCALING_PROXY_GPU_TYPES,
-    NUMA_PROXY_GPU_TYPES = SCALING_PROXY_GPU_TYPES,
-
-    NUMA_PROXY_REAL_GPU = NUMA_PROXY_GPU_TYPES,
-    NUMA_PROXY_VIRTUAL_GPU,
-
-    SCALE_UP_GPU_TYPES,
-    SCALE_UP_REAL_GPU = SCALE_UP_GPU_TYPES,
-    SCALE_UP_VIRTUAL_GPU,
-
-    MIX_SCALE_UP_NUMA_TYPES,
-    MIX_SCALE_UP_NUMA_REAL = MIX_SCALE_UP_NUMA_TYPES,
-    MIX_SCALE_UP_NUMA_VIRTUAL,
-
-    SCALE_OUT_GPU_TYPES,
-    SCALE_OUT_REAL_GPU = SCALE_OUT_GPU_TYPES,
-    SCALE_OUT_VIRTUAL_GPU,
-
-    MIX_SCALE_OUT_NUMA_TYPES,
-    MIX_SCALE_OUT_NUMA_REAL = MIX_SCALE_OUT_NUMA_TYPES,
-    MIX_SCALE_OUT_NUMA_VIRTUAL,
-
-    MIX_SCALE_OUT_SCALE_UP_TYPES,
-    MIX_SCALE_OUT_SCALE_UP_REAL = MIX_SCALE_OUT_SCALE_UP_TYPES,
-    MIX_SCALE_OUT_SCALE_UP_VIRTUAL,
-
-    MIX_UNIVERSAL_TYPES,
-    MIX_UNIVERSAL_REAL = MIX_UNIVERSAL_TYPES,
-    MIX_UNIVERSAL_VIRTUAL,
-
-    MAX_TYPE
-};
-
-using gpu_type_names =
-    ::utils::enum_to_str<static_cast<typename std::underlying_type<gpu_types>::type>(
-        gpu_types::MAX_TYPE)>;
-inline std::string to_string(gpu_types type) {
-    return gpu_type_names({ "REAL_GPU",
-                            "VIRTUAL_GPU",
-                            "CONCURRENT_REAL_GPU",
-                            "CONCURRENT_VIRT_GPU",
-                            "SOURCE_IPC_REAL_GPU",
-                            "SOURCE_IPC_VIRT_GPU",
-                            "DESTINATION_IPC_GPU",
-                            "NUMA_REAL_PROXY",
-                            "NUMA_VIRT_PROXY",
-                            "SUP_REAL_PROXY",
-                            "SUP_VIRT_PROXY",
-                            "MIX_SUP_NUMA_REAL",
-                            "MIX_SUP_NUMA_VIRTUAL",
-                            "SOUT_REAL_PROXY",
-                            "SOUT_VIRT_PROXY",
-                            "MIX_SOUT_NUMA_REAL",
-                            "MIX_SIUT_NUMA_VIRT",
-                            "MIX_SOUT_SUP_REAL",
-                            "MIX_SOUT_SUP_VIRT",
-                            "MIX_UNIVERSAL_REAL",
-                            "MIX_UNIVERSAL_VIRT" })
-        .choose(type, "INVALID_VALUE");
-}
-
-constexpr inline gpu_types operator+(gpu_types a,
-                                     typename std::underlying_type<gpu_types>::type b) {
-    return static_cast<gpu_types>(static_cast<typename std::underlying_type<gpu_types>::type>(a) +
-                                  static_cast<typename std::underlying_type<gpu_types>::type>(b));
-}
-
-// devices
-template <class device_t>
-using device_t_ptr = std::shared_ptr<device_t>;
-} // namespace native
diff --git a/src/common/comm/l0/device_types_fwd.hpp b/src/common/comm/l0/device_types_fwd.hpp
deleted file mode 100644
index a6c1a1896..000000000
--- a/src/common/comm/l0/device_types_fwd.hpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/device_types.hpp"
-
-namespace native {
-
-class ccl_gpu_comm;
-class ccl_ipc_gpu_comm;
-class ccl_virtual_gpu_comm;
-template <class device_t>
-class ccl_thread_comm;
-template <class device_t>
-class ccl_ipc_source_gpu_comm;
-template <class device_t>
-class ccl_numa_proxy;
-template <class device_t>
-class ccl_gpu_scaleup_proxy;
-template <class device_t>
-class ccl_scaleout_proxy;
-
-#define SUPPORTED_DEVICES_DECL_LIST \
-    ccl_gpu_comm, ccl_virtual_gpu_comm, ccl_thread_comm<ccl_gpu_comm>, \
-        ccl_thread_comm<ccl_virtual_gpu_comm>, ccl_ipc_source_gpu_comm<ccl_gpu_comm>, \
-        ccl_ipc_source_gpu_comm<ccl_virtual_gpu_comm>, ccl_ipc_gpu_comm, \
-        ccl_numa_proxy<ccl_gpu_comm>, ccl_numa_proxy<ccl_virtual_gpu_comm>, \
-        ccl_gpu_scaleup_proxy<ccl_gpu_comm>, ccl_gpu_scaleup_proxy<ccl_virtual_gpu_comm>, \
-        ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_gpu_comm>>, \
-        ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_virtual_gpu_comm>>, \
-        ccl_scaleout_proxy<ccl_gpu_comm>, ccl_scaleout_proxy<ccl_virtual_gpu_comm>, \
-        ccl_scaleout_proxy<ccl_numa_proxy<ccl_gpu_comm>>, \
-        ccl_scaleout_proxy<ccl_numa_proxy<ccl_virtual_gpu_comm>>, \
-        ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_gpu_comm>>, \
-        ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_virtual_gpu_comm>>, \
-        ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_gpu_comm>>>, \
-        ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_virtual_gpu_comm>>>
-
-} // namespace native
diff --git a/src/common/comm/l0/devices/ccl_concurrent_gpu_comm.hpp b/src/common/comm/l0/devices/ccl_concurrent_gpu_comm.hpp
deleted file mode 100644
index d00a5dde4..000000000
--- a/src/common/comm/l0/devices/ccl_concurrent_gpu_comm.hpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <map>
-#include <memory>
-#include <list>
-#include <set>
-#include <vector>
-
-#include "common/comm/l0/devices/ccl_gpu_base_comm.hpp"
-
-namespace native {
-
-//Adapter for different thread devices
-template <class device_t>
-class ccl_thread_comm : public ccl_gpu_base_comm<ccl_thread_comm<device_t>,
-                                                 gpu_types::CONCURRENT_GPU + device_t::type_idx()> {
-public:
-    using base = ccl_gpu_base_comm<ccl_thread_comm<device_t>,
-                                   gpu_types::CONCURRENT_GPU + device_t::type_idx()>;
-    using typename base::comm_rank_t;
-    using impl_t = device_t;
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using gpu_module_t =
-        typename device_t::template gpu_module_t<algo_type, group, mode>; //same as in-process GPU
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class;
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t;
-
-    static constexpr const char* name_impl() {
-        return "CONCURRENT_GPU";
-    }
-
-    ccl_thread_comm(ccl_device& assigned_device,
-                    typename base::comm_rank_t idx,
-                    device_t& next_thread_device)
-            : base(assigned_device, idx),
-              next_thread_gpu_comm(next_thread_device) {}
-
-    ~ccl_thread_comm() = default;
-
-    std::string to_string_impl() const {
-        std::string ret(name_impl());
-        ret = ret + "(" + next_thread_gpu_comm.to_string_impl() + ")";
-        return ret;
-    }
-
-    template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-    topology_addr<group_id, class_id> get_comm_data() const {
-        return next_thread_gpu_comm.template get_comm_data<group_id, class_id>();
-    }
-
-    template <ccl_coll_type module_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
-        return next_thread_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>(
-            params);
-    }
-
-    device_t& get_impl_device() {
-        return next_thread_gpu_comm;
-    }
-
-private:
-    device_t& next_thread_gpu_comm;
-};
-} // namespace native
diff --git a/src/common/comm/l0/devices/ccl_gpu_base_comm.cpp b/src/common/comm/l0/devices/ccl_gpu_base_comm.cpp
deleted file mode 100644
index 4d1d70a00..000000000
--- a/src/common/comm/l0/devices/ccl_gpu_base_comm.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "common/comm/l0/devices/ccl_gpu_base_comm.hpp"
-
-namespace native {
-
-ze_command_list_handle_t cmd_list_proxy_base::get() {
-    return cmd_list.get();
-}
-
-ze_command_list_handle_t* cmd_list_proxy_base::get_ptr() {
-    return cmd_list.get_ptr();
-}
-
-// TODO: try to move these level zero calls to a common platform code(e.g. device.cpp file)
-void cmd_list_proxy_base::append_kernel(ze_kernel_handle_t handle, ze_group_count_t* launch_args) {
-    auto res = zeCommandListAppendLaunchKernel(get(), handle, launch_args, nullptr, 0, nullptr);
-    if (res != ZE_RESULT_SUCCESS) {
-        LOG_ERROR("zeCommandListAppendLaunchKernel failed, error: ", native::to_string(res));
-        throw std::runtime_error("zeCommandListAppendLaunchKernel failed");
-    }
-}
-
-void cmd_list_proxy_base::close_and_execute(std::shared_ptr<ccl_context> ctx,
-                                            ze_fence_handle_t fence) {
-    auto res = zeCommandListClose(get());
-    if (res != ZE_RESULT_SUCCESS) {
-        LOG_ERROR("zeCommandListClose failed, error: ", native::to_string(res));
-        throw std::runtime_error("zeCommandListClose failed");
-    }
-
-    auto& cmd_queue = device.get_cmd_queue(ccl_device::get_default_queue_desc(), ctx);
-    LOG_DEBUG(
-        "Execute list:", cmd_list.get(), ", queue: ", cmd_queue.get(), ", go to submit entry");
-    res = zeCommandQueueExecuteCommandLists(cmd_queue.get(), 1, get_ptr(), fence);
-    if (res != ZE_RESULT_SUCCESS) {
-        throw ccl::exception(std::string("cannot execute command list, error: ") +
-                             std::to_string(res));
-    }
-}
-
-void cmd_list_proxy_base::reset() {
-    auto res = zeCommandListReset(get());
-    if (res != ZE_RESULT_SUCCESS) {
-        LOG_ERROR("zeCommandListReset failed, error: ", native::to_string(res));
-        throw std::runtime_error("zeCommandListReset failed");
-    }
-}
-
-ze_fence_handle_t fence_proxy_base::get() const {
-    return fence.get();
-}
-
-ze_result_t fence_proxy_base::query_status() const {
-    auto res = zeFenceQueryStatus(get());
-    // TODO: Should we return some other codes?
-    if (res != ZE_RESULT_SUCCESS && res != ZE_RESULT_NOT_READY) {
-        LOG_ERROR("zeFenceQueryStatus failed, error: ", native::to_string(res));
-        throw std::runtime_error("zeFenceQueryStatus failed");
-    }
-
-    return res;
-}
-
-void fence_proxy_base::reset() {
-    auto res = zeFenceReset(get());
-    if (res != ZE_RESULT_SUCCESS) {
-        LOG_ERROR("zeFenceReset failed, error: ", native::to_string(res));
-        throw std::runtime_error("zeFenceReset failed");
-    }
-}
-
-} // namespace native
diff --git a/src/common/comm/l0/devices/ccl_gpu_base_comm.hpp b/src/common/comm/l0/devices/ccl_gpu_base_comm.hpp
deleted file mode 100644
index 2ccaeadf6..000000000
--- a/src/common/comm/l0/devices/ccl_gpu_base_comm.hpp
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include <map>
-#include <memory>
-#include <list>
-#include <set>
-#include <vector>
-
-#include "coll/algorithms/algorithms_enum.hpp"
-
-#include "common/comm/l0/device_group_routing_schema.hpp"
-#include "common/comm/l0/gpu_device_types.hpp"
-
-#include "common/comm/l0/modules/ring/allgatherv_entry_module.hpp"
-#include "common/comm/l0/modules/ring/allreduce_entry_module.hpp"
-#include "common/comm/l0/modules/ring/alltoallv_entry_module.hpp"
-#include "common/comm/l0/modules/ring/bcast_entry_module.hpp"
-#include "common/comm/l0/modules/ring/reduce_entry_module.hpp"
-#include "common/comm/l0/modules/ring/reduce_scatter_entry_module.hpp"
-
-#include "common/comm/l0/modules/a2a/allreduce_module.hpp"
-#include "common/comm/l0/modules/supported_modules.hpp"
-
-#include "common/comm/l0/modules/modules_source_data.hpp"
-#include "common/comm/l0/gpu_comm_utils.hpp"
-
-namespace native {
-
-// Proxy classes to encapsulate command list and fence handles and provide a simplified interface on
-// top of them.
-// Here we have only "base" classes with a basic functionality(e.g. no thread-safety, just raw L0 calls)
-// and they're returned by methods of ccl_gpu_base_comm while its childs can override
-// and return the extended proxy objects with the same interface(see ccl_gpu_base_comm for example)
-class cmd_list_proxy_base {
-protected:
-    ccl_device& device;
-    ccl_device::device_cmd_list& cmd_list;
-
-public:
-    cmd_list_proxy_base(ccl_device& device, ccl_device::device_cmd_list& cmd_list)
-            : device{ device },
-              cmd_list{ cmd_list } {}
-
-    cmd_list_proxy_base(const cmd_list_proxy_base& other)
-            : device{ other.device },
-              cmd_list{ other.cmd_list } {}
-
-    cmd_list_proxy_base& operator=(const cmd_list_proxy_base& other) = delete;
-
-    ze_command_list_handle_t get();
-    ze_command_list_handle_t* get_ptr();
-
-    void append_kernel(ze_kernel_handle_t handle, ze_group_count_t* launch_args);
-    void close_and_execute(std::shared_ptr<ccl_context> ctx, ze_fence_handle_t fence);
-    void reset();
-};
-
-class fence_proxy_base {
-protected:
-    ccl_device& device;
-    ccl_device::device_queue_fence& fence;
-
-public:
-    fence_proxy_base(ccl_device& device, ccl_device::device_queue_fence& fence)
-            : device{ device },
-              fence{ fence } {}
-
-    fence_proxy_base(const fence_proxy_base& other)
-            : device{ other.device },
-              fence{ other.fence } {}
-
-    fence_proxy_base& operator=(const fence_proxy_base& other) = delete;
-
-    ze_fence_handle_t get() const;
-
-    ze_result_t query_status() const;
-    void reset();
-};
-
-template <class gpu_impl, gpu_types type>
-class ccl_gpu_base_comm {
-public:
-    using comm_rank_t = int;
-    using type_idx_t = typename std::underlying_type<gpu_types>::type;
-    ccl_gpu_base_comm(ccl_device& assigned_device, comm_rank_t idx)
-            : index_in_group(idx),
-              device(assigned_device) {}
-
-    ~ccl_gpu_base_comm() = default;
-
-    gpu_impl* get_this() {
-        return static_cast<gpu_impl*>(this);
-    }
-
-    const gpu_impl* get_this() const {
-        return static_cast<const gpu_impl*>(this);
-    }
-
-    static constexpr const char* name() {
-        return gpu_impl::name_impl();
-    }
-
-    std::string to_string() const {
-        return get_this()->to_string_impl();
-    }
-
-    static constexpr type_idx_t type_idx() {
-        return static_cast<type_idx_t>(type);
-    }
-
-    ccl_device& get_device() {
-        return device;
-    }
-
-    [[deprecated]] comm_rank_t get_index_in_group() const {
-        return index_in_group;
-    }
-
-    template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-    bool reset_rank(comm_rank_t new_rank, comm_rank_t new_size) {
-        rank = new_rank;
-        size = new_size;
-        return device_routing_web.insert<group_id, class_id>(new_rank,
-                                                             new_size); //consider inheritance
-    }
-
-    template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-    const topology_addr<group_id, class_id>& get_comm_data() const {
-        return device_routing_web.get<group_id, class_id>();
-    }
-
-    template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-    bool is_registered() const {
-        return device_routing_web.is_registered<group_id, class_id>();
-    }
-
-    template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-    std::string comm_to_str() const {
-        return device_routing_web.to_string<group_id, class_id>();
-    }
-
-    std::string comm_to_str() const {
-        return device_routing_web.to_string();
-    }
-
-    template <ccl_coll_type module_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              template <ccl_coll_type, ccl::group_split_type, ccl::device_topology_type>
-              class module_impl>
-    static std::shared_ptr<module_impl<module_type, group_id, class_id>>& get_gpu_module_unsafe(
-        supported_device_modules<module_impl>& modules) {
-        return std::get<::utils::enum_to_underlying(class_id)>(
-            std::get<::utils::enum_to_underlying(group_id)>(std::get<module_type>(modules)));
-    }
-
-    cmd_list_proxy_base get_cmd_list(
-        std::shared_ptr<ccl_context> ctx,
-        const ze_command_list_desc_t& properties = ccl_device::get_default_list_desc()) {
-        auto& cmd_list = device.get_cmd_list(ctx, properties);
-        return cmd_list_proxy_base(device, cmd_list);
-    }
-
-    fence_proxy_base get_fence(const ccl_device::device_queue& cmd_queue,
-                               std::shared_ptr<ccl_context> ctx) {
-        auto& fence = device.get_fence(cmd_queue, ctx);
-        return fence_proxy_base(device, fence);
-    }
-
-protected:
-    size_t index_in_group;
-
-    aggregated_topology_addr device_routing_web;
-    ccl_device& device;
-
-    mutable int rank; //TODO
-    mutable int size; //TODO
-};
-
-} // namespace native
diff --git a/src/common/comm/l0/devices/ccl_gpu_comm.cpp b/src/common/comm/l0/devices/ccl_gpu_comm.cpp
deleted file mode 100644
index 9b8e94c46..000000000
--- a/src/common/comm/l0/devices/ccl_gpu_comm.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include <iostream>
-#include <vector>
-#include <set>
-#include "common/comm/l0/devices/ccl_gpu_comm.hpp"
-#include "sched/sched.hpp"
-// #include "sched/entry/l0/l0_entry.hpp"
-#include "common/comm/l0/modules/specific_modules_source_data.hpp"
-
-namespace native {
-
-void cmd_list_proxy::append_kernel(ze_kernel_handle_t handle, ze_group_count_t* launch_args) {
-    std::lock_guard<std::mutex> lg(comm.cmd_list_mutex);
-    base::append_kernel(handle, launch_args);
-}
-
-bool cmd_list_proxy::close_and_execute(std::shared_ptr<ccl_context> ctx, ze_fence_handle_t fence) {
-    auto& ref = comm.cmd_list_close_ref_count;
-
-    if (--ref == 0) {
-        LOG_DEBUG("Closing list and executing on the queue");
-        // Technically close operation requires a synchronization, but due to ref count semantic we
-        // do it only once even if multiple threads are executing.
-        base::close_and_execute(ctx, fence);
-        ref = get_init_count();
-        return true;
-    }
-    else {
-        LOG_DEBUG("Skip list close, ref count: ", ref);
-        return false;
-    }
-}
-
-void cmd_list_proxy::reset() {
-    auto& ref = comm.cmd_list_reset_ref_count;
-    if (--ref == 0) {
-        base::reset();
-        // Once we reset the list, set the initial value as a ref counter
-        // so it can be re-used later
-        ref = get_init_count();
-    }
-}
-
-int cmd_list_proxy::get_init_count() const {
-    return static_cast<int>(comm.registered_virtual_gpu_count + 1);
-}
-
-void fence_proxy::reset() {
-    auto& ref = comm.fence_reset_ref_count;
-
-    if (--ref == 0) {
-        base::reset();
-        ref = get_init_count();
-    }
-}
-
-int fence_proxy::get_init_count() const {
-    return static_cast<int>(comm.registered_virtual_gpu_count + 1);
-}
-
-ccl_gpu_comm::ccl_gpu_comm(ccl_device& assigned_device, comm_rank_t idx)
-        : base(assigned_device, idx),
-          // Count this "real" device
-          cmd_list_reset_ref_count{ 1 },
-          cmd_list_close_ref_count{ 1 },
-          fence_reset_ref_count{ 1 } {
-    auto queue_prop = ccl_device::get_default_queue_desc();
-    queue_prop.ordinal = 0;
-    std::shared_ptr<ccl_context> ctx;
-    (void)device.get_cmd_queue(queue_prop, ctx); //default -for execution
-
-    //compile and load modules from all sources
-    load_modules(specific_modules_source_data_storage::instance());
-}
-
-std::string ccl_gpu_comm::to_string_impl() const {
-    std::string ret(name());
-    ret = ret + ", comm:\n" + comm_to_str() +
-          ", virtual count: " + std::to_string(get_virtual_gpu_count());
-    return ret;
-}
-
-void ccl_gpu_comm::register_virtual_gpu(ccl_virtual_gpu_comm* gpu) {
-    registered_virtual_gpu_count++;
-    // Increment ref counters each time we register a virtual device, them must be equal to the total number of
-    // virtual devices + 1 for real one
-    cmd_list_reset_ref_count++;
-    fence_reset_ref_count++;
-    cmd_list_close_ref_count++;
-}
-
-std::tuple<bool, ze_module_handle_t, std::string> ccl_gpu_comm::create_module_handle(
-    const ze_module_desc_t& descr,
-    size_t hash) {
-    std::tuple<bool, ze_module_handle_t, std::string> ret{ true, nullptr, "" };
-    std::shared_ptr<ccl_context> ctx;
-
-    native::ccl_device::device_module_ptr mod;
-    try {
-        mod = device.create_module(descr, hash, ctx);
-        std::get<1>(ret) = mod->get();
-    }
-    catch (const std::exception& ex) {
-        std::get<0>(ret) = false;
-        std::get<2>(ret) = ex.what();
-    }
-
-    return ret;
-}
-
-cmd_list_proxy ccl_gpu_comm::get_cmd_list(std::shared_ptr<ccl_context> ctx,
-                                          const ze_command_list_desc_t& properties) {
-    auto& cmd_list = device.get_cmd_list(ctx, properties);
-    // TODO: add dynamic dispatch in case we don't have any registered virtual devices?
-    return cmd_list_proxy(device, cmd_list, *this);
-}
-
-fence_proxy ccl_gpu_comm::get_fence(const ccl_device::device_queue& cmd_queue,
-                                    std::shared_ptr<ccl_context> ctx) {
-    auto& fence = device.get_fence(cmd_queue, ctx);
-    return fence_proxy(device, fence, *this);
-}
-
-} // namespace native
diff --git a/src/common/comm/l0/devices/ccl_gpu_comm.hpp b/src/common/comm/l0/devices/ccl_gpu_comm.hpp
deleted file mode 100644
index e201db630..000000000
--- a/src/common/comm/l0/devices/ccl_gpu_comm.hpp
+++ /dev/null
@@ -1,229 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include "common/comm/l0/devices/ccl_gpu_base_comm.hpp"
-
-namespace native {
-
-// Proxy classes to encapsulate command list and fence handles. The extend the base classes to adding
-// thread-safety and refernece-counting semantic for L0 calls. The main purpose of this is to enable
-// multiple device emulation which is done by using "real" and "virtual" devices while using the same
-// underlying command list and command queue for the same hardware device
-// (this is sort of a workaround for a L0 limitation since submitting kernels into separate multiple
-// queues on the same device doesn't provide forward progress guarantee required by our kernels)
-// So reference-counting semantic allows to provide a simple interface and enforce the correct
-// order of L0 calls(i.e. some calls should be called only once while others - multiple times per logical device)
-//
-// They require an external state stored in ccl_gpu_comm(mutex and several atomic variables for reference
-// counting). The initial value of these ref counts is equal to the number of attached "virtual" devices + 1 for
-// "real" one(e.g. every time we register a "virtual" device the value is incremented by 1). On each call(e.g.
-// reset) we decrement the value and once it's 0, execute it and them restore to the same initial value, so
-// the state can be reused for further collective launches.
-class cmd_list_proxy : public cmd_list_proxy_base {
-private:
-    using base = cmd_list_proxy_base;
-    ccl_gpu_comm& comm;
-
-public:
-    cmd_list_proxy(ccl_device& device, ccl_device::device_cmd_list& cmd_list, ccl_gpu_comm& comm)
-            : base(device, cmd_list),
-              comm{ comm } {}
-
-    cmd_list_proxy(const cmd_list_proxy& other)
-            : base(other.device, other.cmd_list),
-              comm{ other.comm } {}
-
-    cmd_list_proxy& operator=(const cmd_list_proxy& other) = delete;
-
-    void append_kernel(ze_kernel_handle_t handle, ze_group_count_t* launch_args);
-    bool close_and_execute(std::shared_ptr<ccl_context> ctx, ze_fence_handle_t fence);
-    void reset();
-
-private:
-    int get_init_count() const;
-};
-
-class fence_proxy : public fence_proxy_base {
-private:
-    using base = fence_proxy_base;
-
-    ccl_gpu_comm& comm;
-
-public:
-    fence_proxy(ccl_device& device, ccl_device::device_queue_fence& fence, ccl_gpu_comm& comm)
-            : base{ device, fence },
-              comm{ comm } {}
-
-    fence_proxy(const fence_proxy& other) : base{ other.device, other.fence }, comm{ other.comm } {}
-    fence_proxy& operator=(const fence_proxy& other) = delete;
-
-    void reset();
-
-private:
-    int get_init_count() const;
-};
-
-class ccl_virtual_gpu_comm;
-class ccl_gpu_comm : public ccl_gpu_base_comm<ccl_gpu_comm, gpu_types::REAL_GPU>,
-                     public module_loader<ccl_gpu_comm> {
-public:
-    using base = ccl_gpu_base_comm<ccl_gpu_comm, gpu_types::REAL_GPU>;
-    using base::comm_rank_t;
-    using impl_t = ccl_gpu_comm;
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using gpu_module_t = device_coll_module<algo_type, group, mode>;
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class;
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t;
-
-    using supported_modules = supported_device_modules<gpu_module_t>;
-
-    static constexpr const char* name_impl() {
-        return "REAL_GPU";
-    }
-
-    ccl_gpu_comm(ccl_device& assigned_device, comm_rank_t group_rank_idx);
-    ~ccl_gpu_comm() = default;
-
-    template <ccl_coll_type module_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    gpu_module_t<module_type, group_id, class_id>& get_gpu_module() {
-        auto& ptr =
-            base::template get_gpu_module_unsafe<module_type, group_id, class_id, gpu_module_t>(
-                registered_modules);
-        assert(ptr);
-        return *ptr;
-    }
-
-    template <ccl_coll_type module_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    std::shared_ptr<gpu_module_t<module_type, group_id, class_id>> get_gpu_module_ptr() {
-        return base::template get_gpu_module_unsafe<module_type, group_id, class_id, gpu_module_t>(
-            registered_modules);
-    }
-
-    std::string to_string_impl() const;
-
-    // template <ccl_coll_type module_type,
-    //           ccl::group_split_type group_id,
-    //           ccl::device_topology_type class_id,
-    //           class kernel_params>
-    // gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel() {
-    //     auto& ptr = get_gpu_module<module_type, group_id, class_id>();
-
-    //     using requested_class = kernel_class_t<module_type, group_id, class_id>;
-    //     return ptr.template get_class<requested_class>().template get<kernel_params>();
-    // }
-
-    template <ccl_coll_type module_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
-        auto& ptr = get_gpu_module<module_type, group_id, class_id>();
-
-        using requested_class = kernel_class_t<module_type, group_id, class_id>;
-        return ptr.template get_class<requested_class>().get(params);
-    }
-
-    template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class gpu_entry>
-    gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) {
-        const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
-
-        LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string());
-        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params());
-        main_func.set_rank(comm_addr.rank);
-        main_func.set_size(comm_addr.size); //threads count!!!
-        return main_func;
-    }
-
-    template <ccl_coll_type module_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    std::string create_module_impl(const ze_module_desc_t& module_data) {
-        bool ret;
-        ze_module_handle_t handle;
-        std::string descr;
-
-        size_t module_hash_val = module_hash(module_type, group_id, class_id);
-        LOG_DEBUG("Module hash for \"",
-                  ccl_coll_type_to_str(module_type),
-                  "\", \"",
-                  ::to_string(group_id),
-                  "\", \"",
-                  ::to_string(class_id),
-                  "\", is: ",
-                  module_hash_val);
-        std::tie(ret, handle, descr) = create_module_handle(module_data, module_hash_val);
-        if (!ret) {
-            std::string err_str;
-            {
-                std::stringstream str;
-                ccl_logger::format(str,
-                                   "Cannot create module for:",
-                                   name_impl(),
-                                   " on device: ",
-                                   device.get_device_properties().deviceId,
-                                   ", error: ",
-                                   descr);
-                err_str = str.str();
-            }
-            LOG_ERROR(err_str);
-            throw ccl::exception(err_str);
-        }
-        std::get<::utils::enum_to_underlying(class_id)>(
-            std::get<::utils::enum_to_underlying(group_id)>(
-                std::get<module_type>(registered_modules)))
-            .reset(new gpu_module_t<module_type, group_id, class_id>(handle));
-        return descr;
-    }
-
-    void register_virtual_gpu(ccl_virtual_gpu_comm* gpu);
-    size_t get_virtual_gpu_count() const {
-        return registered_virtual_gpu_count;
-    }
-
-    cmd_list_proxy get_cmd_list(std::shared_ptr<ccl_context> ctx,
-                                const ze_command_list_desc_t& properties);
-
-    fence_proxy get_fence(const ccl_device::device_queue& cmd_queue,
-                          std::shared_ptr<ccl_context> ctx);
-
-    friend class cmd_list_proxy;
-    friend class fence_proxy;
-
-protected:
-    supported_modules registered_modules;
-    size_t registered_virtual_gpu_count = 0;
-
-    std::atomic<int> cmd_list_reset_ref_count;
-    std::atomic<int> cmd_list_close_ref_count;
-    std::atomic<int> fence_reset_ref_count;
-    std::mutex cmd_list_mutex;
-
-private:
-    std::tuple<bool, ze_module_handle_t, std::string> create_module_handle(
-        const ze_module_desc_t& descr,
-        size_t hash);
-};
-
-} // namespace native
diff --git a/src/common/comm/l0/devices/ccl_gpu_scaleup_proxy.hpp b/src/common/comm/l0/devices/ccl_gpu_scaleup_proxy.hpp
deleted file mode 100644
index 10fd7b51f..000000000
--- a/src/common/comm/l0/devices/ccl_gpu_scaleup_proxy.hpp
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <map>
-#include <memory>
-#include <list>
-#include <set>
-#include <vector>
-
-#include "common/comm/l0/devices/ccl_gpu_base_comm.hpp"
-#include "common/comm/l0/devices/proxy_observer_types.hpp"
-
-namespace native {
-
-// Base scale-up adapter for different thread devices
-template <class device_t>
-class ccl_gpu_scaleup_proxy
-        : public ccl_gpu_base_comm<ccl_gpu_scaleup_proxy<device_t>,
-                                   gpu_types::SCALE_UP_GPU_TYPES + device_t::type_idx()>,
-          public proxy_observer_specific<ccl_gpu_scaleup_proxy<device_t>> {
-public:
-    using base = ccl_gpu_base_comm<ccl_gpu_scaleup_proxy<device_t>,
-                                   gpu_types::SCALE_UP_GPU_TYPES + device_t::type_idx()>;
-    using typename base::comm_rank_t;
-
-    using impl_t = device_t;
-
-    using proxy_base = proxy_observer_specific<ccl_gpu_scaleup_proxy<device_t>>;
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using gpu_module_t =
-        typename device_t::template gpu_module_t<algo_type, group, mode>; //same as in-process GPU
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class;
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t;
-
-    //using ctx_ptr = std::weak_ptr<scale_up_ctx_t>;
-
-    static constexpr const char* name_impl() {
-        return "SCALE_UP_GPU_PROXY";
-    }
-
-    ccl_gpu_scaleup_proxy(ccl_device& assigned_device,
-                          typename base::comm_rank_t idx,
-                          device_t& process_device)
-            : base(assigned_device, idx),
-              wrapped_gpu_comm(process_device) {}
-
-    ~ccl_gpu_scaleup_proxy() = default;
-
-    std::string to_string_impl() const {
-        std::string ret(name_impl());
-        ret = ret + "(" + wrapped_gpu_comm.to_string_impl() + ")";
-        return ret;
-    }
-
-    template <ccl_coll_type module_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
-        this->template invoke<group_id, class_id>();
-
-        return wrapped_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>(params);
-    }
-
-    template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-    topology_addr<group_id, class_id> get_comm_data() const {
-        return wrapped_gpu_comm.template get_comm_data<group_id, class_id>();
-    }
-
-    template <ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class gpu_entry,
-              class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
-    gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) {
-        const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
-        LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string());
-
-        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params());
-        main_func.set_rank(comm_addr.rank);
-        main_func.set_size(comm_addr.size);
-        return main_func;
-    }
-
-private:
-    device_t& wrapped_gpu_comm;
-};
-
-//specialization for mix class NUMA
-template <class device_t>
-class ccl_numa_proxy;
-
-template <class device_t>
-class ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>
-        : public ccl_gpu_base_comm<ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>,
-                                   gpu_types::MIX_SCALE_UP_NUMA_TYPES + device_t::type_idx()>,
-          public proxy_observer_specific<ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>> {
-public:
-    using base = ccl_gpu_base_comm<ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>,
-                                   gpu_types::MIX_SCALE_UP_NUMA_TYPES + device_t::type_idx()>;
-    using typename base::comm_rank_t;
-
-    using impl_t = device_t;
-
-    using proxy_base = proxy_observer_specific<ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>>;
-
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    using gpu_module_t =
-        typename device_t::template gpu_module_t<algo_type,
-                                                 group_id,
-                                                 class_id>; //same as in-process GPU
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class;
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t;
-
-    //using ctx_ptr = std::weak_ptr<scale_up_ctx_t>;
-    using device_impl_t = ccl_numa_proxy<device_t>;
-
-    static constexpr const char* name_impl() {
-        return "MIX_SCALE_UP_NUMA";
-    }
-
-    ccl_gpu_scaleup_proxy(ccl_device& assigned_device,
-                          typename base::comm_rank_t idx,
-                          device_impl_t& process_device)
-            : base(assigned_device, idx),
-              wrapped_gpu_comm(process_device) {}
-
-    ~ccl_gpu_scaleup_proxy() = default;
-
-    std::string to_string_impl() const {
-        std::string ret(name_impl());
-        ret = ret + "(" + wrapped_gpu_comm.to_string_impl() + ")";
-        return ret;
-    }
-
-    template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-    topology_addr<group_id, class_id> get_comm_data() const {
-        return wrapped_gpu_comm.template get_comm_data<group_id, class_id>();
-    }
-
-    template <ccl_coll_type module_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
-        this->template invoke<group_id>();
-        return wrapped_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>(params);
-    }
-
-    template <ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class gpu_entry,
-              class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
-    gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) {
-        const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
-        LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string());
-
-        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params());
-        main_func.set_rank(comm_addr.rank);
-        main_func.set_size(comm_addr.size);
-        return main_func;
-    }
-
-private:
-    device_impl_t& wrapped_gpu_comm;
-};
-} // namespace native
diff --git a/src/common/comm/l0/devices/ccl_ipc_gpu_comm.cpp b/src/common/comm/l0/devices/ccl_ipc_gpu_comm.cpp
deleted file mode 100644
index 716cd4b1d..000000000
--- a/src/common/comm/l0/devices/ccl_ipc_gpu_comm.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include <iostream>
-#include <vector>
-#include <set>
-#include "common/comm/l0/devices/ccl_ipc_gpu_comm.hpp"
-#include "sched/sched.hpp"
-// #include "sched/entry/l0/l0_entry.hpp"
-#include "common/comm/l0/modules/specific_modules_source_data.hpp"
-
-namespace native {
-
-ccl_ipc_gpu_comm::ccl_ipc_gpu_comm(ccl_device& assigned_device,
-                                   int idx,
-                                   int size,
-                                   ccl::group_split_type topology_type,
-                                   ccl::device_topology_type class_id)
-        : base(assigned_device, idx) {
-    /* No queue or other device-related primitives creation
-     * Because related device belong to the different processes
-     */
-    //compile and load modules from all sources
-    load_modules(specific_modules_source_data_storage::instance());
-
-    //register in topology
-    switch (topology_type) {
-        case ccl::group_split_type::cluster: {
-            switch (class_id) {
-                case ccl::device_topology_type::ring: {
-                    /* -S-
-                    reset_rank<ccl::group_split_type::cluster,
-                               ccl::device_topology_type::ring>(idx, size);
-                    */
-                    break;
-                }
-                case ccl::device_topology_type::a2a: {
-                    /* -S-
-                    reset_rank<ccl::group_split_type::cluster,
-                               ccl::device_topology_type::a2a>(idx, size);
-                    */
-                    break;
-                }
-                default: {
-                    throw std::runtime_error(std::string("ccl_ipc_gpu_comm must be created") +
-                                             " unknown topology class: " + ::to_string(class_id));
-                }
-            }
-
-            break;
-        }
-        default: {
-            throw std::runtime_error(
-                std::string("ccl_ipc_gpu_comm must be created") +
-                "for process-based topology, but requested: " + ::to_string(topology_type));
-        }
-    }
-
-    LOG_DEBUG("Created ", name_impl(), ", addr: ", reinterpret_cast<void*>(this));
-}
-
-ccl_ipc_gpu_comm::~ccl_ipc_gpu_comm() {
-    LOG_DEBUG("Destroyed ", name_impl(), ", addr: ", reinterpret_cast<void*>(this));
-}
-
-ccl_ipc_gpu_comm::supported_modules& ccl_ipc_gpu_comm::get_registered_modules() {
-    return registered_modules;
-}
-
-std::string ccl_ipc_gpu_comm::to_string_impl() const {
-    std::string ret(name_impl());
-    ret = ret + ", comm:\n" + comm_to_str();
-    return ret;
-}
-} // namespace native
diff --git a/src/common/comm/l0/devices/ccl_ipc_gpu_comm.hpp b/src/common/comm/l0/devices/ccl_ipc_gpu_comm.hpp
deleted file mode 100644
index 10153d131..000000000
--- a/src/common/comm/l0/devices/ccl_ipc_gpu_comm.hpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include <initializer_list>
-#include <map>
-#include <memory>
-#include <list>
-#include <set>
-#include <vector>
-
-#include "common/comm/l0/devices/ccl_gpu_base_comm.hpp"
-#include "common/comm/l0/devices/proxy_observer_types.hpp"
-
-#include "common/comm/l0/devices/communication_structs/ipc_server.hpp"
-
-namespace native {
-class ccl_ipc_gpu_comm : public ccl_gpu_base_comm<ccl_ipc_gpu_comm, gpu_types::IPC_DESTINATION_GPU>,
-                         public module_loader<ccl_ipc_gpu_comm>,
-                         public proxy_multiple_observer<ccl_ipc_gpu_comm,
-                                                        std::nullptr_t,
-                                                        std::nullptr_t,
-                                                        process_group_context>,
-                         public net::ipc_server {
-public:
-    using base = ccl_gpu_base_comm<ccl_ipc_gpu_comm, gpu_types::IPC_DESTINATION_GPU>;
-
-    using proxy_base = proxy_multiple_observer<ccl_ipc_gpu_comm,
-                                               std::nullptr_t,
-                                               std::nullptr_t,
-                                               process_group_context>;
-    using base::comm_rank_t;
-    using impl_t = ccl_ipc_gpu_comm;
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using gpu_module_t = ipc_dst_device_coll_module<algo_type, group, mode>;
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class;
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t;
-
-    using supported_modules = supported_device_modules<gpu_module_t>;
-
-    static constexpr const char* name_impl() {
-        return "DESTINATION_IPC_GPU";
-    }
-
-    ccl_ipc_gpu_comm(ccl_device& assigned_device,
-                     comm_rank_t idx,
-                     int size,
-                     ccl::group_split_type group_id,
-                     ccl::device_topology_type class_id);
-    ~ccl_ipc_gpu_comm();
-
-    std::string to_string_impl() const;
-
-    template <ccl_coll_type module_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
-        auto& ptr =
-            base::template get_gpu_module_unsafe<module_type, group_id, class_id, gpu_module_t>(
-                registered_modules);
-        assert(ptr);
-
-        using requested_class = kernel_class_t<module_type, group_id, class_id>;
-        return ptr->template get_class<requested_class>().get(params);
-    }
-
-    template <ccl_coll_type module_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    std::string create_module_impl(const ze_module_desc_t& module_data) {
-        std::get<::utils::enum_to_underlying(class_id)>(
-            std::get<::utils::enum_to_underlying(group_id)>(
-                std::get<module_type>(registered_modules)))
-            .reset(new gpu_module_t<module_type, group_id, class_id>(nullptr));
-        return { "IPC module storage" };
-    }
-
-    supported_modules& get_registered_modules();
-
-private:
-    supported_modules registered_modules;
-};
-
-} // namespace native
diff --git a/src/common/comm/l0/devices/ccl_ipc_source_gpu_comm.hpp b/src/common/comm/l0/devices/ccl_ipc_source_gpu_comm.hpp
deleted file mode 100644
index cc39a0084..000000000
--- a/src/common/comm/l0/devices/ccl_ipc_source_gpu_comm.hpp
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <map>
-#include <memory>
-#include <list>
-#include <set>
-#include <vector>
-
-#include "common/comm/l0/devices/ccl_gpu_base_comm.hpp"
-#include "common/comm/l0/devices/proxy_observer_types.hpp"
-#include "common/comm/l0/context/scale/ipc/ipc_session_key.hpp"
-
-#include "common/comm/l0/devices/communication_structs/ipc_client.hpp"
-namespace native {
-
-//Adapter for different thread devices
-template <class device_t>
-class ccl_ipc_source_gpu_comm : public ccl_gpu_base_comm<ccl_ipc_source_gpu_comm<device_t>,
-                                                         gpu_types::IPC_GPU + device_t::type_idx()>,
-                                public proxy_multiple_observer<ccl_ipc_source_gpu_comm<device_t>,
-                                                               std::nullptr_t,
-                                                               std::nullptr_t,
-                                                               process_group_context>,
-                                public net::ipc_client {
-public:
-    using base = ccl_gpu_base_comm<ccl_ipc_source_gpu_comm<device_t>,
-                                   gpu_types::IPC_GPU + device_t::type_idx()>;
-
-    using proxy_base = proxy_multiple_observer<ccl_ipc_source_gpu_comm<device_t>,
-                                               std::nullptr_t,
-                                               std::nullptr_t,
-                                               process_group_context>;
-    using typename base::comm_rank_t;
-    using impl_t = device_t;
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using gpu_module_t =
-        typename device_t::template gpu_module_t<algo_type, group, mode>; //same as in-process GPU
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class;
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t;
-
-    static constexpr const char* name_impl() {
-        return "SOURCE_IPC_GPU";
-    }
-
-    ccl_ipc_source_gpu_comm(ccl_device& assigned_device,
-                            typename base::comm_rank_t idx,
-                            device_t& process_device,
-                            ccl::group_split_type group_id,
-                            ccl::device_topology_type class_id)
-            : base(assigned_device, idx),
-              inprocess_gpu_comm(process_device) {
-        //register in topology
-        switch (group_id) {
-            case ccl::group_split_type::cluster: {
-                switch (class_id) {
-                    case ccl::device_topology_type::ring: {
-                        const auto& original_rank =
-                            inprocess_gpu_comm
-                                .template get_comm_data<ccl::group_split_type::cluster,
-                                                        ccl::device_topology_type::ring>();
-                        base::template reset_rank<ccl::group_split_type::cluster,
-                                                  ccl::device_topology_type::ring>(
-                            original_rank.rank, original_rank.size);
-                        break;
-                    }
-                    case ccl::device_topology_type::a2a: {
-                        const auto& original_rank =
-                            inprocess_gpu_comm
-                                .template get_comm_data<ccl::group_split_type::cluster,
-                                                        ccl::device_topology_type::a2a>();
-                        base::template reset_rank<ccl::group_split_type::cluster,
-                                                  ccl::device_topology_type::a2a>(
-                            original_rank.rank, original_rank.size);
-                        break;
-                    }
-                    default: {
-                        throw std::runtime_error(
-                            std::string("ccl_ipc_source_gpu_comm must be created") +
-                            " unknown topology class: " + std::to_string(class_id));
-                    }
-                }
-                break;
-            }
-            default: {
-                throw std::runtime_error(
-                    std::string("ccl_ipc_source_gpu_comm must be created") +
-                    "for process-based topology, but requested: " +
-                    std::to_string(
-                        static_cast<typename std::underlying_type<ccl::group_split_type>::type>(
-                            group_id)));
-            }
-        }
-    }
-
-    ~ccl_ipc_source_gpu_comm() = default;
-
-    //TODO L0 work
-    device_t& get_impl() {
-        return inprocess_gpu_comm;
-    }
-
-    std::string to_string_impl() const {
-        std::string ret(name_impl());
-        ret = ret + "(" + inprocess_gpu_comm.to_string_impl() + ")";
-        return ret;
-    }
-    /*
-    template<ccl::group_split_type group_id>
-    topology_addr<group_id> get_comm_data() const
-    {
-        return inprocess_gpu_comm.template get_comm_data<group_id>();
-    }
-*/
-    template <ccl_coll_type module_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
-        return inprocess_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>(params);
-    }
-
-    template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class gpu_entry>
-    gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) {
-        static_assert(group_id == ccl::group_split_type::cluster,
-                      "ccl_ipc_source_gpu_comm available for ccl::group_split_type::cluster only");
-        const topology_addr<group_id, class_id>& comm_addr =
-            inprocess_gpu_comm.template get_comm_data<group_id, class_id>();
-        LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string());
-
-        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params());
-        main_func.set_rank(comm_addr.rank);
-        main_func.set_size(comm_addr.size);
-
-        ipc_invoke_params<gpu_entry::type()> params(entry.get_ipc_data(), entry.get_params());
-        this->template invoke<group_id, class_id>(entry.get_ipc_session_key(), std::move(params));
-
-        return main_func;
-    }
-
-private:
-    device_t& inprocess_gpu_comm;
-};
-} // namespace native
diff --git a/src/common/comm/l0/devices/ccl_numa_proxy.hpp b/src/common/comm/l0/devices/ccl_numa_proxy.hpp
deleted file mode 100644
index efd29e93b..000000000
--- a/src/common/comm/l0/devices/ccl_numa_proxy.hpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <map>
-#include <memory>
-#include <list>
-#include <set>
-#include <vector>
-
-#include "common/comm/l0/devices/ccl_gpu_base_comm.hpp"
-#include "common/comm/l0/devices/proxy_observer_types.hpp"
-#include "common/comm/l0/context/scale/base/base_session.hpp"
-
-namespace native {
-
-//Adapter for different thread devices
-template <class device_t>
-class ccl_numa_proxy
-        : public ccl_gpu_base_comm<ccl_numa_proxy<device_t>,
-                                   gpu_types::NUMA_PROXY_GPU_TYPES + device_t::type_idx()>,
-          public proxy_observer_specific<ccl_numa_proxy<device_t>> {
-public:
-    using base = ccl_gpu_base_comm<ccl_numa_proxy<device_t>,
-                                   gpu_types::NUMA_PROXY_GPU_TYPES + device_t::type_idx()>;
-    using typename base::comm_rank_t;
-    using impl_t = device_t;
-    using proxy_base = proxy_observer_specific<ccl_numa_proxy<device_t>>;
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using gpu_module_t =
-        typename device_t::template gpu_module_t<algo_type, group, mode>; //same as in-process GPU
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::numa_class;
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t;
-
-    static constexpr const char* name_impl() {
-        return "NUMA_PROXY";
-    }
-
-    ccl_numa_proxy(ccl_device& assigned_device,
-                   typename base::comm_rank_t idx,
-                   device_t& process_device)
-            : base(assigned_device, idx),
-              wrapped_gpu_comm(process_device) {}
-
-    ~ccl_numa_proxy() = default;
-
-    std::string to_string_impl() const {
-        std::string ret(name_impl());
-        ret = ret + "(" + wrapped_gpu_comm.to_string_impl() + ")";
-        return ret;
-    }
-
-    template <ccl_coll_type module_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
-        auto& ptr = wrapped_gpu_comm.template get_gpu_module<module_type, group_id, class_id>();
-
-        using requested_class = kernel_class_t<module_type, group_id, class_id>;
-        return ptr.template get_class<requested_class>().get(params);
-    }
-
-    template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class gpu_entry>
-    gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) {
-        static_assert(group_id == ccl::group_split_type::cluster,
-                      "ccl_numa_proxy available for ccl::group_split_type::cluster only");
-
-        const topology_addr<group_id, class_id>& comm_addr =
-            base::template get_comm_data<group_id, class_id>();
-        LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string());
-
-        using kernel_func_type = gpu_kernel_t<gpu_entry::type(), group_id, class_id>;
-        kernel_func_type& main_func =
-            get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params());
-        main_func.set_rank(comm_addr.rank);
-        main_func.set_size(comm_addr.size);
-
-        // alloc shared data structure to notify host side with device parital result
-        observer::invoke_params<gpu_entry::type()> params = entry.get_numa_data();
-
-        // invoke host-side context creation
-        this->template invoke<group_id, class_id>(entry.get_numa_session_key(), params);
-
-        // bind shared data to kernel
-        const auto& out_ctx_params = params.get_ctx_params();
-
-        main_func.bind_data(out_ctx_params);
-
-        return main_func;
-    }
-
-private:
-    device_t& wrapped_gpu_comm;
-};
-} // namespace native
diff --git a/src/common/comm/l0/devices/ccl_scaleout_proxy.hpp b/src/common/comm/l0/devices/ccl_scaleout_proxy.hpp
deleted file mode 100644
index e08545b53..000000000
--- a/src/common/comm/l0/devices/ccl_scaleout_proxy.hpp
+++ /dev/null
@@ -1,379 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <map>
-#include <memory>
-#include <list>
-#include <set>
-#include <vector>
-
-#include "common/comm/l0/devices/ccl_gpu_base_comm.hpp"
-#include "common/comm/l0/devices/proxy_observer_types.hpp"
-#include "common/comm/l0/context/scale/base/base_session.hpp"
-
-namespace native {
-
-// scale-out adapter for different thread devices
-template <class device_t>
-class ccl_scaleout_proxy
-        : public ccl_gpu_base_comm<ccl_scaleout_proxy<device_t>,
-                                   gpu_types::SCALE_OUT_GPU_TYPES + device_t::type_idx()>,
-          public proxy_observer_specific<ccl_scaleout_proxy<device_t>> {
-public:
-    using base = ccl_gpu_base_comm<ccl_scaleout_proxy<device_t>,
-                                   gpu_types::SCALE_OUT_GPU_TYPES + device_t::type_idx()>;
-    using typename base::comm_rank_t;
-
-    using impl_t = device_t;
-
-    using proxy_base = proxy_observer_specific<ccl_scaleout_proxy<device_t>>;
-
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    using gpu_module_t =
-        typename device_t::template gpu_module_t<algo_type,
-                                                 group_id,
-                                                 class_id>; //same as in-process GPU
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::scale_out_cpu_gw_class;
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t;
-
-    static constexpr const char* name_impl() {
-        return "SCALE_OUT_PROXY";
-    }
-
-    ccl_scaleout_proxy(ccl_device& assigned_device,
-                       typename base::comm_rank_t idx,
-                       device_t& process_device)
-            : base(assigned_device, idx),
-              wrapped_gpu_comm(process_device) {}
-
-    ~ccl_scaleout_proxy() = default;
-
-    std::string to_string_impl() const {
-        std::string ret(name_impl());
-        ret = ret + "(" + wrapped_gpu_comm.to_string_impl() + ")";
-        return ret;
-    }
-
-    template <ccl_coll_type module_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
-        auto& ptr = wrapped_gpu_comm.template get_gpu_module<module_type, group_id, class_id>();
-
-        using requested_class = kernel_class_t<module_type, group_id, class_id>;
-        return ptr.template get_class<requested_class>().get(params);
-    }
-
-    template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-    topology_addr<group_id, class_id> get_comm_data() const {
-        return wrapped_gpu_comm.template get_comm_data<group_id, class_id>();
-    }
-
-    template <ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class gpu_entry,
-              class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
-    gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) {
-        const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
-        LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string());
-
-        using kernel_func_type = gpu_kernel_t<gpu_entry::type(), group_id, class_id>;
-
-        kernel_func_type& main_func =
-            get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params());
-
-        main_func.set_rank(comm_addr.rank);
-        main_func.set_size(comm_addr.size);
-
-        // alloc shared data structure to notify host side with device parital result
-        observer::invoke_params<gpu_entry::type()> params = entry.get_scaleout_data();
-
-        // invoke host-side context creation
-        this->template invoke<group_id, class_id>(entry.get_scaleout_session_key(), params);
-
-        // bind shared data to kernel
-        const auto& out_ctx_params = params.get_ctx_params();
-
-        main_func.bind_data(out_ctx_params);
-
-        return main_func;
-    }
-
-private:
-    device_t& wrapped_gpu_comm;
-};
-
-//TODO Move to different files
-/*****Specializations*****/
-// 1. specialization for mix class NUMA
-template <class device_t>
-class ccl_numa_proxy;
-
-template <class device_t>
-class ccl_scaleout_proxy<ccl_numa_proxy<device_t>>
-        : public ccl_gpu_base_comm<ccl_scaleout_proxy<ccl_numa_proxy<device_t>>,
-                                   gpu_types::MIX_SCALE_OUT_NUMA_TYPES + device_t::type_idx()>,
-          public proxy_observer_specific<ccl_scaleout_proxy<ccl_numa_proxy<device_t>>> {
-public:
-    using base = ccl_gpu_base_comm<ccl_scaleout_proxy<ccl_numa_proxy<device_t>>,
-                                   gpu_types::MIX_SCALE_OUT_NUMA_TYPES + device_t::type_idx()>;
-    using typename base::comm_rank_t;
-
-    using impl_t = device_t;
-
-    using proxy_base = proxy_observer_specific<ccl_scaleout_proxy<ccl_numa_proxy<device_t>>>;
-
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    using gpu_module_t =
-        typename device_t::template gpu_module_t<algo_type,
-                                                 group_id,
-                                                 class_id>; //same as in-process GPU
-
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    using gpu_kernel_t =
-        typename gpu_module_t<algo_type, group_id, class_id>::scale_out_cpu_gw_class::kernel_t;
-
-    //using ctx_ptr = std::weak_ptr<scale_up_ctx_t>;
-    using device_impl_t = ccl_numa_proxy<device_t>;
-
-    static constexpr const char* name_impl() {
-        return "MIX_SCALE_UP_NUMA";
-    }
-
-    ccl_scaleout_proxy(ccl_device& assigned_device,
-                       typename base::comm_rank_t idx,
-                       device_impl_t& process_device)
-            : base(assigned_device, idx),
-              wrapped_gpu_comm(process_device) {}
-
-    ~ccl_scaleout_proxy() = default;
-
-    std::string to_string_impl() const {
-        std::string ret(name_impl());
-        ret = ret + "(" + wrapped_gpu_comm.to_string_impl() + ")";
-        return ret;
-    }
-
-    template <ccl_coll_type module_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
-        this->template invoke<group_id>();
-
-        return wrapped_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>(params);
-    }
-
-    template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-    topology_addr<group_id, class_id> get_comm_data() const {
-        return wrapped_gpu_comm.template get_comm_data<group_id, class_id>();
-    }
-
-    template <ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class gpu_entry,
-              class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
-    gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) {
-        const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
-        LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string());
-
-        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params());
-        main_func.set_rank(comm_addr.rank);
-        main_func.set_size(comm_addr.size);
-        return main_func;
-    }
-
-private:
-    device_impl_t& wrapped_gpu_comm;
-};
-
-// 2. specialization for mix class scaleUp
-template <class device_t>
-class ccl_gpu_scaleup_proxy;
-
-template <class device_t>
-class ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<device_t>>
-        : public ccl_gpu_base_comm<ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<device_t>>,
-                                   gpu_types::MIX_SCALE_OUT_SCALE_UP_TYPES + device_t::type_idx()>,
-          public proxy_observer_specific<ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<device_t>>> {
-public:
-    using base = ccl_gpu_base_comm<ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<device_t>>,
-                                   gpu_types::MIX_SCALE_OUT_SCALE_UP_TYPES + device_t::type_idx()>;
-    using typename base::comm_rank_t;
-
-    using impl_t = device_t;
-
-    using proxy_base = proxy_observer_specific<ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<device_t>>>;
-
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    using gpu_module_t =
-        typename device_t::template gpu_module_t<algo_type,
-                                                 group_id,
-                                                 class_id>; //same as in-process GPU
-
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    using gpu_kernel_t =
-        typename gpu_module_t<algo_type, group_id, class_id>::scale_out_cpu_gw_class::kernel_t;
-
-    //using ctx_ptr = std::weak_ptr<scale_up_ctx_t>;
-    using device_impl_t = ccl_gpu_scaleup_proxy<device_t>;
-
-    static constexpr const char* name_impl() {
-        return "MIX_SOUT_SUP";
-    }
-
-    ccl_scaleout_proxy(ccl_device& assigned_device,
-                       typename base::comm_rank_t idx,
-                       device_impl_t& process_device)
-            : base(assigned_device, idx),
-              wrapped_gpu_comm(process_device) {}
-
-    ~ccl_scaleout_proxy() = default;
-
-    std::string to_string_impl() const {
-        std::string ret(name_impl());
-        ret = ret + "(" + wrapped_gpu_comm.to_string_impl() + ")";
-        return ret;
-    }
-
-    template <ccl_coll_type module_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
-        this->template invoke<group_id>();
-
-        return wrapped_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>(params);
-    }
-
-    template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-    topology_addr<group_id, class_id> get_comm_data() const {
-        return wrapped_gpu_comm.template get_comm_data<group_id, class_id>();
-    }
-
-    template <ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class gpu_entry,
-              class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
-    gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) {
-        const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
-        LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string());
-
-        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params());
-        main_func.set_rank(comm_addr.rank);
-        main_func.set_size(comm_addr.size);
-        return main_func;
-    }
-
-private:
-    device_impl_t& wrapped_gpu_comm;
-};
-
-// 3. specialization for mix class scaleUp-numa
-template <class device_t>
-class ccl_gpu_scaleup_proxy;
-
-template <class device_t>
-class ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>>
-        : public ccl_gpu_base_comm<
-              ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>>,
-              gpu_types::MIX_UNIVERSAL_TYPES + device_t::type_idx()>,
-          public proxy_observer_specific<
-              ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>>> {
-public:
-    using base =
-        ccl_gpu_base_comm<ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>>,
-                          gpu_types::MIX_UNIVERSAL_TYPES + device_t::type_idx()>;
-    using typename base::comm_rank_t;
-
-    using impl_t = device_t;
-
-    using proxy_base = proxy_observer_specific<
-        ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>>>;
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using gpu_module_t =
-        typename device_t::template gpu_module_t<algo_type, group, mode>; //same as in-process GPU
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using gpu_kernel_t =
-        typename gpu_module_t<algo_type, group, mode>::scale_out_cpu_gw_class::kernel_t;
-
-    //using ctx_ptr = std::weak_ptr<scale_up_ctx_t>;
-    using device_impl_t = ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>;
-
-    static constexpr const char* name_impl() {
-        return "MIX_SOUT_SUP_NUMA";
-    }
-
-    ccl_scaleout_proxy(ccl_device& assigned_device,
-                       typename base::comm_rank_t idx,
-                       device_impl_t& process_device)
-            : base(assigned_device, idx),
-              wrapped_gpu_comm(process_device) {}
-
-    ~ccl_scaleout_proxy() = default;
-
-    std::string to_string_impl() const {
-        std::string ret(name_impl());
-        ret = ret + "(" + wrapped_gpu_comm.to_string_impl() + ")";
-        return ret;
-    }
-
-    template <ccl_coll_type module_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
-        this->template invoke<group_id>();
-
-        return wrapped_gpu_comm.template get_gpu_kernel<module_type, group_id, class_id>(params);
-    }
-
-    template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-    topology_addr<group_id, class_id> get_comm_data() const {
-        return wrapped_gpu_comm.template get_comm_data<group_id, class_id>();
-    }
-
-    template <ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class gpu_entry,
-              class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
-    gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) {
-        const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
-        LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string());
-
-        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params());
-        main_func.set_rank(comm_addr.rank);
-        main_func.set_size(comm_addr.size);
-        return main_func;
-    }
-
-private:
-    device_impl_t& wrapped_gpu_comm;
-};
-} // namespace native
diff --git a/src/common/comm/l0/devices/ccl_virtual_gpu_comm.cpp b/src/common/comm/l0/devices/ccl_virtual_gpu_comm.cpp
deleted file mode 100644
index 6932f4b14..000000000
--- a/src/common/comm/l0/devices/ccl_virtual_gpu_comm.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "common/comm/l0/devices/ccl_virtual_gpu_comm.hpp"
-#include "common/comm/l0/modules/specific_modules_source_data.hpp"
-
-namespace native {
-ccl_virtual_gpu_comm::ccl_virtual_gpu_comm(ccl_device& device,
-                                           comm_rank_t idx,
-                                           ccl_gpu_comm& real_gpu)
-        : base(device, idx),
-          real_gpu_comm(real_gpu) {
-    //TODO increase reference count
-    real_gpu.register_virtual_gpu(this);
-
-    //compile and load modules from all sources
-    load_modules(specific_modules_source_data_storage::instance());
-}
-
-std::string ccl_virtual_gpu_comm::to_string_impl() const {
-    std::string ret(name_impl());
-    ret = ret + ", comm:\n" + comm_to_str();
-    return ret;
-}
-
-cmd_list_proxy ccl_virtual_gpu_comm::get_cmd_list(std::shared_ptr<ccl_context> ctx,
-                                                  const ze_command_list_desc_t& properties) {
-    return real_gpu_comm.get_cmd_list(ctx, properties);
-}
-
-fence_proxy ccl_virtual_gpu_comm::get_fence(const ccl_device::device_queue& cmd_queue,
-                                            std::shared_ptr<ccl_context> ctx) {
-    return real_gpu_comm.get_fence(cmd_queue, ctx);
-}
-
-} // namespace native
diff --git a/src/common/comm/l0/devices/ccl_virtual_gpu_comm.hpp b/src/common/comm/l0/devices/ccl_virtual_gpu_comm.hpp
deleted file mode 100644
index 6334b56a4..000000000
--- a/src/common/comm/l0/devices/ccl_virtual_gpu_comm.hpp
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include "common/comm/l0/devices/ccl_gpu_comm.hpp"
-
-namespace native {
-
-class ccl_virtual_gpu_comm : public ccl_gpu_base_comm<ccl_virtual_gpu_comm, gpu_types::VIRTUAL_GPU>,
-                             public module_loader<ccl_virtual_gpu_comm> {
-public:
-    using base = ccl_gpu_base_comm<ccl_virtual_gpu_comm, gpu_types::VIRTUAL_GPU>;
-    using base::comm_rank_t;
-
-    using impl_t = ccl_virtual_gpu_comm;
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using gpu_module_t = virtual_device_coll_module<algo_type, group, mode>;
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using kernel_class_t = typename gpu_module_t<algo_type, group, mode>::main_class;
-
-    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
-    using gpu_kernel_t = typename kernel_class_t<algo_type, group, mode>::kernel_t;
-
-    using supported_modules = supported_device_modules<gpu_module_t>;
-
-    static constexpr const char* name_impl() {
-        return "VIRTUAL_GPU";
-    }
-
-    std::string to_string_impl() const;
-
-    ccl_virtual_gpu_comm(ccl_device& device, comm_rank_t idx, ccl_gpu_comm& real_gpu);
-    ~ccl_virtual_gpu_comm() = default;
-
-    template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-    topology_addr<group_id, class_id> get_real_comm_data() const {
-        return real_gpu_comm.get_comm_data<group_id, class_id>();
-    }
-
-    template <ccl_coll_type module_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    gpu_module_t<module_type, group_id, class_id>& get_gpu_module() {
-        auto& ptr =
-            base::template get_gpu_module_unsafe<module_type, group_id, class_id, gpu_module_t>(
-                registered_modules);
-        assert(ptr);
-        return *ptr;
-    }
-
-    template <ccl_coll_type module_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    gpu_kernel_t<module_type, group_id, class_id>& get_gpu_kernel(const coll_param_gpu& params) {
-        auto& ptr = get_gpu_module<module_type, group_id, class_id>();
-
-        using requested_class = kernel_class_t<module_type, group_id, class_id>;
-        return ptr.template get_class<requested_class>().get(params);
-    }
-
-    template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class gpu_entry>
-    gpu_kernel_t<gpu_entry::type(), group_id, class_id>& register_entry(gpu_entry& entry) {
-        const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
-        LOG_DEBUG("entry: ", gpu_entry::class_name(), " registered on: ", comm_addr.to_string());
-
-        auto& main_func = get_gpu_kernel<gpu_entry::type(), group_id, class_id>(entry.get_params());
-        main_func.set_rank(comm_addr.rank);
-        main_func.set_size(comm_addr.size);
-        return main_func;
-    }
-
-    template <ccl_coll_type module_type,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id>
-    std::string create_module_impl(const ze_module_desc_t& module_data) {
-        //virtual based on real
-        auto real_kernel = real_gpu_comm.get_gpu_module_ptr<module_type, group_id, class_id>();
-
-        std::get<::utils::enum_to_underlying(class_id)>(
-            std::get<::utils::enum_to_underlying(group_id)>(
-                std::get<module_type>(registered_modules)))
-            .reset(new gpu_module_t<module_type, group_id, class_id>(real_kernel));
-        return { "virtual module" };
-    }
-
-    cmd_list_proxy get_cmd_list(std::shared_ptr<ccl_context> ctx,
-                                const ze_command_list_desc_t& properties);
-
-    fence_proxy get_fence(const ccl_device::device_queue& cmd_queue,
-                          std::shared_ptr<ccl_context> ctx);
-
-private:
-    ccl_gpu_comm& real_gpu_comm;
-    supported_modules registered_modules;
-};
-} // namespace native
diff --git a/src/common/comm/l0/devices/communication_structs/communication_data_holder.hpp b/src/common/comm/l0/devices/communication_structs/communication_data_holder.hpp
deleted file mode 100644
index e3af787ef..000000000
--- a/src/common/comm/l0/devices/communication_structs/communication_data_holder.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "include/oneapi/ccl/types.hpp"
-#include "coll/algorithm/algorithms_enum.hpp"
-
-namespace native {
-
-template <class... T>
-struct communiaction_data_holder {
-    template <class U, ccl_coll_type type>
-    struct data_for_algo_t {
-        U data;
-    };
-
-    template <class Data, ccl_coll_type... types>
-    using data_storage_t = std::tuple<data_for_algo_t<Data, types>...>;
-};
-} // namespace native
-CCL_COLL_TYPE_LIST
diff --git a/src/common/comm/l0/devices/communication_structs/communication_stream.cpp b/src/common/comm/l0/devices/communication_structs/communication_stream.cpp
deleted file mode 100644
index 8748ac8fd..000000000
--- a/src/common/comm/l0/devices/communication_structs/communication_stream.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "common/comm/l0/devices/communication_structs/communication_strea.hpp"
-
-namespace native {
-communication_stream::communication_stream(ccl_device& device, std::shared_ptr<ccl_context> ctx) {
-    device_queue = std::make_shared<ccl_device::device_queue>(device->create_cmd_queue(ctx));
-    referenced_communication_device_count = 0;
-}
-
-} // namespace native
diff --git a/src/common/comm/l0/devices/communication_structs/communication_stream.hpp b/src/common/comm/l0/devices/communication_structs/communication_stream.hpp
deleted file mode 100644
index b71c434ed..000000000
--- a/src/common/comm/l0/devices/communication_structs/communication_stream.hpp
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <atomic>
-#include "oneapi/ccl/native_device_api/export_api.hpp"
-
-namespace native {
-struct communication_stream {
-    communication_stream(ccl_device& device, std::shared_ptr<ccl_context> ctx);
-
-    std::shared_ptr<ccl_device::device_queue> device_queue;
-    std::atomic<size_t> referenced_communication_device_count;
-};
-
-} // namespace native
diff --git a/src/common/comm/l0/devices/communication_structs/connection.cpp b/src/common/comm/l0/devices/communication_structs/connection.cpp
deleted file mode 100644
index 277bbecd9..000000000
--- a/src/common/comm/l0/devices/communication_structs/connection.cpp
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include <stdexcept>
-
-#include "common/log/log.hpp"
-#include "common/comm/l0/devices/communication_structs/connection.hpp"
-
-namespace net {
-
-connection::connection(int connected_socket) : socket_fd(connected_socket) {}
-
-connection::~connection() {
-    shutdown(socket_fd, SHUT_RDWR);
-    close(socket_fd);
-}
-
-ssize_t connection::send_data(const uint8_t* send_data_ptr, size_t size, int flags) const {
-    LOG_TRACE("Send data size: ", size, ", into socket: ", socket_fd);
-
-    if (!send_data_ptr) {
-        return 0;
-    }
-
-    ssize_t ret = 0;
-    do {
-        ret = send(socket_fd, send_data_ptr, size, flags);
-    } while (ret == -1 && errno == EINTR);
-
-    if (ret == -1 && (errno != EAGAIN || errno != EWOULDBLOCK)) {
-        throw std::runtime_error(std::string("Cannot send data to socket: ") +
-                                 std::to_string(socket_fd) + strerror(errno));
-    }
-    LOG_TRACE("Data bytes sent: ", ret, ", to socket: ", socket_fd);
-    return ret;
-}
-
-ssize_t connection::recv_data(uint8_t* recv_data_ptr, size_t size, int flags) const {
-    LOG_TRACE("Recv data size: ", size, ", from socket: ", socket_fd);
-
-    if (!recv_data_ptr) {
-        return 0;
-    }
-
-    ssize_t ret = 0;
-    do {
-        ret = recv(socket_fd, recv_data_ptr, size, flags);
-    } while (ret == -1 && errno == EINTR);
-
-    if (ret == -1 && (errno != EAGAIN || errno != EWOULDBLOCK)) {
-        throw std::runtime_error(std::string("Cannot recv data from socket: ") +
-                                 std::to_string(socket_fd) + strerror(errno));
-    }
-    LOG_TRACE("Data bytes received: ", ret, ", from socket: ", socket_fd);
-    return ret;
-}
-
-ssize_t connection::send_msg_with_pid_data(const std::vector<uint8_t>& data,
-                                           const std::vector<size_t>& optional_pid_data_offets,
-                                           int flag) const {
-    //TODO make sure limit doesn't exceed `/proc/sys/net/core/optmem_max`
-
-    if (connection::ancillary_data_limit_bytes() < optional_pid_data_offets.size() * sizeof(fd_t)) {
-        LOG_ERROR("ancillary_data_limit_bytes is to less: ",
-                  connection::ancillary_data_limit_bytes(),
-                  "bytes, than required: ",
-                  optional_pid_data_offets.size() * sizeof(fd_t),
-                  ". Recompile with large limits is required");
-        abort();
-    }
-
-    // fill regular data
-    struct msghdr msg = {};
-    struct iovec io = { .iov_base = const_cast<void*>(static_cast<const void*>(data.data())),
-                        .iov_len = data.size() * sizeof(uint8_t) };
-
-    // fill anciliary data
-    msg.msg_iov = &io;
-    msg.msg_iovlen = 1;
-    msg.msg_controllen =
-        CMSG_SPACE(sizeof(fd_t) * optional_pid_data_offets.size()); //sizeof(u.buf);
-    std::vector<uint8_t> staged_buf(msg.msg_controllen, 0);
-    msg.msg_control = staged_buf.data();
-
-    // one anciliary message for all fds
-    struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
-    cmsg->cmsg_level = SOL_SOCKET;
-    cmsg->cmsg_type = SCM_RIGHTS;
-    cmsg->cmsg_len = CMSG_LEN(sizeof(fd_t) * optional_pid_data_offets.size());
-    fd_t* fdptr = (fd_t*)CMSG_DATA(cmsg);
-
-    for (auto fd_offset_bytes_it = optional_pid_data_offets.begin();
-         fd_offset_bytes_it != optional_pid_data_offets.end();
-         ++fd_offset_bytes_it) {
-        if (*fd_offset_bytes_it >= data.size()) {
-            throw std::runtime_error(std::string(__FUNCTION__) +
-                                     "Unexpected value in optional_pid_data_offets, data size: " +
-                                     std::to_string(data.size()) +
-                                     ", offset requested: " + std::to_string(*fd_offset_bytes_it));
-        }
-
-        const fd_t* in_fd_ptr = reinterpret_cast<const fd_t*>(data.data() + *fd_offset_bytes_it);
-        memcpy(fdptr, in_fd_ptr, sizeof(fd_t));
-        fdptr++;
-    }
-
-    ssize_t bytes_sent = sendmsg(socket_fd, &msg, flag);
-    LOG_DEBUG("sendmsg on socket: ", socket_fd, ", result: ", bytes_sent);
-    if (bytes_sent < 0) {
-        throw std::runtime_error(std::string(__FUNCTION__) + " - cannot sendmsg on socket: " +
-                                 std::to_string(socket_fd) + ", error: " + strerror(errno));
-    }
-    return bytes_sent;
-}
-
-ssize_t connection::recv_msg_with_pid_data(std::vector<uint8_t>& out_data_resized,
-                                           std::vector<fd_t>& out_pids_resized,
-                                           int flags) const {
-    LOG_DEBUG("Prepared data size bytes: ",
-              out_data_resized.size(),
-              ", pid count: ",
-              out_pids_resized.size(),
-              ", socket: ",
-              socket_fd);
-
-    // prepare regular data
-    struct iovec msg_buffer;
-    msg_buffer.iov_base = out_data_resized.data();
-    msg_buffer.iov_len = out_data_resized.size();
-
-    // prepare control data
-    struct msghdr msg_header = {};
-    msg_header.msg_iov = &msg_buffer;
-    msg_header.msg_iovlen = 1;
-    msg_header.msg_controllen = CMSG_SPACE(sizeof(fd_t) * out_pids_resized.size()); //sizeof(u.buf);
-
-    std::vector<uint8_t> staged_buf(msg_header.msg_controllen, 0);
-    msg_header.msg_control = staged_buf.data();
-
-    ssize_t bytes_got = 0;
-    do {
-        bytes_got = recvmsg(socket_fd, &msg_header, flags);
-    } while (bytes_got == -1 && errno == EINTR);
-
-    if (bytes_got == -1) {
-        throw std::runtime_error(std::string(__FUNCTION__) +
-                                 " - cannot receive data, error: " + strerror(errno));
-    }
-
-    LOG_DEBUG("Received bytes: ", bytes_got, ", from socket: ", socket_fd);
-
-    size_t received_fd_num = 0;
-    struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg_header);
-    for (; cmsg != nullptr; cmsg = CMSG_NXTHDR(&msg_header, cmsg)) {
-        LOG_TRACE("cmsg_level: ", cmsg->cmsg_level, ", cmsg_type: ", cmsg->cmsg_type);
-        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
-            // sanity
-            size_t expected_len = CMSG_LEN(out_pids_resized.size() * sizeof(fd_t));
-            if (cmsg->cmsg_len < expected_len) {
-                throw std::runtime_error(std::string(__FUNCTION__) +
-                                         " - got unexpected anciliary msg size from socket: " +
-                                         std::to_string(socket_fd) +
-                                         ", got: " + std::to_string(cmsg->cmsg_len) +
-                                         ", but expected: " + std::to_string(expected_len));
-            }
-
-            // restore duplicated fds
-            fd_t* fd_ptr = (fd_t*)CMSG_DATA(cmsg);
-            for (auto& it : out_pids_resized) {
-                it = *fd_ptr;
-                LOG_DEBUG("got fd: ",
-                          *fd_ptr,
-                          ", by number: ",
-                          received_fd_num,
-                          ", expected count: ",
-                          out_pids_resized.size());
-
-                fd_ptr++;
-                received_fd_num++;
-            }
-        }
-    }
-
-    LOG_DEBUG("Received fd count: ", received_fd_num);
-    if (received_fd_num != out_pids_resized.size()) {
-        throw std::runtime_error(std::string(__FUNCTION__) +
-                                 " - unexpected FD from socket: " + std::to_string(socket_fd) +
-                                 ", received count: " + std::to_string(received_fd_num) +
-                                 ", but expected: " + std::to_string(out_pids_resized.size()));
-    }
-    return bytes_got;
-}
-} // namespace net
diff --git a/src/common/comm/l0/devices/communication_structs/connection.hpp b/src/common/comm/l0/devices/communication_structs/connection.hpp
deleted file mode 100644
index 12894cb38..000000000
--- a/src/common/comm/l0/devices/communication_structs/connection.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <unistd.h>
-
-namespace net {
-
-class connection {
-public:
-    using fd_t = int;
-
-    static constexpr size_t fd_size_bytes() {
-        return sizeof(fd_t);
-    }
-
-    static constexpr size_t ancillary_data_limit_fd() {
-        return 10;
-    }
-    static constexpr size_t ancillary_data_limit_bytes() {
-        return ancillary_data_limit_fd() * fd_size_bytes(); // 10 fd is limitation
-    }
-
-    connection(const connection& src) = delete;
-    connection& operator=(const connection& src) = delete;
-
-    ssize_t send_msg_with_pid_data(const std::vector<uint8_t>& data,
-                                   const std::vector<size_t>& optional_pid_data_offets,
-                                   int flag = 0) const;
-
-    // For properly call resize data with *_resized before for expected sizes
-    ssize_t recv_msg_with_pid_data(std::vector<uint8_t>& out_data_resized,
-                                   std::vector<fd_t>& out_pids_resized,
-                                   int flags = 0 /*=MSG_CMSG_CLOEXEC | MSG_WAITALL */) const;
-
-    ssize_t send_data(const uint8_t* send_data_ptr, size_t size, int flags = 0) const;
-    ssize_t recv_data(uint8_t* recv_data_ptr, size_t size, int flags = 0) const;
-
-protected:
-    explicit connection(int connected_socket);
-    ~connection();
-
-    int socket_fd{ -1 };
-};
-
-} // namespace net
diff --git a/src/common/comm/l0/devices/communication_structs/ipc_client.cpp b/src/common/comm/l0/devices/communication_structs/ipc_client.cpp
deleted file mode 100644
index 32d2f6ac9..000000000
--- a/src/common/comm/l0/devices/communication_structs/ipc_client.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include <stdexcept>
-#include <fcntl.h>
-#include <algorithm>
-
-#include "common/log/log.hpp"
-#include "common/comm/l0/devices/communication_structs/ipc_client.hpp"
-#include "common/comm/l0/devices/communication_structs/ipc_connection.hpp"
-
-namespace net {
-
-ipc_client::~ipc_client() {
-    stop_all();
-}
-
-std::shared_ptr<ipc_tx_connection> ipc_client::create_connection(const std::string& addr) {
-    LOG_DEBUG("Create or find existing connection to: ", addr);
-    auto it = connections.find(addr);
-    if (it != connections.end()) {
-        LOG_DEBUG("Get existing conenction");
-        return it->second;
-    }
-
-    std::shared_ptr<ipc_tx_connection> tx_conn;
-    try {
-        tx_conn.reset(new ipc_tx_connection(addr));
-    }
-    catch (const std::exception& ex) {
-        LOG_ERROR(
-            "Cannot create TX connection to other IPC server on: ", addr, ", error: ", ex.what());
-        throw;
-    }
-
-    connections.emplace(addr, tx_conn);
-
-    LOG_DEBUG("Connections created, total tx connections: ", connections.size());
-    return tx_conn;
-}
-
-bool ipc_client::stop_all() {
-    LOG_DEBUG("Stop connections: ", connections.size());
-    for (auto& conn_pair : connections) {
-        LOG_DEBUG("schedule stop connection to: ", conn_pair.first);
-        conn_pair.second.reset();
-    }
-    return true;
-}
-} // namespace net
diff --git a/src/common/comm/l0/devices/communication_structs/ipc_client.hpp b/src/common/comm/l0/devices/communication_structs/ipc_client.hpp
deleted file mode 100644
index 7a0869d88..000000000
--- a/src/common/comm/l0/devices/communication_structs/ipc_client.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <unistd.h>
-
-#include <memory>
-#include <string>
-
-namespace net {
-
-class ipc_tx_connection;
-
-class ipc_client {
-public:
-    ~ipc_client();
-
-    std::shared_ptr<ipc_tx_connection> create_connection(const std::string& addr);
-    bool stop_all();
-
-private:
-    std::map<std::string, std::shared_ptr<ipc_tx_connection>> connections;
-};
-} // namespace net
diff --git a/src/common/comm/l0/devices/communication_structs/ipc_connection.cpp b/src/common/comm/l0/devices/communication_structs/ipc_connection.cpp
deleted file mode 100644
index d9dbb09a1..000000000
--- a/src/common/comm/l0/devices/communication_structs/ipc_connection.cpp
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include <stdexcept>
-
-#include "common/log/log.hpp"
-#include "common/comm/l0/devices/communication_structs/ipc_connection.hpp"
-
-#include "oneapi/ccl/native_device_api/export_api.hpp"
-#include "oneapi/ccl/native_device_api/l0/base_impl.hpp"
-#include "oneapi/ccl/native_device_api/l0/platform.hpp"
-#include "oneapi/ccl/native_device_api/l0/context.hpp"
-
-namespace net {
-
-// Rx receive IPC data from IPC_SOURCE_DEVICE
-ipc_rx_connection::ipc_rx_connection(int socket) : connection(socket) {
-    //disable WRITE
-    shutdown(socket_fd, SHUT_WR);
-}
-
-std::vector<std::shared_ptr<native::ccl_device::device_ipc_memory_handle>>
-ipc_rx_connection::receive_ipc_memory(std::vector<uint8_t>& out_data_resized,
-                                      size_t& out_received_rank) const {
-    std::vector<std::shared_ptr<native::ccl_device::device_ipc_memory_handle>> ret =
-        receive_ipc_memory_ext(out_data_resized, sizeof(size_t));
-
-    out_received_rank = *(reinterpret_cast<const size_t*>(out_data_resized.data()));
-    LOG_DEBUG("Received IPC handles count: ", ret.size(), ", from rank: ", out_received_rank);
-    return ret;
-}
-
-// For properly call resize data with *_resized before for expected sizes
-std::vector<std::shared_ptr<native::ccl_device::device_ipc_memory_handle>>
-ipc_rx_connection::receive_ipc_memory_ext(std::vector<uint8_t>& out_data_resized,
-                                          size_t out_data_offset_bytes) const {
-    LOG_DEBUG("Try to receive ip memory for expected bytes count: ", out_data_resized.size());
-
-    constexpr size_t handle_size =
-        native::ccl_device::device_ipc_memory_handle::get_size_for_serialize();
-
-    size_t handles_count = (out_data_resized.size() - out_data_offset_bytes) / handle_size;
-    size_t bytes_rest = (out_data_resized.size() - out_data_offset_bytes) % handle_size;
-
-    std::vector<std::shared_ptr<native::ccl_device::device_ipc_memory_handle>> ret;
-    LOG_DEBUG("Expected receive bytes: ",
-              out_data_resized.size(),
-              ", handles count: ",
-              handles_count,
-              ", bytes in rest: ",
-              bytes_rest);
-    try {
-        if (bytes_rest != 0) {
-            throw std::runtime_error(
-                std::string("Unexpected bytes to receive: ") +
-                std::to_string(out_data_resized.size()) +
-                ", handles count: " + std::to_string(handles_count) +
-                ", bytes in rest should be zero, got: " + std::to_string(bytes_rest));
-        }
-
-        // receive data
-        std::vector<connection::fd_t> out_pids_resized(handles_count, 0);
-        ret = receive_raw_ipc_memory_ext(out_data_resized, out_pids_resized, out_data_offset_bytes);
-
-        LOG_DEBUG("Received IPC handles: ", ret.size(), ", with offset: ", out_data_offset_bytes);
-        size_t num_handles = 0;
-        for (auto& handle : ret) {
-            // override duplicated fd by SCM_RIGHTS
-            connection::fd_t* pid_ptr = reinterpret_cast<connection::fd_t*>(handle->get_ptr());
-            connection::fd_t new_pid = out_pids_resized[num_handles];
-
-            LOG_DEBUG("Override FD: ",
-                      *pid_ptr,
-                      ", with new FD: ",
-                      new_pid,
-                      ", for IPC handle num: ",
-                      num_handles);
-            *pid_ptr = new_pid;
-            num_handles++;
-        }
-    }
-    catch (const std::exception& ex) {
-        LOG_ERROR("Cannot receive IPC handles, error:\n", ex.what());
-        throw;
-    }
-    return ret;
-}
-
-std::vector<std::shared_ptr<native::ccl_device::device_ipc_memory_handle>>
-ipc_rx_connection::receive_raw_ipc_memory(std::vector<uint8_t>& out_data_resized,
-                                          std::vector<connection::fd_t>& out_pids_resized,
-                                          size_t& received_rank) const {
-    std::vector<std::shared_ptr<native::ccl_device::device_ipc_memory_handle>> ret =
-        receive_raw_ipc_memory_ext(out_data_resized, out_pids_resized, sizeof(size_t));
-
-    received_rank = *(reinterpret_cast<const size_t*>(out_data_resized.data()));
-    LOG_DEBUG("Deserialized IPC handles count: ", ret.size(), ", from rank: ", received_rank);
-    return ret;
-}
-
-std::vector<std::shared_ptr<native::ccl_device::device_ipc_memory_handle>>
-ipc_rx_connection::receive_raw_ipc_memory_ext(std::vector<uint8_t>& out_data_resized,
-                                              std::vector<connection::fd_t>& out_pids_resized,
-                                              size_t out_data_offset_bytes) const {
-    LOG_DEBUG("Try to receive ip memory for expected bytes count: ",
-              out_data_resized.size(),
-              ", fd count: ",
-              out_pids_resized.size(),
-              ", with offset",
-              out_data_offset_bytes);
-    if (out_data_resized.size() < out_data_offset_bytes) {
-        LOG_ERROR("not enough bytes in out_data_resized: ",
-                  out_data_resized.size(),
-                  ", for given offset: ",
-                  out_data_offset_bytes);
-        abort();
-    }
-
-    ssize_t read_bytes = 0;
-    try {
-        read_bytes = recv_msg_with_pid_data(
-            out_data_resized, out_pids_resized, MSG_CMSG_CLOEXEC | MSG_WAITALL);
-        LOG_DEBUG("Read bytes count: ", read_bytes);
-
-        if (static_cast<size_t>(read_bytes) < out_data_resized.size()) {
-            throw std::runtime_error(std::string("Too many bytes received: ") +
-                                     std::to_string(read_bytes));
-        }
-    }
-    catch (const std::exception& ex) {
-        LOG_ERROR("Cannot receive IPC handles, error: ", ex.what());
-        throw;
-    }
-
-    //get ipc handles
-    size_t recv_data_size = out_data_resized.size();
-    const uint8_t* recv_data_start = out_data_resized.data();
-
-    recv_data_start += out_data_offset_bytes;
-    recv_data_size -= out_data_offset_bytes;
-
-    std::vector<std::shared_ptr<native::ccl_device::device_ipc_memory_handle>> ret;
-    ret.reserve(out_pids_resized.size());
-
-    size_t num_handles = 0;
-    size_t expected_handles = out_pids_resized.size();
-    std::shared_ptr<native::ccl_device_platform> ipc_platform;
-    std::shared_ptr<native::ccl_context> ctx;
-    LOG_DEBUG("Deserialize IPC handles count: ", expected_handles);
-    while (num_handles < expected_handles and recv_data_size > 0) {
-        LOG_DEBUG(
-            "Start restore handle num: ", num_handles, ", expected count: ", expected_handles);
-        try {
-            // deserialize handle
-            auto recv_ip_handle = native::ccl_device::device_ipc_memory_handle::deserialize<
-                native::ccl_device::device_ipc_memory_handle>(
-                &recv_data_start, recv_data_size, ctx, ipc_platform);
-            // remember ipc handle
-            ret.push_back(std::move(recv_ip_handle));
-        }
-        catch (const std::exception& ex) {
-            LOG_ERROR(
-                "Cannot deserialize IPC handle by index: ", num_handles, ", error:\n", ex.what());
-            throw;
-        }
-        num_handles++;
-    }
-
-    LOG_DEBUG("Deserialized IPC handles count: ", ret.size());
-    return ret;
-}
-
-// Tx receive IPC data from IPC_SOURCE_DEVICE
-ipc_tx_connection::ipc_tx_connection(const std::string& addr) : connection(-1) {
-    try {
-        socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
-        if (socket_fd == -1) {
-            throw std::runtime_error(std::string("Cannot create client socket, error: ") +
-                                     strerror(errno));
-        }
-
-        memset(&peer_addr, 0, sizeof(struct sockaddr_un));
-        peer_addr.sun_family = AF_UNIX;
-        strncpy(peer_addr.sun_path, addr.c_str(), sizeof(peer_addr.sun_path) - 1);
-        peer_addr.sun_path[sizeof(peer_addr.sun_path) - 1] = '\0';
-
-        // make connect
-        int ret = -1;
-        while ((ret = connect(
-                    socket_fd, (const struct sockaddr*)&peer_addr, sizeof(struct sockaddr_un))) ==
-               -1)
-            if (errno != EINTR && errno != EINPROGRESS) {
-                throw std::runtime_error(std::string("Cannot connect socket: ") +
-                                         std::to_string(socket_fd) + " to peer: " + addr +
-                                         ", error: " + strerror(errno));
-            }
-    }
-    catch (const std::exception& ex) {
-        LOG_ERROR(ex.what());
-        throw;
-    }
-
-    LOG_DEBUG("Socket connected: ", socket_fd, " to peer: ", addr);
-
-    //disable READ
-    shutdown(socket_fd, SHUT_RD);
-}
-
-std::vector<uint8_t> ipc_tx_connection::send_ipc_memory(
-    const std::vector<native::ccl_device::device_ipc_memory_handle>& handles,
-    size_t send_rank) const {
-    return send_ipc_memory_ext(handles, reinterpret_cast<uint8_t*>(&send_rank), sizeof(send_rank));
-}
-
-std::vector<uint8_t> ipc_tx_connection::send_ipc_memory_ext(
-    const std::vector<native::ccl_device::device_ipc_memory_handle>& handles,
-    const uint8_t* payload,
-    size_t payload_size) const {
-    LOG_DEBUG("Send IPC handles: ", handles.size(), ", payload size: ", payload_size);
-    for (const auto& h : handles) {
-        LOG_DEBUG("handle: ", native::to_string(h.get()));
-    }
-
-    std::vector<uint8_t> out_raw_data;
-    size_t out_raw_data_initial_offset_bytes = payload_size;
-
-    constexpr size_t handle_size =
-        native::ccl_device::device_ipc_memory_handle::get_size_for_serialize();
-
-    size_t bytes_to_send = handle_size * handles.size() + out_raw_data_initial_offset_bytes;
-    out_raw_data.resize(bytes_to_send);
-
-    // fill send_buf & pid buf
-    std::vector<size_t> pids_offset_bytes;
-    pids_offset_bytes.reserve(handles.size());
-
-    size_t serialize_offset = out_raw_data_initial_offset_bytes;
-    for (const auto& ipc_handle : handles) {
-        serialize_offset += ipc_handle.serialize(out_raw_data, serialize_offset);
-        pids_offset_bytes.push_back(serialize_offset -
-                                    sizeof(native::ccl_device::device_ipc_memory_handle::handle_t) -
-                                    sizeof(size_t));
-
-        LOG_DEBUG("Serialized bytes: ",
-                  serialize_offset,
-                  ", with pid offset by: ",
-                  pids_offset_bytes.back());
-    }
-
-    memcpy(reinterpret_cast<uint8_t*>(out_raw_data.data()),
-           payload,
-           out_raw_data_initial_offset_bytes);
-
-    CCL_ASSERT(serialize_offset == bytes_to_send,
-               "Expected data to send and actually serialized are differ");
-
-    ssize_t send_bytes = 0;
-    try {
-        send_bytes = connection::send_msg_with_pid_data(out_raw_data, pids_offset_bytes);
-    }
-    catch (const std::exception& ex) {
-        LOG_ERROR("Cannot send IPC handles, error: ", ex.what());
-        throw;
-    }
-
-    LOG_DEBUG("Handles serialized count: ",
-              handles.size(),
-              ", data bytes: ",
-              serialize_offset,
-              ", sent bytes: ",
-              send_bytes);
-    return out_raw_data;
-}
-} // namespace net
diff --git a/src/common/comm/l0/devices/communication_structs/ipc_connection.hpp b/src/common/comm/l0/devices/communication_structs/ipc_connection.hpp
deleted file mode 100644
index a02516f35..000000000
--- a/src/common/comm/l0/devices/communication_structs/ipc_connection.hpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <vector>
-#include "oneapi/ccl/native_device_api/l0/device.hpp"
-#include "common/comm/l0/devices/communication_structs/connection.hpp"
-
-namespace net {
-
-// Rx receive IPC data from IPC_SOURCE_DEVICE
-class ipc_rx_connection : public connection {
-public:
-    explicit ipc_rx_connection(int socket);
-
-    // For properly call resize data with *_resized before for expected sizes
-    std::vector<std::shared_ptr<native::ccl_device::device_ipc_memory_handle>> receive_ipc_memory(
-        std::vector<uint8_t>& out_data_resized,
-        size_t& out_received_rank) const;
-
-    // For properly call resize data with *_resized before for expected sizes
-    std::vector<std::shared_ptr<native::ccl_device::device_ipc_memory_handle>>
-    receive_ipc_memory_ext(std::vector<uint8_t>& out_data_resized,
-                           size_t out_data_offset_bytes) const;
-
-    // For properly call resize data with *_resized before for expected sizes
-    std::vector<std::shared_ptr<native::ccl_device::device_ipc_memory_handle>>
-    receive_raw_ipc_memory(std::vector<uint8_t>& out_data_resized,
-                           std::vector<connection::fd_t>& out_pids_resized,
-                           size_t& out_rank) const;
-
-    // For properly call resize data with *_resized before for expected sizes
-    std::vector<std::shared_ptr<native::ccl_device::device_ipc_memory_handle>>
-    receive_raw_ipc_memory_ext(std::vector<uint8_t>& out_data_resized,
-                               std::vector<connection::fd_t>& out_pids_resized,
-                               size_t out_data_offset_bytes) const;
-};
-
-// Tx receive IPC data from IPC_SOURCE_DEVICE
-class ipc_tx_connection : public connection {
-public:
-    explicit ipc_tx_connection(const std::string& addr);
-
-    std::vector<uint8_t> send_ipc_memory(
-        const std::vector<native::ccl_device::device_ipc_memory_handle>& handles,
-        size_t send_rank) const;
-    std::vector<uint8_t> send_ipc_memory_ext(
-        const std::vector<native::ccl_device::device_ipc_memory_handle>& handles,
-        const uint8_t* payload = nullptr,
-        size_t payload_size = 0) const;
-
-private:
-    sockaddr_un peer_addr{};
-};
-} // namespace net
diff --git a/src/common/comm/l0/devices/communication_structs/ipc_server.cpp b/src/common/comm/l0/devices/communication_structs/ipc_server.cpp
deleted file mode 100644
index 4259b8b99..000000000
--- a/src/common/comm/l0/devices/communication_structs/ipc_server.cpp
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include <stdexcept>
-#include <fcntl.h>
-#include <algorithm>
-
-#include "common/log/log.hpp"
-#include "common/comm/l0/devices/communication_structs/ipc_server.hpp"
-#include "common/comm/l0/devices/communication_structs/ipc_connection.hpp"
-
-namespace net {
-
-ipc_server::~ipc_server() {
-    stop();
-    unlink(server_shared_name.c_str());
-}
-
-void ipc_server::start(const std::string& path, int expected_backlog_size) {
-    LOG_INFO("Starting IPC server on addr: ", path);
-    if (is_ready()) {
-        throw std::runtime_error(std::string("Cannot restart ipc server with addr: ") + path);
-    }
-
-    size_t path_size_limit = sizeof(server_addr.sun_path) - 1;
-    if (path.size() > path_size_limit) {
-        throw std::runtime_error(std::string("Cannot start ipc server on requested addr: ") + path +
-                                 " - addr size if too long: " + std::to_string(path.size()) +
-                                 ", expected: " + std::to_string(path_size_limit));
-    }
-    path_size_limit = std::min(path_size_limit, path.size());
-
-    LOG_TRACE("Reset previously locked handle");
-    unlink(path.c_str());
-
-    try {
-        listen_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
-        if (listen_fd == -1) {
-            throw std::runtime_error(std::string("Cannot create socket, error: ") +
-                                     strerror(errno));
-        }
-
-        // set non blocking
-        int fileflags = fcntl(listen_fd, F_GETFL, 0);
-        if (fileflags == -1) {
-            throw std::runtime_error(std::string("Cannot get fcntl socket flags, error: ") +
-                                     strerror(errno));
-        }
-        if (fcntl(listen_fd, F_SETFL, fileflags | O_NONBLOCK) == -1) {
-            throw std::runtime_error(std::string("Cannot set non-blocking socket, error: ") +
-                                     strerror(errno));
-        }
-
-        //allow reuse
-        int enable = 1;
-        if (setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int)) < 0) {
-            throw std::runtime_error(std::string("Cannot set reuse socket, error: ") +
-                                     strerror(errno));
-        }
-
-        memset(&server_addr, 0, sizeof(struct sockaddr_un));
-        server_addr.sun_family = AF_UNIX;
-        strncpy(server_addr.sun_path, path.c_str(), path_size_limit);
-        server_addr.sun_path[path_size_limit] = '\0';
-
-        int ret = bind(listen_fd, (const struct sockaddr*)&server_addr, sizeof(struct sockaddr_un));
-        if (ret == -1) {
-            throw std::runtime_error(std::string("Cannot bind socket by addr: ") + path +
-                                     ", error: " + strerror(errno));
-        }
-
-        ret = listen(listen_fd, expected_backlog_size);
-        if (ret == -1) {
-            throw std::runtime_error(std::string("Cannot start listen socket by addr: ") + path +
-                                     ", error: " + strerror(errno));
-        }
-
-        server_shared_name = path;
-    }
-    catch (const std::exception& ex) {
-        LOG_ERROR(ex.what());
-        throw;
-    }
-}
-
-bool ipc_server::stop() {
-    bool ret = false;
-    if (is_ready()) {
-        LOG_DEBUG("Gracefully stop listener: ", listen_fd);
-        shutdown(listen_fd, SHUT_RDWR);
-        close(listen_fd);
-        listen_fd = -1;
-        ret = true;
-    }
-    else {
-        LOG_DEBUG("Nothing to stop");
-    }
-    return ret;
-}
-
-bool ipc_server::is_ready() const noexcept {
-    return listen_fd != -1;
-}
-
-std::unique_ptr<ipc_rx_connection> ipc_server::process_connection() {
-    if (!is_ready()) {
-        throw std::runtime_error(std::string(__FUNCTION__) + " - failed, ipc server is not ready");
-    }
-
-    std::unique_ptr<ipc_rx_connection> ret;
-
-    int fd = accept(listen_fd, nullptr, nullptr);
-    if (fd == -1) {
-        if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
-            throw std::runtime_error(std::string(__FUNCTION__) +
-                                     " - failed, accept failed on socket: " +
-                                     std::to_string(listen_fd) + ", error: " + strerror(errno));
-        }
-        LOG_TRACE("Nothing to accept on socket:", listen_fd);
-    }
-    else {
-        ret.reset(new ipc_rx_connection(fd));
-    }
-
-    return ret;
-}
-} // namespace net
diff --git a/src/common/comm/l0/devices/communication_structs/ipc_server.hpp b/src/common/comm/l0/devices/communication_structs/ipc_server.hpp
deleted file mode 100644
index 34357e062..000000000
--- a/src/common/comm/l0/devices/communication_structs/ipc_server.hpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <unistd.h>
-
-#include <memory>
-#include <string>
-
-namespace net {
-
-class ipc_rx_connection;
-
-class ipc_server {
-public:
-    ~ipc_server();
-
-    void start(const std::string& path, int expected_backlog_size = 0);
-    bool stop();
-    bool is_ready() const noexcept;
-
-    std::unique_ptr<ipc_rx_connection> process_connection();
-
-private:
-    int listen_fd{ -1 };
-    sockaddr_un server_addr{};
-    std::string server_shared_name{};
-};
-} // namespace net
diff --git a/src/common/comm/l0/devices/devices_declaration.hpp b/src/common/comm/l0/devices/devices_declaration.hpp
deleted file mode 100644
index 061dfae43..000000000
--- a/src/common/comm/l0/devices/devices_declaration.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include "common/comm/l0/devices/ccl_gpu_comm.hpp"
-#include "common/comm/l0/devices/ccl_ipc_gpu_comm.hpp"
-#include "common/comm/l0/devices/ccl_virtual_gpu_comm.hpp"
-#include "common/comm/l0/devices/ccl_concurrent_gpu_comm.hpp"
-#include "common/comm/l0/devices/ccl_ipc_source_gpu_comm.hpp"
-#include "common/comm/l0/devices/ccl_numa_proxy.hpp"
-#include "common/comm/l0/devices/ccl_gpu_scaleup_proxy.hpp"
-#include "common/comm/l0/devices/ccl_scaleout_proxy.hpp"
diff --git a/src/common/comm/l0/devices/proxy_observer.hpp b/src/common/comm/l0/devices/proxy_observer.hpp
deleted file mode 100644
index 7fcbe6a8d..000000000
--- a/src/common/comm/l0/devices/proxy_observer.hpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/device_types.hpp"
-
-namespace native {
-
-template <class device_t>
-class proxy_observer {
-public:
-    using impl = device_t;
-    using type_idx_t = typename std::underlying_type<gpu_types>::type;
-
-    static constexpr type_idx_t idx() {
-        return impl::type_idx() - gpu_types::SCALING_PROXY_GPU_TYPES;
-    }
-
-    template <class... Args>
-    void notify(Args&&... args) {
-        get_this()->notify_impl(std::forward<Args>(args)...);
-    }
-
-    impl* get_this() {
-        return static_cast<device_t*>(this);
-    }
-
-    const impl* get_this() const {
-        return static_cast<const impl*>(this);
-    }
-
-private:
-};
-} // namespace native
diff --git a/src/common/comm/l0/devices/proxy_observer_types.hpp b/src/common/comm/l0/devices/proxy_observer_types.hpp
deleted file mode 100644
index db03ef8d9..000000000
--- a/src/common/comm/l0/devices/proxy_observer_types.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include "proxy_observer.hpp"
-
-namespace native {
-struct device_group_context;
-struct thread_group_context;
-struct process_group_context;
-
-template <class... Args>
-struct invoke_param_type_pack {
-    using params_t = std::tuple<Args...>;
-};
-
-template <class impl, class... context_t>
-class proxy_multiple_observer : public proxy_observer<impl> {
-    using registered_contexts = std::tuple<typename std::add_pointer<context_t>::type...>;
-    registered_contexts contexts;
-
-public:
-    template <ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class ctx_reg_t,
-              class ctx_t>
-    void assign(ctx_reg_t& ctx_to_reg, ctx_t& ctx) {
-        // context linkage
-        ctx.template attach<group_id, class_id>(static_cast<impl*>(this));
-        std::get<::utils::enum_to_underlying(group_id)>(contexts) = &ctx_to_reg;
-    }
-
-    template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-    void reassign_with_addr(size_t rank_addr) {
-        auto& ctx_type = std::get<::utils::enum_to_underlying(group_id)>(contexts);
-        assert(ctx_type && "Attempt to reassign observer on non-initializer context");
-
-        auto* scaling_ctx = ctx_type->template dispatch_context<impl>();
-        scaling_ctx->template reattach_with_addr<group_id, class_id>(rank_addr,
-                                                                     static_cast<impl*>(this));
-    }
-
-    template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class... Args>
-    void invoke(Args&&... args) {
-        //use context to invoke/register proxy jobs
-
-        auto& ctx_type = std::get<::utils::enum_to_underlying(group_id)>(contexts);
-
-        assert(ctx_type && "Attempt to invoke observer on non-initializer context");
-        auto* scaling_ctx = ctx_type->template dispatch_context<impl>();
-        scaling_ctx->template invoke_proxy<group_id, class_id, impl>(static_cast<impl*>(this),
-                                                                     std::forward<Args>(args)...);
-    }
-};
-
-template <class impl_t>
-using proxy_observer_specific = proxy_multiple_observer<impl_t, device_group_context,
-                                                                thread_group_context,
-                                                                process_group_context
-/*
-                                                                device_group_context,
-                                                                thread_group_context,
-                                                                process_group_context
- */                                                               >;
-} // namespace native
diff --git a/src/common/comm/l0/gpu_comm_attr.cpp b/src/common/comm/l0/gpu_comm_attr.cpp
deleted file mode 100644
index 7b348668d..000000000
--- a/src/common/comm/l0/gpu_comm_attr.cpp
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "common/comm/l0/gpu_comm_attr.hpp"
-#include "common/comm/l0/device_community.hpp"
-#include "common/comm/comm_interface.hpp"
-#include "common/comm/l0/context/process_group_ctx.hpp"
-
-#include "common/comm/host_communicator/host_communicator.hpp"
-
-namespace ccl {
-
-thread_local size_t gpu_comm_attr::thread_id = 0;
-
-gpu_comm_attr::gpu_comm_attr(std::shared_ptr<host_communicator> parent_comm,
-                             size_t thread_count,
-                             size_t process_device_size,
-                             group_unique_key id)
-        : ccl_communicator(parent_comm),
-          expected_threads_count(thread_count),
-          expected_process_device_size(process_device_size),
-          unique_id(id) {
-    ctx = std::make_shared<native::process_group_context>(ccl_communicator);
-}
-
-const group_unique_key& gpu_comm_attr::get_unique_id() const {
-    return unique_id;
-}
-
-std::shared_ptr<host_communicator> gpu_comm_attr::get_host_communicator() {
-    return ccl_communicator;
-}
-
-bool gpu_comm_attr::sync_group_size(size_t device_group_size) {
-    std::unique_lock<std::mutex> lock(thread_group_size_mutex);
-
-    thread_id = thread_device_group_sizes.size();
-
-    thread_device_group_sizes.push_back(device_group_size);
-    if (thread_device_group_sizes.size() != expected_threads_count) {
-        // slave threads
-        thread_group_size_cond.wait(lock, [this]() {
-            return ready;
-        });
-        return false;
-    }
-
-    // master thread
-    ccl::stream::impl_value_t empty_stream{};
-    ccl_communicator->barrier_impl(empty_stream, ccl::default_barrier_attr, {});
-
-    ready = true;
-    thread_group_size_cond.notify_all();
-    return true;
-}
-
-gpu_comm_attr::~gpu_comm_attr() {}
-
-bool gpu_comm_attr::sync_register_communicator(std::shared_ptr<communicator_interface> comm) {
-    if (!delegate_sync_register_communicator(comm)) {
-        //SlAVE threads or non-completed ccommunicators count
-        if (barrier.communicator_ready) {
-            // SLAVE thread
-            LOG_DEBUG("Process Group for thread(SLAVE) id: ", thread_id, " is ready");
-        }
-        return false;
-    }
-
-    // MASTER thread
-    LOG_DEBUG("Process Group for thread(MASTER) id: ", thread_id, " is ready");
-    return true;
-}
-
-bool gpu_comm_attr::delegate_sync_register_communicator(
-    std::shared_ptr<communicator_interface> comm) {
-    ccl::device_indices_type device_group_indices;
-
-    std::unique_lock<std::mutex> lock(barrier.thread_group_mutex);
-
-    //sanity check for formed group
-    if (thread_communicators.count(thread_id) >= thread_device_group_sizes[thread_id]) {
-        //find device_id, we cannot add more device into group than expected
-        auto range = thread_communicators.equal_range(thread_id);
-        auto it = std::find_if(
-            range.first,
-            range.second,
-            [&comm](const typename thread_comm_storage::value_type& existing_comm) {
-                return comm->get_device_path() == existing_comm.second->get_device_path();
-            });
-
-        if (it == range.second) {
-            LOG_ERROR("Attempt to create communicator by new device id: ",
-                      comm->get_device_path(),
-                      " in fully formed comm_group is unaccepted!");
-            throw ccl::exception("cannot create communicator for requested device");
-        }
-
-        //set rank & size for duplicated communicator
-        comm->visit(*this);
-        return true;
-    }
-
-    //group is not formed yet
-    thread_communicators.insert({ thread_id, comm });
-    size_t registered = thread_communicators.count(thread_id);
-    size_t expected = thread_device_group_sizes[thread_id];
-    LOG_DEBUG("Thread id: ",
-              thread_id,
-              " register communicators count: [",
-              registered,
-              "/",
-              expected,
-              "]");
-    if (registered != expected) {
-        return false; //comm group is not reached expected size
-    }
-
-    // current thread create all own communicators, start sync context
-    auto range = thread_communicators.equal_range(thread_id);
-    for (auto it = range.first; it != range.second; ++it) {
-        device_group_indices.insert(
-            it->second->get_device_path()); //.get_device_properties().deviceId);
-    }
-    {
-        /* TODO: enable back */
-        // std::stringstream ss;
-        // for(const auto &path : device_group_indices)
-        // {
-        //     ss << path << ", ";
-        // }
-        // LOG_DEBUG("Thread id: ", thread_id, " collected device indices: ", ss.str());
-    }
-
-    //bind addr
-    context_comm_addr bind_thread_addr;
-    bind_thread_addr.thread_idx = thread_id;
-    bind_thread_addr.thread_count = expected_threads_count;
-    if (!ctx->sync_barrier(device_group_indices, bind_thread_addr)) {
-        //SLAVE thread waits
-        LOG_DEBUG("Thread (SLAVE) id: ", thread_id, " waits on barrier");
-        barrier.thread_group_sync_condition.wait(lock, [this]() {
-            return barrier.communicator_ready;
-        });
-
-        //flush cache
-        auto ready_count = std::count_if(thread_communicators.begin(),
-                                         thread_communicators.end(),
-                                         [](const typename thread_comm_storage::value_type& comm) {
-                                             return comm.second->is_ready();
-                                         });
-        if ((size_t)ready_count != expected_process_device_size) {
-            LOG_ERROR("Thread(SLAVE) id: ",
-                      thread_id,
-                      " not all communicators ready: (",
-                      ready_count,
-                      "/",
-                      expected_process_device_size,
-                      "). Abort");
-            abort();
-        }
-        LOG_DEBUG("Thread(SLAVE) id: ",
-                  thread_id,
-                  " detected communicators ready: (",
-                  ready_count,
-                  "/",
-                  expected_process_device_size,
-                  ")");
-        return false;
-    }
-
-    //MASTER threads
-
-    //finalize communicator creation
-    LOG_INFO("Finalize communicators creation, total count:", thread_communicators.size());
-    for (auto comm_it = thread_communicators.begin(); comm_it != thread_communicators.end();
-         ++comm_it) {
-        comm_it->second->visit(*this);
-    }
-
-    ccl::stream::impl_value_t empty_stream{};
-    ccl_communicator->barrier_impl(empty_stream, ccl::default_barrier_attr, {});
-
-    //notify SLAVES thread ready
-    barrier.communicator_ready = true;
-    barrier.thread_group_sync_condition.notify_all();
-    return true;
-}
-
-size_t gpu_comm_attr::get_expected_process_device_size() const noexcept {
-    return expected_process_device_size;
-}
-
-std::shared_ptr<native::process_group_context> gpu_comm_attr::get_process_context() {
-    return ctx;
-}
-
-} // namespace ccl
diff --git a/src/common/comm/l0/gpu_comm_attr.hpp b/src/common/comm/l0/gpu_comm_attr.hpp
deleted file mode 100644
index 99bd1e75e..000000000
--- a/src/common/comm/l0/gpu_comm_attr.hpp
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <map>
-#include <memory>
-#include <mutex>
-#include <condition_variable>
-
-#include "oneapi/ccl/types.hpp"
-#include "common/comm/l0/device_group_routing_schema.hpp"
-#include "common/comm/l0/context/context_barrier.hpp"
-
-#include "common/comm/l0/comm_context_id.hpp"
-#include "common/comm/l0/context_comm_addr.hpp"
-
-namespace native {
-struct process_group_context;
-struct thread_group_context;
-} // namespace native
-
-namespace ccl {
-namespace v1 {
-class communicator;
-}
-class host_communicator;
-struct communicator_interface;
-
-struct gpu_comm_attr {
-public:
-    friend class comm_group;
-
-    using thread_comm_storage = std::multimap<size_t, std::shared_ptr<communicator_interface>>;
-
-    gpu_comm_attr(std::shared_ptr<host_communicator> parent_comm,
-                  size_t thread_count,
-                  size_t process_device_size,
-                  group_unique_key id);
-    ~gpu_comm_attr();
-
-    std::shared_ptr<::native::process_group_context> get_process_context();
-    bool sync_group_size(size_t device_group_size);
-    bool sync_register_communicator(std::shared_ptr<communicator_interface> comm);
-
-    std::shared_ptr<host_communicator> get_host_communicator();
-
-    const group_unique_key& get_unique_id() const;
-    size_t get_expected_process_device_size() const noexcept;
-
-private:
-    bool delegate_sync_register_communicator(std::shared_ptr<communicator_interface> comm);
-
-    std::shared_ptr<host_communicator> ccl_communicator;
-    size_t expected_threads_count;
-    size_t expected_process_device_size;
-    group_unique_key unique_id;
-    std::shared_ptr<::native::process_group_context> ctx;
-
-    std::mutex thread_group_size_mutex;
-    std::condition_variable thread_group_size_cond;
-    std::vector<size_t> thread_device_group_sizes;
-    thread_comm_storage thread_communicators;
-    bool ready = false;
-
-    ::native::signal_context barrier;
-
-    //context_comm_addr bind_thread_addr;
-
-    static thread_local size_t thread_id;
-};
-} // namespace ccl
diff --git a/src/common/comm/l0/gpu_comm_utils.hpp b/src/common/comm/l0/gpu_comm_utils.hpp
deleted file mode 100644
index 62177e9ca..000000000
--- a/src/common/comm/l0/gpu_comm_utils.hpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <functional>
-
-#include "supported_topologies.hpp"
-#include "coll/algorithms/algorithms_enum.hpp"
-#include "oneapi/ccl/native_device_api/export_api.hpp"
-#include "common/comm/l0/modules/modules_source_data.hpp"
-
-namespace native {
-
-inline std::size_t module_hash(ccl_coll_type module_type,
-                               ccl::group_split_type group_id,
-                               ccl::device_topology_type class_id) {
-    std::string str = std::string(ccl_coll_type_to_str(module_type)) + "," + ::to_string(group_id) +
-                      ::to_string(class_id);
-    return std::hash<std::string>{}(str);
-}
-
-template <class communicator_type>
-struct module_loader {
-    template <ccl_coll_type... modules_types>
-    void load_modules(const modules_src_container<modules_types...>& sources) {
-        ccl_tuple_for_each(sources.get_modules(), *this);
-    }
-
-    // load specific module type
-    template <ccl_coll_type module_type>
-    void operator()(const typed_source_data_storage_t<module_type>& sources) {
-        for (auto it = sources.begin(); it != sources.end(); ++it) {
-            switch (it->first) {
-                case ccl::device_topology_type::ring:
-                    load_module_impl<module_type,
-                                     ccl::device_topology_type::ring,
-                                     ccl::group_split_type::thread,
-                                     ccl::group_split_type::process,
-                                     ccl::group_split_type::cluster>(it->second);
-                    break;
-                case ccl::device_topology_type::a2a:
-                    load_module_impl<module_type,
-                                     ccl::device_topology_type::a2a,
-                                     ccl::group_split_type::thread,
-                                     ccl::group_split_type::process,
-                                     ccl::group_split_type::cluster>(it->second);
-                    break;
-                default:
-                    throw std::runtime_error(std::string("unknown topology class: ") +
-                                             std::to_string(it->first));
-            }
-        }
-    }
-
-private:
-    communicator_type* get_this() {
-        return static_cast<communicator_type*>(this);
-    }
-
-    template <ccl_coll_type module_type,
-              ccl::device_topology_type class_id,
-              ccl::group_split_type... topology_types>
-    void load_module_impl(const source_data_t& module_data) {
-        LOG_DEBUG("Started loading module \"",
-                  ccl_coll_type_to_str(module_type),
-                  "\" for topology: \"",
-                  ::to_string(class_id),
-                  "\", for: ",
-                  communicator_type::name_impl())
-
-        ze_module_desc_t module_description;
-        module_description.stype = ZE_STRUCTURE_TYPE_MODULE_DESC;
-        module_description.pNext = nullptr;
-        module_description.format = ZE_MODULE_FORMAT_IL_SPIRV;
-        module_description.inputSize = module_data.size();
-        module_description.pInputModule = module_data.data();
-        module_description.pBuildFlags = nullptr;
-        module_description.pConstants = nullptr;
-
-        //compile modules TODO ring only
-        std::array<std::string, sizeof...(topology_types)> logs{
-            get_this()->template create_module_impl<module_type, topology_types, class_id>(
-                module_description)...
-        };
-
-        std::string accumulated_log;
-        if (!logs.empty()) {
-            accumulated_log =
-                std::accumulate(logs.begin(), logs.end(), std::string("\nLoading log:\n"));
-        }
-        LOG_DEBUG("Finished loading module \"",
-                  ccl_coll_type_to_str(module_type),
-                  "\" for topology: \"",
-                  ::to_string(class_id),
-                  "\" for: ",
-                  communicator_type::name_impl(),
-                  accumulated_log);
-    }
-};
-} // namespace native
diff --git a/src/common/comm/l0/gpu_device_types.hpp b/src/common/comm/l0/gpu_device_types.hpp
deleted file mode 100644
index d43c80b5e..000000000
--- a/src/common/comm/l0/gpu_device_types.hpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include "common/comm/l0/device_types.hpp"
-#include "common/comm/l0/device_types_fwd.hpp"
-#include "common/comm/l0/device_containers.hpp"
-#include "common/comm/l0/device_containers_utils.hpp"
-
-namespace native {
-
-using specific_device_storage = device_storage_t<SUPPORTED_DEVICES_DECL_LIST>;
-using specific_plain_device_storage = plain_device_storage<SUPPORTED_DEVICES_DECL_LIST>;
-using specific_indexed_device_storage = indexed_device_storage<SUPPORTED_DEVICES_DECL_LIST>;
-using specific_device_variant_t = device_variant_t<SUPPORTED_DEVICES_DECL_LIST>;
-
-} // namespace native
diff --git a/src/common/comm/l0/modules/a2a/allreduce_export_functions.hpp b/src/common/comm/l0/modules/a2a/allreduce_export_functions.hpp
deleted file mode 100644
index 392d9c316..000000000
--- a/src/common/comm/l0/modules/a2a/allreduce_export_functions.hpp
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/modules/kernel_functions.hpp"
-
-namespace native {
-
-namespace a2a {
-
-namespace allreduce {
-
-/**
- * Common args for all kernel types
- */
-
-// own
-using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-template <class native_t>
-using send_buf_arg = arg<main_kernel_args::args_start_index + 1, native_t*>;
-
-template <class native_t>
-using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, native_t*>;
-
-template <class native_t>
-using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 3, native_t*>;
-
-using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 4, int*>;
-using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 5, int*>;
-using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
-using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
-
-// right
-template <class native_t>
-using right_tmp_recv_buf_arg =
-    thread_exchangable_arg<main_kernel_args::args_start_index + 7, native_t*>;
-
-using right_income_data_flag_arg =
-    thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>;
-
-using right_ready_to_recv_flag_arg =
-    thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>;
-
-// IMPORTANT: the number and types of arguments must be the same in all classes,
-// excluding arguments specific for numa/scaleout etc.
-struct main_kernel : public execution_kernel<main_kernel,
-                                             send_buf_size_arg,
-                                             send_buf_arg<void>,
-                                             recv_buf_arg<void>,
-                                             tmp_recv_buf_arg<void>,
-                                             income_data_flag_arg,
-                                             ready_to_recv_flag_arg,
-                                             local_barrier_flag_arg,
-                                             right_tmp_recv_buf_arg<void>,
-                                             right_income_data_flag_arg,
-                                             right_ready_to_recv_flag_arg> {
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "allreduce_execution";
-    }
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    using base = execution_kernel<main_kernel,
-                                  send_buf_size_arg,
-                                  send_buf_arg<processing_type>,
-                                  recv_buf_arg<processing_type>,
-                                  tmp_recv_buf_arg<processing_type>,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg<processing_type>,
-                                  right_income_data_flag_arg,
-                                  right_ready_to_recv_flag_arg>;
-
-    using base::base;
-};
-
-struct numa_kernel
-        : public execution_kernel<numa_kernel,
-                                  send_buf_size_arg,
-                                  send_buf_arg<void>,
-                                  recv_buf_arg<void>,
-                                  tmp_recv_buf_arg<void>,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg<void>,
-                                  right_income_data_flag_arg,
-                                  right_ready_to_recv_flag_arg,
-
-                                  // numa-specific args
-                                  permanent_arg<main_kernel_args::args_start_index + 10, void*>,
-                                  permanent_arg<main_kernel_args::args_start_index + 11, int*>> {
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "allreduce_execution_numa";
-    }
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    // event data
-    using event_prod_chunk_mem_arg =
-        permanent_arg<main_kernel_args::args_start_index + 10, processing_type*>;
-    using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type;
-
-    using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 11, int*>;
-    using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type;
-
-    using base = execution_kernel<numa_kernel,
-                                  send_buf_size_arg,
-                                  send_buf_arg<processing_type>,
-                                  recv_buf_arg<processing_type>,
-                                  tmp_recv_buf_arg<processing_type>,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg<processing_type>,
-                                  right_income_data_flag_arg,
-                                  right_ready_to_recv_flag_arg,
-                                  event_prod_chunk_mem_arg,
-                                  event_prod_bytes_arg>;
-
-    using base::base;
-};
-
-struct ipc_kernel : public base_ipc_kernel<ipc_kernel,
-                                           stub_arg<main_kernel_args::args_start_index>,
-                                           stub_arg<main_kernel_args::args_start_index + 1>,
-                                           stub_arg<main_kernel_args::args_start_index + 2>,
-                                           tmp_recv_buf_arg<void>,
-                                           income_data_flag_arg,
-                                           ready_to_recv_flag_arg,
-                                           stub_arg<main_kernel_args::args_start_index + 6>,
-                                           stub_arg<main_kernel_args::args_start_index + 7>,
-                                           stub_arg<main_kernel_args::args_start_index + 8>,
-                                           stub_arg<main_kernel_args::args_start_index + 9>> {
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "a2a_allreduce_ipc";
-    }
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    using base = base_ipc_kernel<ipc_kernel,
-                                 stub_arg<main_kernel_args::args_start_index>,
-                                 stub_arg<main_kernel_args::args_start_index + 1>,
-                                 stub_arg<main_kernel_args::args_start_index + 2>,
-                                 tmp_recv_buf_arg<processing_type>,
-                                 income_data_flag_arg,
-                                 ready_to_recv_flag_arg,
-                                 stub_arg<main_kernel_args::args_start_index + 6>,
-                                 stub_arg<main_kernel_args::args_start_index + 7>,
-                                 stub_arg<main_kernel_args::args_start_index + 8>,
-                                 stub_arg<main_kernel_args::args_start_index + 9>>;
-
-    using base::base;
-};
-
-struct scale_out_cpu_gw_kernel
-        : public execution_kernel<scale_out_cpu_gw_kernel,
-                                  send_buf_size_arg,
-                                  send_buf_arg<void>,
-                                  recv_buf_arg<void>,
-                                  tmp_recv_buf_arg<void>,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg<void>,
-                                  right_income_data_flag_arg,
-                                  right_ready_to_recv_flag_arg,
-
-                                  // scaleout-specific args
-                                  permanent_arg<main_kernel_args::args_start_index + 10, void*>,
-                                  permanent_arg<main_kernel_args::args_start_index + 11, int*>> {
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "allreduce_execution_scale_out_cpu_gw";
-    }
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    // event data
-    using event_prod_chunk_mem_arg =
-        permanent_arg<main_kernel_args::args_start_index + 10, processing_type*>;
-    using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type;
-
-    using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 11, int*>;
-    using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type;
-
-    using base = execution_kernel<scale_out_cpu_gw_kernel,
-                                  send_buf_size_arg,
-                                  send_buf_arg<processing_type>,
-                                  recv_buf_arg<processing_type>,
-                                  tmp_recv_buf_arg<processing_type>,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg<processing_type>,
-                                  right_income_data_flag_arg,
-                                  right_ready_to_recv_flag_arg,
-                                  event_prod_chunk_mem_arg,
-                                  event_prod_bytes_arg>;
-
-    using base::base;
-};
-
-} // namespace allreduce
-} // namespace a2a
-} // namespace native
diff --git a/src/common/comm/l0/modules/a2a/allreduce_module.hpp b/src/common/comm/l0/modules/a2a/allreduce_module.hpp
deleted file mode 100644
index 41c3eb648..000000000
--- a/src/common/comm/l0/modules/a2a/allreduce_module.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/modules/a2a/allreduce_export_functions.hpp"
-#include "common/comm/l0/modules/gpu_typed_module.hpp"
-
-namespace native {
-DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module,
-                                 real_gpu_typed_module,
-                                 ccl_coll_allreduce,
-                                 ccl::device_topology_type::a2a,
-                                 a2a::allreduce::main_kernel,
-                                 a2a::allreduce::numa_kernel,
-                                 a2a::allreduce::scale_out_cpu_gw_kernel);
-
-DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module,
-                                 ipc_gpu_typed_module,
-                                 ccl_coll_allreduce,
-                                 ccl::device_topology_type::a2a,
-                                 a2a::allreduce::ipc_kernel,
-                                 a2a::allreduce::ipc_kernel,
-                                 a2a::allreduce::ipc_kernel);
-
-DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_allreduce,
-                                ccl::device_topology_type::a2a,
-                                a2a::allreduce::main_kernel,
-                                a2a::allreduce::numa_kernel,
-                                a2a::allreduce::scale_out_cpu_gw_kernel);
-
-} // namespace native
diff --git a/src/common/comm/l0/modules/base_entry_module.cpp b/src/common/comm/l0/modules/base_entry_module.cpp
deleted file mode 100644
index 4ae5330a6..000000000
--- a/src/common/comm/l0/modules/base_entry_module.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include <atomic>
-#include <map>
-#include <set>
-#include <tuple>
-
-#include "coll/coll.hpp"
-#include "oneapi/ccl/native_device_api/export_api.hpp"
-#include "common/comm/l0/modules/base_entry_module.hpp"
-#include "oneapi/ccl/native_device_api/l0/context.hpp"
-
-namespace native {
-gpu_module_base::gpu_module_base(handle module_handle) : module(module_handle) {}
-
-gpu_module_base::~gpu_module_base() {
-    release();
-}
-
-void gpu_module_base::release() {
-    //release imported functions at first
-    for (auto& f : functions) {
-        zeKernelDestroy(f.second);
-    }
-
-    // TODO: do the destroy in device.cpp through a wrapper instead of destroying the handle directly here
-    // Potentially there could be the case when we have several gpu_module_base objects referencing the
-    // same handle and such change will allow to avoid issues related to that.
-    if (module) {
-        zeModuleDestroy(module);
-        module = nullptr;
-    }
-    functions.clear();
-}
-
-gpu_module_base::handle gpu_module_base::get() const {
-    return module;
-}
-
-ze_kernel_handle_t gpu_module_base::import_kernel(const std::string& name) {
-    ze_kernel_desc_t desc = {
-        .stype = ZE_STRUCTURE_TYPE_KERNEL_DESC,
-        .pNext = nullptr,
-        .flags = 0,
-    };
-    desc.pKernelName = name.c_str();
-    ze_kernel_handle_t handle;
-
-    if (!module) {
-        return nullptr;
-    }
-
-    ze_result_t result = zeKernelCreate(module, &desc, &handle);
-    if (result != ZE_RESULT_SUCCESS) {
-        CCL_THROW("Cannot create kernel: ", name, ", error: ", native::to_string(result));
-    }
-
-    //TODO avoid duplicates
-    std::string imported_name = name + std::to_string((size_t)handle);
-    functions.emplace(std::piecewise_construct,
-                      std::forward_as_tuple(imported_name),
-                      std::forward_as_tuple(handle));
-    return handle;
-}
-
-} // namespace native
diff --git a/src/common/comm/l0/modules/base_entry_module.hpp b/src/common/comm/l0/modules/base_entry_module.hpp
deleted file mode 100644
index 8d2adf135..000000000
--- a/src/common/comm/l0/modules/base_entry_module.hpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "oneapi/ccl/native_device_api/export_api.hpp"
-#include "oneapi/ccl/type_traits.hpp"
-#include "supported_topologies.hpp"
-#include "common/comm/l0/modules/kernel_functions.hpp"
-
-namespace native {
-
-//module, collection of functions
-struct gpu_module_base {
-    using handle = ze_module_handle_t;
-    using kernel_handle = ze_kernel_handle_t;
-    using imported_kernels = std::map<std::string, kernel_handle>;
-
-    gpu_module_base(handle module_handle);
-    ~gpu_module_base();
-
-    handle get() const;
-    void release();
-    kernel_handle import_kernel(const std::string& name);
-
-    handle module;
-    imported_kernels functions;
-};
-
-//specific type module implementations:
-//1) in-process gpu module
-template <ccl_coll_type type, ccl::group_split_type topology, ccl::device_topology_type mode>
-struct device_coll_module : private gpu_module_base {
-    static constexpr ccl_coll_type get_coll_type() {
-        return type;
-    }
-    static constexpr ccl::group_split_type get_topology_type() {
-        return topology;
-    }
-    static constexpr ccl::device_topology_type get_topology_class() {
-        return mode;
-    }
-
-    device_coll_module(handle module_handle) : gpu_module_base(module_handle) {}
-};
-
-//2) out-of-process gpu module
-template <ccl_coll_type type, ccl::group_split_type topology, ccl::device_topology_type mode>
-struct ipc_dst_device_coll_module : private gpu_module_base {
-    static constexpr ccl_coll_type get_coll_type() {
-        return type;
-    }
-    static constexpr ccl::group_split_type get_topology_type() {
-        return topology;
-    }
-    static constexpr ccl::device_topology_type get_topology_class() {
-        return mode;
-    }
-
-    ipc_dst_device_coll_module(handle module_handle) : gpu_module_base(module_handle) {}
-};
-
-//3) virtual gpu module
-template <ccl_coll_type type, ccl::group_split_type topology, ccl::device_topology_type mode>
-struct virtual_device_coll_module {
-    static constexpr ccl_coll_type get_coll_type() {
-        return type;
-    }
-    static constexpr ccl::group_split_type get_topology_type() {
-        return topology;
-    }
-    static constexpr ccl::device_topology_type get_topology_class() {
-        return mode;
-    }
-
-    virtual_device_coll_module(
-        std::shared_ptr<device_coll_module<type, topology, mode>> real_module)
-            : real_module_ref(real_module) {}
-    std::shared_ptr<device_coll_module<type, topology, mode>> real_module_ref;
-};
-
-template <ccl_coll_type type, ccl::group_split_type group_id, ccl::device_topology_type class_id>
-struct coll_module_traits {
-    static constexpr ccl_coll_type coll_type() {
-        return type;
-    }
-    static constexpr ccl::group_split_type group_type() {
-        return group_id;
-    }
-    static constexpr ccl::device_topology_type topology_class() {
-        return class_id;
-    }
-};
-
-} // namespace native
diff --git a/src/common/comm/l0/modules/gpu_typed_module.hpp b/src/common/comm/l0/modules/gpu_typed_module.hpp
deleted file mode 100644
index 9825a527e..000000000
--- a/src/common/comm/l0/modules/gpu_typed_module.hpp
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include <type_traits>
-#include <memory>
-#include "common/comm/l0/modules/base_entry_module.hpp"
-#include "common/comm/l0/modules/modules_utils.hpp"
-#include "common/comm/l0/communicator/base_communicator.hpp"
-
-#include "common/comm/l0/modules/kernel_class.hpp"
-
-namespace native {
-
-template <ccl_coll_type type,
-          //   template <typename>
-          class kernel_function_impl,
-          //   template <typename>
-          class kernel_numa_function_impl,
-          //   template <typename>
-          class kernel_scale_out_cpu_gw_function_impl>
-struct real_gpu_typed_module : private gpu_module_base,
-                               public kernel_class<type, kernel_function_impl>,
-                               public kernel_class<type, kernel_numa_function_impl>,
-                               public kernel_class<type, kernel_scale_out_cpu_gw_function_impl> {
-    using handle = gpu_module_base::handle;
-
-    using main_class = kernel_class<type, kernel_function_impl>;
-    using numa_class = kernel_class<type, kernel_numa_function_impl>;
-    using scale_out_cpu_gw_class = kernel_class<type, kernel_scale_out_cpu_gw_function_impl>;
-
-    using self_t = real_gpu_typed_module;
-
-    real_gpu_typed_module(handle module_handle) : gpu_module_base(module_handle) {
-        LOG_DEBUG("Real gpu module created: ",
-                  ccl_coll_type_to_str(type),
-                  ", modules handle: ",
-                  (void*)module);
-
-        // TODO: is there a nicer way to iterate?
-        for (auto&& kernel_node : main_class::value) {
-            detail::kernel_entry_initializer<type>(
-                [this](const std::string& name) -> gpu_module_base::kernel_handle {
-                    return this->import_kernel(name);
-                })(kernel_node.second);
-        }
-        // ccl_tuple_for_each(main_class::value,
-        //                    detail::kernel_entry_initializer<type>(
-        //                        [this](const std::string& name) -> gpu_module_base::kernel_handle {
-        //                            return this->import_kernel(name);
-        //                        }));
-
-        /*ccl_tuple_for_each(numa_class::value,
-                           detail::kernel_entry_initializer<type>(
-                               [this](const std::string& name) -> gpu_module_base::kernel_handle {
-                                   return this->import_kernel(name);
-                               }));*/
-
-        LOG_DEBUG("Imported functions count: ", functions.size());
-    }
-
-    handle get() const {
-        return module;
-    }
-
-    template <class specific_kernel_class>
-    specific_kernel_class& get_class() {
-        static_assert(
-            std::is_base_of<specific_kernel_class, self_t>::value,
-            "Relationship IS-A `specific_kernel_class` for `real_gpu_typed_module` failed");
-        return static_cast<specific_kernel_class&>(*this);
-    }
-
-    ~real_gpu_typed_module() {
-        LOG_DEBUG("Real gpu module destroyed: ",
-                  ccl_coll_type_to_str(type),
-                  ", modules handle: ",
-                  (void*)module);
-    }
-};
-
-//2) virtual ipc_gpu_typed_module
-template <ccl_coll_type type,
-          //   template <typename>
-          class kernel_function_impl,
-          //   template <typename>
-          class kernel_numa_function_impl,
-          //   template <typename>
-          class kernel_scale_out_cpu_gw_function_impl>
-struct ipc_gpu_typed_module : private gpu_module_base,
-                              public kernel_class<type, kernel_function_impl> {
-    using main_class = kernel_class<type, kernel_function_impl>;
-
-    using self_t = ipc_gpu_typed_module;
-
-    using handle = gpu_module_base::handle;
-
-    ipc_gpu_typed_module(handle module_handle) : gpu_module_base(nullptr) {
-        LOG_DEBUG("Remote gpu module created: ", ccl_coll_type_to_str(type));
-        // ccl_tuple_for_each(main_class::value,
-        //                    detail::kernel_entry_initializer<type>(
-        //                        [](const std::string& name) -> gpu_module_base::kernel_handle {
-        //                            return nullptr;
-        //                        }));
-        for (auto&& kernel : main_class::value) {
-            detail::kernel_entry_initializer<type>(
-                [](const std::string& name) -> gpu_module_base::kernel_handle {
-                    return nullptr;
-                })(kernel.second);
-        }
-        LOG_DEBUG("No need to import functions");
-    }
-
-    template <class specific_kernel_class>
-    specific_kernel_class& get_class() {
-        static_assert(
-            std::is_base_of<specific_kernel_class, self_t>::value,
-            "Relationship IS-A `specific_kernel_class` for `ipc_gpu_typed_module` failed");
-        return static_cast<specific_kernel_class&>(*this);
-    }
-
-    ~ipc_gpu_typed_module() = default;
-};
-
-//3) virtual gpu module
-template <ccl_coll_type type,
-          //   template <typename>
-          class kernel_function_impl,
-          //   template <typename>
-          class kernel_numa_function_impl,
-          //   template <typename>
-          class kernel_scale_out_cpu_gw_function_impl>
-struct virtual_gpu_typed_module : private gpu_module_base,
-                                  public kernel_class<type, kernel_function_impl>,
-                                  public kernel_class<type, kernel_numa_function_impl>,
-                                  public kernel_class<type, kernel_scale_out_cpu_gw_function_impl> {
-    // TODO: use real_referenced_module to reduce given params
-    using real_referenced_module = real_gpu_typed_module<type,
-                                                         kernel_function_impl,
-                                                         kernel_numa_function_impl,
-                                                         kernel_scale_out_cpu_gw_function_impl>;
-
-    using main_class = kernel_class<type, kernel_function_impl>;
-    using numa_class = kernel_class<type, kernel_numa_function_impl>;
-    using scale_out_cpu_gw_class = kernel_class<type, kernel_scale_out_cpu_gw_function_impl>;
-
-    using self_t = virtual_gpu_typed_module;
-
-    using handle = typename real_referenced_module::handle;
-
-    virtual_gpu_typed_module(std::shared_ptr<real_referenced_module> real_module)
-            : gpu_module_base(real_module->get()),
-              real_module_ref(real_module) {
-        LOG_DEBUG("Virtual gpu module created:", ccl_coll_type_to_str(type));
-        // ccl_tuple_for_each(main_class::value,
-        //                    detail::kernel_entry_initializer<type>(
-        //                        [this](const std::string& name) -> gpu_module_base::kernel_handle {
-        //                            return this->import_kernel(name);
-        //                        }));
-        for (auto&& kernel : main_class::value) {
-            detail::kernel_entry_initializer<type>(
-                [this](const std::string& name) -> gpu_module_base::kernel_handle {
-                    return this->import_kernel(name);
-                })(kernel.second);
-        }
-        /*ccl_tuple_for_each(numa_class::value,
-                           detail::kernel_entry_initializer<type>(
-                               [this](const std::string& name) -> gpu_module_base::kernel_handle {
-                                   return this->import_kernel(name);
-                               }));*/
-
-        LOG_DEBUG("Linked functions count: ", functions.size());
-    }
-
-    template <class specific_kernel_class>
-    specific_kernel_class& get_class() {
-        static_assert(
-            std::is_base_of<specific_kernel_class, self_t>::value,
-            "Relationship IS-A `specific_kernel_class` for `virtual_gpu_typed_module` failed");
-        return static_cast<specific_kernel_class&>(*this);
-    }
-
-    std::shared_ptr<real_referenced_module> real_module_ref;
-
-    ~virtual_gpu_typed_module() {
-        LOG_DEBUG("Virtual gpu module destroyed: ",
-                  ccl_coll_type_to_str(type),
-                  ", modules handle: ",
-                  (void*)module);
-        module = nullptr; //real module owner will destroy it
-        release();
-    }
-};
-
-#define DEFINE_SPECIFIC_GPU_MODULE_CLASS(module_type, \
-                                         base_module_type, \
-                                         coll_type, \
-                                         mode, \
-                                         export_function, \
-                                         export_numa_function, \
-                                         export_scale_out_cpu_gw_function) \
-    template <ccl::group_split_type topology> \
-    struct module_type<coll_type, topology, mode> \
-            : public base_module_type<coll_type, \
-                                      export_function, \
-                                      export_numa_function, \
-                                      export_scale_out_cpu_gw_function> { \
-        using base = base_module_type<coll_type, \
-                                      export_function, \
-                                      export_numa_function, \
-                                      export_scale_out_cpu_gw_function>; \
-        using base::handle; \
-\
-        module_type<coll_type, topology, mode>(handle module_handle) : base(module_handle) {} \
-    }
-
-#define DEFINE_VIRTUAL_GPU_MODULE_CLASS( \
-    coll_type, mode, export_function, export_numa_function, export_scale_out_cpu_gw_function) \
-    template <ccl::group_split_type topology> \
-    struct virtual_device_coll_module<coll_type, topology, mode> \
-            : public virtual_gpu_typed_module<coll_type, \
-                                              export_function, \
-                                              export_numa_function, \
-                                              export_scale_out_cpu_gw_function> { \
-        using base = virtual_gpu_typed_module<coll_type, \
-                                              export_function, \
-                                              export_numa_function, \
-                                              export_scale_out_cpu_gw_function>; \
-        using base::handle; \
-        using real_referenced_module = typename base::real_referenced_module; \
-\
-        virtual_device_coll_module<coll_type, topology, mode>( \
-            std::shared_ptr<real_referenced_module> real_module) \
-                : base(real_module) {} \
-    }
-
-} // namespace native
diff --git a/src/common/comm/l0/modules/kernel_argument_policies.hpp b/src/common/comm/l0/modules/kernel_argument_policies.hpp
deleted file mode 100644
index 48b2b0f53..000000000
--- a/src/common/comm/l0/modules/kernel_argument_policies.hpp
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <atomic>
-
-#include "common/utils/tuple.hpp"
-#include "common/log/log.hpp"
-
-namespace native {
-/*
- * Define arguments read/write by host policy
- */
-template <size_t pos, class ArgType, bool must_exist = true>
-struct arg_access_policy_default {
-    using arg_type = ArgType;
-    using return_t = std::pair<bool, arg_type>;
-    void store(const arg_type &value) {
-        arg_value = value;
-        charged = true;
-    }
-
-    inline bool test() const noexcept {
-        return charged;
-    }
-
-    return_t load() const {
-        return_t ret{ false, arg_type{} };
-        if (!test()) {
-            if (must_exist) {
-                abort();
-                CCL_THROW("Cannot get non-existent kernel argument by index: ", pos);
-            }
-            return ret;
-        }
-        std::get<0>(ret) = true;
-        std::get<1>(ret) = arg_value;
-        return ret;
-    }
-
-private:
-    arg_type arg_value{};
-    bool charged = false;
-};
-
-template <size_t pos, class ArgType, bool must_exist = true>
-struct arg_access_policy_atomic {
-    using arg_type = ArgType;
-    using return_t = std::pair<bool, arg_type>;
-    using throwable = std::integral_constant<bool, must_exist>;
-    void store(const arg_type &value) {
-        arg_value.store(value, std::memory_order_relaxed); //relaxes
-        charged.store(true, std::memory_order_release);
-    }
-
-    bool test() const noexcept {
-        return charged.load(std::memory_order_acquire);
-    }
-
-    return_t load() const {
-        return_t ret{ false, arg_type{} };
-        if (!test()) {
-            if (must_exist) {
-                CCL_THROW("Cannot get non-existent kernel atomic argument by index:", pos);
-            }
-            return ret;
-        }
-
-        std::get<0>(ret) = true;
-        std::get<1>(ret) = arg_value.load(std::memory_order_relaxed);
-        return ret;
-    }
-
-protected:
-    std::atomic<arg_type> arg_value{};
-    std::atomic<bool> charged{ false };
-};
-
-// Policy that invalidates the value once it's loaded by a consumer.
-// It remains invalid for read untill a producer writes an new one
-// Note: only one read/invalidate is supported
-template <size_t pos, class ArgType, bool must_exist = true>
-struct arg_access_policy_atomic_reset : public arg_access_policy_atomic<pos, ArgType, must_exist> {
-    using base = arg_access_policy_atomic<pos, ArgType, must_exist>;
-    using return_t = typename base::return_t;
-
-    return_t load() noexcept {
-        auto res = base::load();
-        reset();
-        return res;
-    }
-
-    void reset() noexcept {
-        base::charged.store(false, std::memory_order_release);
-    }
-
-    void dump(std::ostream &out) const {
-        if (base::test()) {
-            auto ret = base::load();
-            out << "{ " << ret.second << " }";
-        }
-        else {
-            out << "{ RESET (" << base::arg_value << ")}";
-        }
-    }
-};
-
-template <size_t pos, class ArgType, bool must_exist = true>
-struct arg_access_policy_atomic_move {
-    using arg_type = ArgType;
-    using return_t = std::pair<bool, arg_type>;
-    using throwable = std::integral_constant<bool, must_exist>;
-    void store(const arg_type &value) {
-        //#ifdef DEBUG
-        charged_counter.fetch_add(std::memory_order_relaxed);
-        //#endif
-        arg_value.store(value, std::memory_order_relaxed); //relaxes
-        charged.store(true, std::memory_order_release);
-    }
-
-    inline bool test() const noexcept {
-        return charged.load(std::memory_order_acquire);
-    }
-
-    return_t load() {
-        return_t ret{ false, arg_type{} };
-        if (charged.exchange(false)) //destructive load should be done for `charge` only
-        {
-            std::get<0>(ret) = true;
-            std::get<1>(ret) = arg_value.load(std::memory_order_relaxed);
-            //#ifdef DEBUG
-            consumed_counter.fetch_add(std::memory_order_relaxed);
-            //#endif
-        }
-        return ret;
-    }
-
-private:
-    void dump(std::ostream &out) const {
-        out << "{ arg_value.load(std::memory_order_relaxed) , set: " << charged_counter.load()
-            << ", get: " << consumed_counter.load() << "}";
-    }
-
-    std::atomic<arg_type> arg_value{};
-    std::atomic<bool> charged{ false };
-
-    std::atomic<size_t> charged_counter{};
-    std::atomic<size_t> consumed_counter{};
-};
-
-template <size_t pos>
-struct arg_no_access_policy {
-    using arg_type = void;
-    using return_t = bool;
-
-    void store(...);
-    bool test() const noexcept;
-    return_t load() const;
-};
-} // namespace native
diff --git a/src/common/comm/l0/modules/kernel_argument_types.hpp b/src/common/comm/l0/modules/kernel_argument_types.hpp
deleted file mode 100644
index b852811bc..000000000
--- a/src/common/comm/l0/modules/kernel_argument_types.hpp
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/modules/kernel_argument_policies.hpp"
-
-namespace native {
-
-namespace options {
-
-// Options for kernel arguments, here we should defile all the aspects that don't change parameter's
-// behaviour, i.e. don't affect load/store functions such. Things that does that(i.e. thread-safety)
-// should be defined as separate policy.
-template <bool uncached = false>
-struct generic {
-    static constexpr bool is_uncached() {
-        return uncached;
-    }
-};
-
-using empty = generic<>;
-using uncached = generic<true>;
-
-} // namespace options
-
-// base class for kernel argument
-template <size_t pos, class policy_impl, class options = options::empty>
-struct kernel_arg : public policy_impl, options {
-    enum { index = pos };
-    using policy = policy_impl;
-    using arg_type = typename policy::arg_type;
-    using return_t = typename policy::return_t;
-    using options_t = options;
-};
-
-// thread-safe argument: used for concurrent read/write applications
-template <size_t pos, class type, class options = options::empty>
-using thread_safe_arg = kernel_arg<pos, arg_access_policy_atomic<pos, type, false>, options>;
-
-// thread-safe destructive-copying argument (rechargeable): used for concurrent
-// read/write applications, where reader take-away existing value
-template <size_t pos, class type, class options = options::empty>
-using thread_exchangable_arg =
-    kernel_arg<pos, arg_access_policy_atomic_reset<pos, type, false>, options>;
-
-// external argument using uncached flag and reset policy
-template <size_t pos, class type, class options = options::uncached>
-using external_arg = kernel_arg<pos, arg_access_policy_atomic_reset<pos, type, false>, options>;
-
-// uncached permanent argument using uncached flag and no reset policy
-template <size_t pos, class type, class options = options::uncached>
-using permanent_arg = kernel_arg<pos, arg_access_policy_atomic<pos, type, false>, options>;
-
-// default, single threaded access argument
-template <size_t pos, class type, class options = options::empty>
-using arg = kernel_arg<pos, arg_access_policy_default<pos, type>, options>;
-
-// empty argument
-template <size_t pos, class options = options::empty>
-using stub_arg = kernel_arg<pos, arg_no_access_policy<pos>, options>;
-
-// utilities
-namespace detail {
-struct args_printer {
-    args_printer(std::stringstream& ss) : out(ss) {}
-
-    template <typename Arg>
-    void operator()(const Arg& arg) {
-        out << "idx: " << Arg::index << "\t";
-        dump_arg_value(arg, out);
-        using opt = typename Arg::options_t;
-        print_options(opt{}, out);
-        out << std::endl;
-    }
-
-    // atomic argument pretty printing
-    template <size_t pos, class type, class options>
-    void operator()(const thread_safe_arg<pos, type, options>& arg) {
-        out << "idx: " << pos << "\t";
-        dump_arg_value(arg, out);
-        print_options(options{}, out);
-        out << "\tATOMIC" << std::endl;
-    }
-
-    template <size_t pos, class type, class options>
-    void operator()(const thread_exchangable_arg<pos, type, options>& arg) {
-        out << "idx: " << pos << "\t";
-        arg.dump(out);
-        print_options(options{}, out);
-        out << "\tATOMIC_EXG" << std::endl;
-    }
-
-    // stub argument pretty printing
-    template <size_t pos, class options>
-    void operator()(const stub_arg<pos, options>& arg) {
-        out << "idx: " << pos;
-        print_options(options{}, out);
-        out << "\tSTUB" << std::endl;
-    }
-    std::stringstream& out;
-
-private:
-    template <typename Arg>
-    void dump_arg_value(const Arg& arg, std::stringstream& ss) {
-        if (arg.test()) {
-            auto ret = arg.load();
-            ss << "{ " << ret.second << " }";
-        }
-        else {
-            ss << "{ EMPTY }";
-        }
-    }
-
-    template <class Options>
-    void print_options(Options opt, std::stringstream& ss) {
-        if (opt.is_uncached())
-            ss << "\tUNCACHED ";
-    }
-};
-} // namespace detail
-} // namespace native
diff --git a/src/common/comm/l0/modules/kernel_class.hpp b/src/common/comm/l0/modules/kernel_class.hpp
deleted file mode 100644
index 669e8ade5..000000000
--- a/src/common/comm/l0/modules/kernel_class.hpp
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <tuple>
-#include "common/utils/tuple.hpp"
-#include <unordered_map>
-
-namespace native {
-
-template <ccl_coll_type type, class kernel_function_impl, class Enable = void>
-struct kernel_class {
-    using kernel_t = kernel_function_impl;
-
-    using key_type = ccl::datatype;
-
-    struct hasher {
-        size_t operator()(const ccl::datatype& dtype) const {
-            return std::hash<size_t>{}((size_t)dtype);
-        }
-    };
-
-    using kernel_class_container_t = std::unordered_map<key_type, kernel_t, hasher>;
-
-    kernel_class() {
-        for (ccl::datatype idx = ccl::datatype::int8; idx <= ccl::datatype::bfloat16; idx++) {
-            key_type key{ idx };
-            // Have to use this ugly inplace construction because kernel_t have deleted copy and move
-            // constructor and there is no other way to do that.
-            value.emplace(std::piecewise_construct,
-                          std::make_tuple(key),
-                          std::make_tuple(coll_param_gpu(type, idx)));
-        }
-    }
-    // getter
-    kernel_t& get(const coll_param_gpu& params) {
-        assert(!params.is_reduction());
-        key_type key{ params.get_datatype() };
-
-        auto it = value.find(key);
-        if (it == value.end()) {
-            // TODO: sycl error
-            throw std::runtime_error("Kernel not found");
-        }
-
-        return it->second;
-    }
-
-protected:
-    kernel_class_container_t value;
-};
-
-template <ccl_coll_type type, class kernel_function_impl>
-struct kernel_class<type,
-                    kernel_function_impl,
-                    typename std::enable_if<is_reduction_coll_type<type>::value>::type> {
-    using kernel_t = kernel_function_impl;
-
-    using key_type = std::pair<ccl::datatype, ccl::reduction>;
-
-    struct hasher {
-        size_t operator()(const std::pair<ccl::datatype, ccl::reduction>& key) const {
-            return std::hash<size_t>{}((size_t)key.first) ^ std::hash<size_t>{}((size_t)key.second);
-        }
-    };
-
-    using kernel_class_container_t = std::unordered_map<key_type, kernel_t, hasher>;
-
-    kernel_class() {
-        for (ccl::datatype idx = ccl::datatype::int8; idx <= ccl::datatype::bfloat16; idx++) {
-            // TODO: allow to iterate over reduction values(need to implement operator++)
-            auto insert_kernel = [this, idx](ccl::reduction red) {
-                key_type key{ idx, red };
-                value.emplace(std::piecewise_construct,
-                              std::make_tuple(key),
-                              std::make_tuple(coll_param_gpu(type, idx, red)));
-            };
-
-            insert_kernel(ccl::reduction::sum);
-            insert_kernel(ccl::reduction::prod);
-            insert_kernel(ccl::reduction::min);
-            insert_kernel(ccl::reduction::max);
-        }
-    }
-
-    // getter
-    kernel_t& get(const coll_param_gpu& params) {
-        assert(params.is_reduction());
-
-        key_type key{ params.get_datatype(), params.get_reduction() };
-
-        auto it = value.find(key);
-        if (it == value.end()) {
-            // TODO: sycl error
-            throw std::runtime_error("Kernel not found");
-        }
-
-        return it->second;
-    }
-
-protected:
-    // TODO: threadsafety? Looks like this should be fine as different threads access different devices.
-    // Need to double check IPC/NUMA case.
-    kernel_class_container_t value;
-};
-
-} //namespace native
diff --git a/src/common/comm/l0/modules/kernel_functions.hpp b/src/common/comm/l0/modules/kernel_functions.hpp
deleted file mode 100644
index 8e9c8d446..000000000
--- a/src/common/comm/l0/modules/kernel_functions.hpp
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/modules/kernel_argument_types.hpp"
-#include "coll/coll_param.hpp"
-
-namespace native {
-// kernel with its argument collection
-template <class... arguments>
-struct kernel_data_storage {
-    using func_args_t = std::tuple<arguments...>;
-    func_args_t args;
-    ze_kernel_handle_t handle;
-
-    // generic getter
-    template <class kernel_argument>
-    typename kernel_argument::return_t get_arg() {
-        return std::get<kernel_argument::index>(args).load();
-    }
-
-    // generic test value
-    template <class... kernel_argument>
-    bool test_args() const {
-        std::array<bool, sizeof...(kernel_argument)> ready{
-            std::get<kernel_argument::index>(args).test()...
-        };
-
-        return std::all_of(ready.begin(), ready.end(), [](bool v) {
-            return v;
-        });
-    }
-
-    // generic setter
-    template <class kernel_argument,
-              class = typename std::enable_if<
-                  not std::is_pointer<typename kernel_argument::arg_type>::value>::type>
-    void set_arg(typename kernel_argument::arg_type& new_val) {
-        std::get<kernel_argument::index>(args).store(new_val);
-    }
-
-    template <class kernel_argument,
-              class = typename std::enable_if<
-                  std::is_pointer<typename kernel_argument::arg_type>::value>::type>
-    void set_arg(typename kernel_argument::arg_type new_val) {
-        std::get<kernel_argument::index>(args).store(new_val);
-    }
-
-    std::string to_string() const {
-        std::stringstream ss;
-        ss << "handle: " << handle << "\n{\n";
-        detail::args_printer func(ss);
-        ccl_tuple_for_each(args, func);
-        ss << "}" << std::endl;
-        return ss.str();
-    }
-};
-
-// major kernel args
-enum main_kernel_args { rank_index = 0, size_index = 1, args_start_index };
-
-class kernel_parameters_holder {
-    coll_param_gpu params;
-
-public:
-    kernel_parameters_holder(const coll_param_gpu& params) : params{ params } {}
-
-    const coll_param_gpu& get_kernel_params() const {
-        return params;
-    }
-};
-
-//main kernel - used for GPU program execution
-template <class Impl, class... arguments>
-struct execution_kernel : public kernel_data_storage<arg<main_kernel_args::rank_index, int>,
-                                                     arg<main_kernel_args::size_index, int>,
-                                                     arguments...>,
-                          public kernel_parameters_holder {
-    using base = kernel_data_storage<arg<main_kernel_args::rank_index, int>,
-                                     arg<main_kernel_args::size_index, int>,
-                                     arguments...>;
-    using base::args;
-    using base::handle;
-
-    using params_base = kernel_parameters_holder;
-
-    execution_kernel(const coll_param_gpu& params) : base{}, params_base{ params } {}
-
-    using rank_type = int;
-    using size_type = int;
-
-    const char* name() {
-        return Impl::specific_name();
-    }
-
-    void set_rank(rank_type rank) {
-        ze_result_t result = zeKernelSetArgumentValue(
-            handle, main_kernel_args::rank_index, sizeof(rank_type), &rank);
-        if (result != ZE_RESULT_SUCCESS) {
-            CCL_THROW("Cannot set rank_argument argument in kernel: ", name());
-        }
-
-        std::get<main_kernel_args::rank_index>(args).store(rank);
-    }
-
-    rank_type get_rank() const {
-        return std::get<main_kernel_args::rank_index>(args).load().second;
-    }
-
-    void set_size(size_type size) {
-        ze_result_t result = zeKernelSetArgumentValue(
-            handle, main_kernel_args::size_index, sizeof(size_type), &size);
-        if (result != ZE_RESULT_SUCCESS) {
-            CCL_THROW("Cannot set size_argument argument in kernel: ", name());
-        }
-
-        std::get<main_kernel_args::size_index>(args).store(size);
-    }
-
-    size_type get_size() const {
-        return std::get<main_kernel_args::size_index>(args).load().second;
-    }
-
-    // modified setter
-    template <class kernel_argument,
-              class = typename std::enable_if<
-                  not std::is_pointer<typename kernel_argument::arg_type>::value>::type>
-    void set_arg(typename kernel_argument::arg_type& new_val) {
-        LOG_TRACE("Function: ",
-                  name(),
-                  ", handle: ",
-                  handle,
-                  " - set_arg() index: ",
-                  kernel_argument::index,
-                  ", value: ",
-                  new_val);
-        ze_result_t result = zeKernelSetArgumentValue(
-            handle, kernel_argument::index, sizeof(typename kernel_argument::arg_type), &new_val);
-        if (result != ZE_RESULT_SUCCESS) {
-            CCL_THROW("Cannot set kernel argument by index: ",
-                      kernel_argument::index,
-                      ", kernel: ",
-                      name(),
-                      ", handle: ",
-                      handle,
-                      ", result: ",
-                      result);
-        }
-
-        base::template set_arg<kernel_argument>(new_val);
-    }
-
-    template <class kernel_argument,
-              class = typename std::enable_if<
-                  std::is_pointer<typename kernel_argument::arg_type>::value>::type>
-    void set_arg(typename kernel_argument::arg_type new_val) {
-        LOG_TRACE("Function: ",
-                  name(),
-                  ", handle: ",
-                  handle,
-                  " - set_arg(pointer) index: ",
-                  kernel_argument::index,
-                  ", value: ",
-                  new_val);
-        ze_result_t result = zeKernelSetArgumentValue(handle,
-                                                      kernel_argument::index,
-                                                      sizeof(typename kernel_argument::arg_type),
-                                                      &new_val); //& from pointer
-        if (result != ZE_RESULT_SUCCESS) {
-            CCL_THROW("Cannot set kernel argument by pointer index: ",
-                      kernel_argument::index,
-                      ", kernel: ",
-                      name(),
-                      ", handle: ",
-                      handle,
-                      ", result: ",
-                      result,
-                      "value: ",
-                      new_val);
-        }
-
-        base::template set_arg<kernel_argument>(new_val);
-    }
-
-    template <class... kernel_argument>
-    void set_args(typename kernel_argument::arg_type... new_val) {
-        std::array<bool, sizeof...(kernel_argument)> expander{ (
-            this->template set_arg<kernel_argument>(new_val), true)... };
-        (void)expander;
-    }
-};
-
-// base_ipc_kernel - used for GPU data synchronization only
-template <class Impl, class... arguments>
-struct base_ipc_kernel : public kernel_data_storage<arg<main_kernel_args::rank_index, int>,
-                                                    arg<main_kernel_args::size_index, int>,
-                                                    arguments...>,
-                         public kernel_parameters_holder {
-    using base = kernel_data_storage<arg<main_kernel_args::rank_index, int>,
-                                     arg<main_kernel_args::size_index, int>,
-                                     arguments...>;
-    using base::args;
-    using base::handle;
-
-    using params_base = kernel_parameters_holder;
-
-    base_ipc_kernel(const coll_param_gpu& params) : base{}, params_base{ params } {}
-
-    static constexpr const char* name() {
-        return Impl::specific_name();
-    }
-
-    template <class kernel_argument,
-              class = typename std::enable_if<not std::is_pointer<kernel_argument>::value>::type>
-    void set_arg(typename kernel_argument::arg_type& new_val) {
-        LOG_TRACE("Function: ",
-                  name(),
-                  ", handle: ",
-                  handle,
-                  " - set_arg() index: ",
-                  kernel_argument::index,
-                  ", value: ",
-                  new_val);
-        base::template set_arg<kernel_argument>(new_val);
-    }
-
-    template <class kernel_argument,
-              class = typename std::enable_if<std::is_pointer<kernel_argument>::value>::type>
-    void set_arg(typename kernel_argument::arg_type new_val) {
-        LOG_TRACE("Function: ",
-                  name(),
-                  ", handle: ",
-                  handle,
-                  " - set_arg(pointer) index: ",
-                  kernel_argument::index,
-                  ", value: ",
-                  new_val);
-        base::template set_arg<kernel_argument>(new_val);
-    }
-};
-
-} // namespace native
diff --git a/src/common/comm/l0/modules/kernel_utils.cpp b/src/common/comm/l0/modules/kernel_utils.cpp
deleted file mode 100644
index ce1f4c5ae..000000000
--- a/src/common/comm/l0/modules/kernel_utils.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "common/comm/l0/modules/kernel_utils.hpp"
-#include "common/global/global.hpp"
-
-namespace native {
-namespace detail {
-
-std::string to_string(ccl::reduction red) {
-#define P(val) \
-    case ccl::reduction::val: return #val;
-
-    switch (red) {
-        P(sum);
-        P(prod);
-        P(min);
-        P(max);
-        default:
-            throw std::runtime_error("Unexpected value of reduction: " +
-                                     std::to_string(static_cast<int>(red)));
-    }
-
-#undef P
-}
-
-// TODO: ideally we should take a set of all parameters and generate a kernel name
-// to execute
-std::string get_kernel_name(const std::string& kernel_name, const coll_param_gpu& params) {
-    // TODO: introduce a simple function to map names?
-    // Can we remove dtypes from global_data then? Do we need custom datatypes?
-    auto name = kernel_name + "_" + ccl::global_data::get().dtypes->name(params.get_datatype());
-    if (params.is_reduction()) {
-        name += "_" + to_string(params.get_reduction());
-    }
-
-    return name;
-}
-
-} // namespace detail
-} // namespace native
diff --git a/src/common/comm/l0/modules/kernel_utils.hpp b/src/common/comm/l0/modules/kernel_utils.hpp
deleted file mode 100644
index 6acef3623..000000000
--- a/src/common/comm/l0/modules/kernel_utils.hpp
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include <string>
-
-#include "coll/coll_param.hpp"
-
-namespace native {
-namespace detail {
-
-std::string to_string(ccl::reduction red);
-std::string get_kernel_name(const std::string& kernel_name, const coll_param_gpu& params);
-
-} // namespace detail
-} // namespace native
diff --git a/src/common/comm/l0/modules/modules_source_data.cpp b/src/common/comm/l0/modules/modules_source_data.cpp
deleted file mode 100644
index 8116455c4..000000000
--- a/src/common/comm/l0/modules/modules_source_data.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include <stdexcept>
-#include <fstream>
-#include "common/comm/l0/modules/modules_source_data.hpp"
-
-namespace native {
-source_data_t load_binary_file(const std::string& source_path) {
-    std::ifstream stream(source_path, std::ios::in | std::ios::binary);
-
-    source_data_t binary_file;
-    if (!stream.good()) {
-        std::string error("Failed to load binary file: ");
-        error += source_path;
-
-        throw std::runtime_error(error);
-    }
-
-    size_t length = 0;
-    stream.seekg(0, stream.end);
-    length = static_cast<size_t>(stream.tellg());
-    stream.seekg(0, stream.beg);
-
-    binary_file.resize(length);
-    stream.read(reinterpret_cast<char*>(binary_file.data()), length);
-    return binary_file;
-}
-} // namespace native
diff --git a/src/common/comm/l0/modules/modules_source_data.hpp b/src/common/comm/l0/modules/modules_source_data.hpp
deleted file mode 100644
index 4aa36c626..000000000
--- a/src/common/comm/l0/modules/modules_source_data.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <map>
-#include <string>
-#include <tuple>
-#include <vector>
-
-#include "coll/algorithms/algorithms_enum.hpp"
-#include "common/comm/l0/device_group_routing_schema.hpp"
-
-namespace native {
-using source_data_t = std::vector<uint8_t>;
-using source_data_storage_t = std::map<ccl::device_topology_type, source_data_t>;
-
-template <ccl_coll_type module_type>
-struct typed_source_data_storage_t : source_data_storage_t {
-    static constexpr ccl_coll_type get_type() {
-        return module_type;
-    }
-};
-
-source_data_t load_binary_file(const std::string& source_path);
-
-template <ccl_coll_type... modules_types>
-class modules_src_container {
-public:
-    using modules_src_tuple = std::tuple<typed_source_data_storage_t<modules_types>...>;
-
-    static modules_src_container& instance() {
-        static modules_src_container inst;
-        return inst;
-    }
-
-    template <ccl_coll_type type>
-    void load_kernel_source(const std::string& source_path, ccl::device_topology_type method) {
-        static_assert(type < std::tuple_size<modules_src_tuple>::value,
-                      " Module of 'type' is not supported");
-        auto binary_file = load_binary_file(source_path);
-        typed_source_data_storage_t<type>& typed_storage = std::get<type>(storage);
-
-        auto ret = typed_storage.insert({ method, std::move(binary_file) });
-        if (!ret.second) {
-            throw std::runtime_error(std::string("Kernel type \"") + ccl_coll_type_to_str(type) +
-                                     "\", with \"" + ::to_string(method) +
-                                     "\" algo exist already. File: " + source_path +
-                                     " is not loaded!");
-        }
-    };
-
-    const modules_src_tuple& get_modules() const {
-        return storage;
-    }
-
-    template <ccl_coll_type type>
-    const source_data_storage_t& get_modules_collection() const noexcept {
-        static_assert(type < std::tuple_size<modules_src_tuple>::value,
-                      " Module of 'type' is not supported");
-        return std::get<type>(storage);
-    }
-
-private:
-    modules_src_container() = default;
-    modules_src_tuple storage;
-};
-} // namespace native
diff --git a/src/common/comm/l0/modules/modules_utils.hpp b/src/common/comm/l0/modules/modules_utils.hpp
deleted file mode 100644
index 29d507c2c..000000000
--- a/src/common/comm/l0/modules/modules_utils.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include "common/comm/l0/modules/base_entry_module.hpp"
-#include "common/utils/tuple.hpp"
-#include "common/comm/l0/modules/kernel_utils.hpp"
-
-namespace native {
-namespace detail {
-
-template <ccl_coll_type type>
-struct kernel_entry_initializer {
-    using loader_t =
-        std::function<gpu_module_base::kernel_handle(const std::string& function_name)>;
-
-    kernel_entry_initializer(loader_t&& f) : functor(std::move(f)) {}
-
-    template <class typed_kernel>
-    void operator()(typed_kernel& kernel) {
-        kernel.handle = functor(get_kernel_name(kernel.name(), kernel.get_kernel_params()));
-    }
-
-private:
-    loader_t functor;
-};
-
-} // namespace detail
-} // namespace native
diff --git a/src/common/comm/l0/modules/ring/allgatherv_entry_module.hpp b/src/common/comm/l0/modules/ring/allgatherv_entry_module.hpp
deleted file mode 100644
index 24b00a8c5..000000000
--- a/src/common/comm/l0/modules/ring/allgatherv_entry_module.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/modules/ring/allgatherv_export_functions.hpp"
-#include "common/comm/l0/modules/gpu_typed_module.hpp"
-
-namespace native {
-
-DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module,
-                                 real_gpu_typed_module,
-                                 ccl_coll_allgatherv,
-                                 ccl::device_topology_type::ring,
-                                 ring::allgatherv::main_kernel,
-                                 ring::allgatherv::numa_kernel,
-                                 ring::allgatherv::scale_out_cpu_gw_kernel);
-
-DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module,
-                                 ipc_gpu_typed_module,
-                                 ccl_coll_allgatherv,
-                                 ccl::device_topology_type::ring,
-                                 ring::allgatherv::ipc_kernel,
-                                 ring::allgatherv::ipc_kernel,
-                                 ring::allgatherv::ipc_kernel);
-
-DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_allgatherv,
-                                ccl::device_topology_type::ring,
-                                ring::allgatherv::main_kernel,
-                                ring::allgatherv::numa_kernel,
-                                ring::allgatherv::scale_out_cpu_gw_kernel);
-} // namespace native
diff --git a/src/common/comm/l0/modules/ring/allgatherv_export_functions.hpp b/src/common/comm/l0/modules/ring/allgatherv_export_functions.hpp
deleted file mode 100644
index 23115a5f9..000000000
--- a/src/common/comm/l0/modules/ring/allgatherv_export_functions.hpp
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/modules/kernel_functions.hpp"
-
-namespace native {
-
-namespace ring {
-
-namespace allgatherv {
-
-/**
- * Common args for all kernel types
- */
-
-using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>;
-using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type;
-
-using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>;
-using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type;
-
-template <class native_t>
-using send_buf_arg = arg<main_kernel_args::args_start_index + 3, native_t*>;
-
-template <class native_t>
-using recv_buf_arg = external_arg<main_kernel_args::args_start_index + 4, native_t*>;
-
-template <class native_t>
-using right_output_buf_arg =
-    thread_exchangable_arg<main_kernel_args::args_start_index + 5, native_t*>;
-
-using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 6, int*>;
-using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 7, int*>;
-using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-using right_income_data_flag_arg =
-    thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>;
-
-using right_ready_to_recv_flag_arg =
-    thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>;
-
-// IMPORTANT: the number and types of arguments must be the same in all classes,
-// excluding arguments specific for numa/scaleout etc.
-struct main_kernel
-        : public execution_kernel<main_kernel,
-                                  send_buf_size_arg, // elems_count
-                                  recv_elem_counts_buf_arg, // recv_elem_counts_buf
-                                  recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
-                                  send_buf_arg<void>, // send_buf
-                                  recv_buf_arg<void>, // recv_buf (output_buffer)
-                                  right_output_buf_arg<void>, // right_output_buffer
-                                  income_data_flag_arg, // left_wrote_to_me_flag
-                                  ready_to_recv_flag_arg, // i_ready_to_receive_flag
-                                  right_income_data_flag_arg, // i_send_to_right_flag
-                                  right_ready_to_recv_flag_arg> // right_ready_to_recv_flag
-{
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "allgatherv_execution";
-    }
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    using base = execution_kernel<main_kernel,
-                                  send_buf_size_arg,
-                                  recv_elem_counts_buf_arg,
-                                  recv_elem_offsets_buf_arg,
-                                  send_buf_arg<processing_type>,
-                                  recv_buf_arg<processing_type>,
-                                  right_output_buf_arg<processing_type>,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  right_income_data_flag_arg,
-                                  right_ready_to_recv_flag_arg>;
-
-    using base::base;
-};
-
-// IMPORTANT: the params order is default, see *algatherv*.cl for that
-struct numa_kernel
-        : public execution_kernel<numa_kernel,
-                                  send_buf_size_arg, // elems_count
-                                  recv_elem_counts_buf_arg, // recv_elem_counts_buf
-                                  recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
-                                  send_buf_arg<void>, // send_buf
-                                  recv_buf_arg<void>, // recv_buf (output_buffer)
-                                  right_output_buf_arg<void>, // right_output_buffer
-                                  income_data_flag_arg, // left_wrote_to_me_flag
-                                  ready_to_recv_flag_arg, // i_ready_to_receive_flag
-                                  right_income_data_flag_arg, // i_send_to_right_flag
-                                  right_ready_to_recv_flag_arg> // right_ready_to_recv_flag
-{
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "allgatherv_execution_numa";
-    }
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    using base = execution_kernel<numa_kernel,
-                                  send_buf_size_arg,
-                                  recv_elem_counts_buf_arg,
-                                  recv_elem_offsets_buf_arg,
-                                  send_buf_arg<processing_type>,
-                                  recv_buf_arg<processing_type>,
-                                  right_output_buf_arg<processing_type>,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  right_income_data_flag_arg,
-                                  right_ready_to_recv_flag_arg>;
-
-    template <class ctx_params_t>
-    void bind_data(const ctx_params_t& out_ctx_params) {
-        // TODO not implemented
-        (void)out_ctx_params;
-        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
-    }
-
-    using base::base;
-};
-
-struct ipc_kernel : public base_ipc_kernel<ipc_kernel,
-                                           send_buf_size_arg, // elems_count
-                                           recv_elem_counts_buf_arg, // recv_elem_counts_buf
-                                           recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
-                                           send_buf_arg<void>, // send_buf
-                                           recv_buf_arg<void>, // recv_buf (output_buffer)
-                                           right_output_buf_arg<void>, // right_output_buffer
-                                           income_data_flag_arg, // left_wrote_to_me_flag
-                                           ready_to_recv_flag_arg, // i_ready_to_receive_flag
-                                           right_income_data_flag_arg, // i_send_to_right_flag
-                                           right_ready_to_recv_flag_arg> // right_ready_to_recv_flag
-{
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "ring_allgatherv_ipc";
-    }
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    using base = base_ipc_kernel<ipc_kernel,
-                                 send_buf_size_arg,
-                                 recv_elem_counts_buf_arg,
-                                 recv_elem_offsets_buf_arg,
-                                 send_buf_arg<processing_type>,
-                                 recv_buf_arg<processing_type>,
-                                 right_output_buf_arg<processing_type>,
-                                 income_data_flag_arg,
-                                 ready_to_recv_flag_arg,
-                                 right_income_data_flag_arg,
-                                 right_ready_to_recv_flag_arg>;
-
-    template <class ipc_handles_t>
-    void bind_data(const ipc_handles_t& ipc_handles) {
-        auto recv_buf = reinterpret_cast<typename recv_buf_arg<processing_type>::arg_type>(
-            ipc_handles.at(0).get().pointer);
-        this->template set_arg<recv_buf_arg<processing_type>>(recv_buf);
-
-        auto income_data_flag =
-            reinterpret_cast<income_data_flag_arg_type>(ipc_handles.at(1).get().pointer);
-        this->template set_arg<income_data_flag_arg>(income_data_flag);
-
-        auto ready_to_recv_flag =
-            reinterpret_cast<ready_to_recv_flag_arg_type>(ipc_handles.at(2).get().pointer);
-        this->template set_arg<ready_to_recv_flag_arg>(ready_to_recv_flag);
-    }
-
-    using base::base;
-};
-
-struct scale_out_cpu_gw_kernel
-        : public execution_kernel<scale_out_cpu_gw_kernel,
-                                  send_buf_size_arg, // elems_count
-                                  recv_elem_counts_buf_arg, // recv_elem_counts_buf
-                                  recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
-                                  send_buf_arg<void>, // send_buf
-                                  recv_buf_arg<void>, // recv_buf (output_buffer)
-                                  right_output_buf_arg<void>, // right_output_buffer
-                                  income_data_flag_arg, // left_wrote_to_me_flag
-                                  ready_to_recv_flag_arg, // i_ready_to_receive_flag
-                                  right_income_data_flag_arg, // i_send_to_right_flag
-                                  right_ready_to_recv_flag_arg> // right_ready_to_recv_flag
-{
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "allgatherv_execution_scale_out_cpu_gw";
-    }
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    using base = execution_kernel<scale_out_cpu_gw_kernel,
-                                  send_buf_size_arg,
-                                  recv_elem_counts_buf_arg,
-                                  recv_elem_offsets_buf_arg,
-                                  send_buf_arg<processing_type>,
-                                  recv_buf_arg<processing_type>,
-                                  right_output_buf_arg<processing_type>,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  right_income_data_flag_arg,
-                                  right_ready_to_recv_flag_arg>;
-
-    template <class ctx_params_t>
-    void bind_data(const ctx_params_t& out_ctx_params) {
-        // TODO not implemented
-        (void)out_ctx_params;
-        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
-    }
-
-    using base::base;
-};
-
-} // namespace allgatherv
-} // namespace ring
-} // namespace native
diff --git a/src/common/comm/l0/modules/ring/allreduce_entry_module.hpp b/src/common/comm/l0/modules/ring/allreduce_entry_module.hpp
deleted file mode 100644
index 9eff3d8e5..000000000
--- a/src/common/comm/l0/modules/ring/allreduce_entry_module.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/modules/ring/allreduce_export_functions.hpp"
-#include "common/comm/l0/modules/gpu_typed_module.hpp"
-
-namespace native {
-
-DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module,
-                                 real_gpu_typed_module,
-                                 ccl_coll_allreduce,
-                                 ccl::device_topology_type::ring,
-                                 ring::allreduce::main_kernel,
-                                 ring::allreduce::numa_kernel,
-                                 ring::allreduce::scale_out_cpu_gw_kernel);
-
-DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module,
-                                 ipc_gpu_typed_module,
-                                 ccl_coll_allreduce,
-                                 ccl::device_topology_type::ring,
-                                 ring::allreduce::ipc_kernel,
-                                 ring::allreduce::ipc_kernel,
-                                 ring::allreduce::ipc_kernel);
-
-DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_allreduce,
-                                ccl::device_topology_type::ring,
-                                ring::allreduce::main_kernel,
-                                ring::allreduce::numa_kernel,
-                                ring::allreduce::scale_out_cpu_gw_kernel);
-} // namespace native
diff --git a/src/common/comm/l0/modules/ring/allreduce_export_functions.hpp b/src/common/comm/l0/modules/ring/allreduce_export_functions.hpp
deleted file mode 100644
index 5bfcf24b9..000000000
--- a/src/common/comm/l0/modules/ring/allreduce_export_functions.hpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/modules/kernel_functions.hpp"
-
-namespace native {
-
-namespace ring {
-
-namespace allreduce {
-
-/**
- * Common args for all kernel types
- */
-
-// own
-using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-template <class native_t>
-using send_buf_arg = thread_exchangable_arg<main_kernel_args::args_start_index + 1, native_t*>;
-
-template <class native_t>
-using recv_buf_arg = thread_exchangable_arg<main_kernel_args::args_start_index + 2, native_t*>;
-
-template <class native_t>
-using right_send_buf_arg =
-    thread_exchangable_arg<main_kernel_args::args_start_index + 3, native_t*>;
-
-template <class native_t>
-using right_recv_buf_arg =
-    thread_exchangable_arg<main_kernel_args::args_start_index + 4, native_t*>;
-
-// IMPORTANT: the number and types of arguments must be the same in all classes,
-// excluding arguments specific for numa/scaleout etc.
-struct main_kernel : public execution_kernel<main_kernel,
-                                             send_buf_size_arg,
-                                             send_buf_arg<void>,
-                                             recv_buf_arg<void>,
-                                             right_send_buf_arg<void>,
-                                             right_recv_buf_arg<void>> {
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "allreduce_execution";
-    }
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    using base = execution_kernel<main_kernel,
-                                  send_buf_size_arg,
-                                  send_buf_arg<void>,
-                                  recv_buf_arg<void>,
-                                  right_send_buf_arg<void>,
-                                  right_recv_buf_arg<void>>;
-
-    using base::base;
-};
-
-struct numa_kernel : public execution_kernel<numa_kernel,
-                                             send_buf_size_arg,
-                                             send_buf_arg<void>,
-                                             recv_buf_arg<void>,
-                                             right_send_buf_arg<void>,
-                                             right_recv_buf_arg<void>> {
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "allreduce_execution_numa";
-    }
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    using base = execution_kernel<numa_kernel,
-                                  send_buf_size_arg,
-                                  send_buf_arg<void>,
-                                  recv_buf_arg<void>,
-                                  right_send_buf_arg<void>,
-                                  right_recv_buf_arg<void>>;
-
-    template <class ctx_params_t>
-    void bind_data(const ctx_params_t& out_ctx_params) {
-        // TODO not implemented
-        (void)out_ctx_params;
-        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
-    }
-
-    using base::base;
-};
-
-struct ipc_kernel : public base_ipc_kernel<ipc_kernel,
-                                           stub_arg<main_kernel_args::args_start_index>,
-                                           send_buf_arg<void>,
-                                           recv_buf_arg<void>,
-                                           stub_arg<main_kernel_args::args_start_index + 2>,
-                                           stub_arg<main_kernel_args::args_start_index + 3>> {
-    using processing_type = void;
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    static constexpr const char* specific_name() {
-        return "ring_allreduce_ipc";
-    }
-
-    using base = base_ipc_kernel<ipc_kernel,
-                                 stub_arg<main_kernel_args::args_start_index>,
-                                 send_buf_arg<processing_type>,
-                                 recv_buf_arg<processing_type>,
-                                 stub_arg<main_kernel_args::args_start_index + 2>,
-                                 stub_arg<main_kernel_args::args_start_index + 3>>;
-
-    template <class ipc_handles_t>
-    void bind_data(const ipc_handles_t& ipc_handles) {
-        auto send_buf = reinterpret_cast<typename send_buf_arg<processing_type>::arg_type>(
-            ipc_handles.at(0).get().pointer);
-        this->template set_arg<send_buf_arg<processing_type>>(send_buf);
-
-        auto recv_buf = reinterpret_cast<typename recv_buf_arg<processing_type>::arg_type>(
-            ipc_handles.at(1).get().pointer);
-        this->template set_arg<recv_buf_arg<processing_type>>(recv_buf);
-    }
-
-    using base::base;
-};
-
-struct scale_out_cpu_gw_kernel : public execution_kernel<scale_out_cpu_gw_kernel,
-                                                         send_buf_size_arg,
-                                                         send_buf_arg<void>,
-                                                         recv_buf_arg<void>,
-                                                         right_send_buf_arg<void>,
-                                                         right_recv_buf_arg<void>> {
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "allreduce_execution_scale_out_cpu_gw";
-    }
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    using base = execution_kernel<scale_out_cpu_gw_kernel,
-                                  send_buf_size_arg,
-                                  send_buf_arg<processing_type>,
-                                  recv_buf_arg<processing_type>,
-                                  right_send_buf_arg<processing_type>,
-                                  right_recv_buf_arg<processing_type>>;
-
-    template <class ctx_params_t>
-    void bind_data(const ctx_params_t& out_ctx_params) {
-        // TODO not implemented
-        (void)out_ctx_params;
-        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
-    }
-
-    using base::base;
-};
-
-} // namespace allreduce
-} // namespace ring
-} // namespace native
diff --git a/src/common/comm/l0/modules/ring/alltoallv_entry_module.hpp b/src/common/comm/l0/modules/ring/alltoallv_entry_module.hpp
deleted file mode 100644
index e03917339..000000000
--- a/src/common/comm/l0/modules/ring/alltoallv_entry_module.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/modules/ring/alltoallv_export_functions.hpp"
-#include "common/comm/l0/modules/gpu_typed_module.hpp"
-
-namespace native {
-
-DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module,
-                                 real_gpu_typed_module,
-                                 ccl_coll_alltoallv,
-                                 ccl::device_topology_type::ring,
-                                 ring::alltoallv::main_kernel,
-                                 ring::alltoallv::numa_kernel,
-                                 ring::alltoallv::scale_out_cpu_gw_kernel);
-
-DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module,
-                                 ipc_gpu_typed_module,
-                                 ccl_coll_alltoallv,
-                                 ccl::device_topology_type::ring,
-                                 ring::alltoallv::ipc_kernel,
-                                 ring::alltoallv::ipc_kernel,
-                                 ring::alltoallv::ipc_kernel);
-
-DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_alltoallv,
-                                ccl::device_topology_type::ring,
-                                ring::alltoallv::main_kernel,
-                                ring::alltoallv::numa_kernel,
-                                ring::alltoallv::scale_out_cpu_gw_kernel);
-} // namespace native
diff --git a/src/common/comm/l0/modules/ring/alltoallv_export_functions.hpp b/src/common/comm/l0/modules/ring/alltoallv_export_functions.hpp
deleted file mode 100644
index fe71d51a3..000000000
--- a/src/common/comm/l0/modules/ring/alltoallv_export_functions.hpp
+++ /dev/null
@@ -1,289 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/modules/kernel_functions.hpp"
-
-namespace native {
-
-namespace ring {
-
-namespace alltoallv {
-
-/**
- * Common args for all kernel types
- */
-
-using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t*>;
-using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-using send_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 1, size_t*>;
-using send_elem_offsets_buf_arg_type = typename send_elem_offsets_buf_arg::arg_type;
-
-using recv_elem_counts_buf_arg = arg<main_kernel_args::args_start_index + 2, size_t*>;
-using recv_elem_counts_buf_arg_type = typename recv_elem_counts_buf_arg::arg_type;
-
-using recv_elem_offsets_buf_arg = arg<main_kernel_args::args_start_index + 3, size_t*>;
-using recv_elem_offsets_buf_arg_type = typename recv_elem_offsets_buf_arg::arg_type;
-
-template <class native_t>
-using send_buf_arg = arg<main_kernel_args::args_start_index + 4, native_t*>;
-
-template <class native_t>
-using recv_buf_arg = arg<main_kernel_args::args_start_index + 5, native_t*>;
-
-template <class native_t>
-using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 6, native_t*>;
-
-template <class native_t>
-using right_tmp_recv_buf_arg =
-    thread_exchangable_arg<main_kernel_args::args_start_index + 7, native_t*>;
-
-using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 8, int*>;
-using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 9, int*>;
-using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-using proxy_size_flag_arg = external_arg<main_kernel_args::args_start_index + 10, int*>;
-using proxy_size_flag_arg_type = typename proxy_size_flag_arg::arg_type;
-
-using right_income_data_flag_arg =
-    thread_exchangable_arg<main_kernel_args::args_start_index + 11, int*>;
-
-using right_ready_to_recv_flag_arg =
-    thread_exchangable_arg<main_kernel_args::args_start_index + 12, int*>;
-
-using right_proxy_size_flag_arg =
-    thread_exchangable_arg<main_kernel_args::args_start_index + 13, int*>;
-
-// IMPORTANT: the number and types of arguments must be the same in all classes,
-// excluding arguments specific for numa/scaleout etc.
-struct main_kernel
-        : public execution_kernel<main_kernel,
-                                  send_buf_size_arg, // send_elem_counts
-                                  send_elem_offsets_buf_arg, // send_elem_offsets
-                                  recv_elem_counts_buf_arg, // recv_elem_counts_buf
-                                  recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
-                                  send_buf_arg<void>, // send_buf
-                                  recv_buf_arg<void>, // recv_buf
-                                  tmp_recv_buf_arg<void>, // tmp_buffer
-                                  right_tmp_recv_buf_arg<void>, // right_temp_buffer
-                                  income_data_flag_arg, // left_wrote_to_me_flag
-                                  ready_to_recv_flag_arg, // i_ready_to_receive_flag
-                                  proxy_size_flag_arg, // proxy_size_flag
-                                  right_income_data_flag_arg, // i_send_to_right_flag
-                                  right_ready_to_recv_flag_arg, // right_ready_to_recv_flag
-                                  right_proxy_size_flag_arg> // right_proxy_size_flag
-{
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "alltoallv_execution";
-    }
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    using base = execution_kernel<main_kernel,
-                                  send_buf_size_arg, // 0 send_elem_counts
-                                  send_elem_offsets_buf_arg, // 1 send_elem_offsets
-                                  recv_elem_counts_buf_arg, // 2 recv_elem_counts
-                                  recv_elem_offsets_buf_arg, // 3 recv_elem_offsets
-                                  send_buf_arg<processing_type>, // 4 send_buf_arg
-                                  recv_buf_arg<processing_type>, // 5 recv_buf_arg
-                                  tmp_recv_buf_arg<processing_type>, // 6 tmp_buffer
-                                  right_tmp_recv_buf_arg<processing_type>, // 7 right_temp_buffer
-                                  income_data_flag_arg, // 8 left_wrote_to_me_flag
-                                  ready_to_recv_flag_arg, // 9 i_ready_to_receive_flag
-                                  proxy_size_flag_arg, // 10 proxy_size_flag_arg
-                                  right_income_data_flag_arg, // 11 i_send_to_right_flag
-                                  right_ready_to_recv_flag_arg, // 12 right_ready_to_recv_flag
-                                  right_proxy_size_flag_arg>; // 13 right_proxy_size_flag
-
-    using base::base;
-};
-
-// IMPORTANT: the params order is default, see *altoallv*.cl for that
-struct numa_kernel
-        : public execution_kernel<numa_kernel,
-                                  send_buf_size_arg, // send_elem_counts
-                                  send_elem_offsets_buf_arg, // send_elem_offsets
-                                  recv_elem_counts_buf_arg, // recv_elem_counts_buf
-                                  recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
-                                  send_buf_arg<void>, // send_buf
-                                  recv_buf_arg<void>, // recv_buf
-                                  tmp_recv_buf_arg<void>, // tmp_buffer
-                                  right_tmp_recv_buf_arg<void>, // right_temp_buffer
-                                  income_data_flag_arg, // left_wrote_to_me_flag
-                                  ready_to_recv_flag_arg, // i_ready_to_receive_flag
-                                  proxy_size_flag_arg, // proxy_size_flag
-                                  right_income_data_flag_arg, // i_send_to_right_flag
-                                  right_ready_to_recv_flag_arg, // right_ready_to_recv_flag
-                                  right_proxy_size_flag_arg> // right_proxy_size_flag
-{
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "alltoallv_execution_numa";
-    }
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    using base = execution_kernel<numa_kernel,
-                                  send_buf_size_arg, // 0 send_elem_counts
-                                  send_elem_offsets_buf_arg, // 1 send_elem_offsets
-                                  recv_elem_counts_buf_arg, // 2 recv_elem_counts
-                                  recv_elem_offsets_buf_arg, // 3 recv_elem_offsets
-                                  send_buf_arg<processing_type>, // 4 send_buf_arg
-                                  recv_buf_arg<processing_type>, // 5 recv_buf_arg
-                                  tmp_recv_buf_arg<processing_type>, // 6 tmp_buffer
-                                  right_tmp_recv_buf_arg<processing_type>, // 7 right_temp_buffer
-                                  income_data_flag_arg, // 8 left_wrote_to_me_flag
-                                  ready_to_recv_flag_arg, // 9 i_ready_to_receive_flag
-                                  proxy_size_flag_arg, // 10 proxy_size_flag_arg
-                                  right_income_data_flag_arg, // 11 i_send_to_right_flag
-                                  right_ready_to_recv_flag_arg, // 12 right_ready_to_recv_flag
-                                  right_proxy_size_flag_arg>; // 13 right_proxy_size_flag
-
-    template <class ctx_params_t>
-    void bind_data(const ctx_params_t& out_ctx_params) {
-        // TODO not implemented
-        (void)out_ctx_params;
-        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
-    }
-
-    using base::base;
-};
-
-struct ipc_kernel : public base_ipc_kernel<ipc_kernel,
-                                           send_buf_size_arg, // send_elem_counts
-                                           send_elem_offsets_buf_arg, // send_elem_offsets
-                                           recv_elem_counts_buf_arg, // recv_elem_counts_buf
-                                           recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
-                                           send_buf_arg<void>, // send_buf
-                                           recv_buf_arg<void>, // recv_buf
-                                           tmp_recv_buf_arg<void>, // tmp_buffer
-                                           right_tmp_recv_buf_arg<void>, // right_temp_buffer
-                                           income_data_flag_arg, // left_wrote_to_me_flag
-                                           ready_to_recv_flag_arg, // i_ready_to_receive_flag
-                                           proxy_size_flag_arg, // proxy_size_flag
-                                           right_income_data_flag_arg, // i_send_to_right_flag
-                                           right_ready_to_recv_flag_arg, // right_ready_to_recv_flag
-                                           right_proxy_size_flag_arg> // right_proxy_size_flag
-{
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "ring_alltoallv_ipc";
-    }
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    using base = base_ipc_kernel<ipc_kernel,
-                                 send_buf_size_arg, // 0 send_elem_counts
-                                 send_elem_offsets_buf_arg, // 1 send_elem_offsets
-                                 recv_elem_counts_buf_arg, // 2 recv_elem_counts
-                                 recv_elem_offsets_buf_arg, // 3 recv_elem_offsets
-                                 send_buf_arg<processing_type>, // 4 send_buf_arg
-                                 recv_buf_arg<processing_type>, // 5 recv_buf_arg
-                                 tmp_recv_buf_arg<processing_type>, // 6 tmp_buffer
-                                 right_tmp_recv_buf_arg<processing_type>, // 7 right_temp_buffer
-                                 income_data_flag_arg, // 8 left_wrote_to_me_flag
-                                 ready_to_recv_flag_arg, // 9 i_ready_to_receive_flag
-                                 proxy_size_flag_arg, // 10 proxy_size_flag_arg
-                                 right_income_data_flag_arg, // 11 i_send_to_right_flag
-                                 right_ready_to_recv_flag_arg, // 12 right_ready_to_recv_flag
-                                 right_proxy_size_flag_arg>; // 13 right_proxy_size_flag
-
-    template <class ipc_handles_t>
-    void bind_data(const ipc_handles_t& ipc_handles) {
-        auto tmp_recv_buf = reinterpret_cast<typename tmp_recv_buf_arg<processing_type>::arg_type>(
-            ipc_handles.at(0).get().pointer);
-        this->template set_arg<tmp_recv_buf_arg<processing_type>>(tmp_recv_buf);
-
-        auto income_data_flag =
-            reinterpret_cast<income_data_flag_arg_type>(ipc_handles.at(1).get().pointer);
-        this->template set_arg<income_data_flag_arg>(income_data_flag);
-
-        auto ready_to_recv_flag =
-            reinterpret_cast<ready_to_recv_flag_arg_type>(ipc_handles.at(2).get().pointer);
-        this->template set_arg<ready_to_recv_flag_arg>(ready_to_recv_flag);
-
-        auto proxy_size_flag =
-            reinterpret_cast<proxy_size_flag_arg_type>(ipc_handles.at(3).get().pointer);
-        this->template set_arg<proxy_size_flag_arg>(proxy_size_flag);
-    }
-
-    using base::base;
-};
-
-struct scale_out_cpu_gw_kernel
-        : public execution_kernel<scale_out_cpu_gw_kernel,
-                                  send_buf_size_arg, // send_elem_counts
-                                  send_elem_offsets_buf_arg, // send_elem_offsets
-                                  recv_elem_counts_buf_arg, // recv_elem_counts_buf
-                                  recv_elem_offsets_buf_arg, // recv_elem_offsets_buf
-                                  send_buf_arg<void>, // send_buf
-                                  recv_buf_arg<void>, // recv_buf
-                                  tmp_recv_buf_arg<void>, // tmp_buffer
-                                  right_tmp_recv_buf_arg<void>, // right_temp_buffer
-                                  income_data_flag_arg, // left_wrote_to_me_flag
-                                  ready_to_recv_flag_arg, // i_ready_to_receive_flag
-                                  proxy_size_flag_arg, // proxy_size_flag
-                                  right_income_data_flag_arg, // i_send_to_right_flag
-                                  right_ready_to_recv_flag_arg, // right_ready_to_recv_flag
-                                  right_proxy_size_flag_arg> // right_proxy_size_flag
-{
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "alltoallv_execution_scale_out_cpu_gw";
-    }
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    using base = execution_kernel<scale_out_cpu_gw_kernel,
-                                  send_buf_size_arg, // 0 send_elem_counts
-                                  send_elem_offsets_buf_arg, // 1 send_elem_offsets
-                                  recv_elem_counts_buf_arg, // 2 recv_elem_counts
-                                  recv_elem_offsets_buf_arg, // 3 recv_elem_offsets
-                                  send_buf_arg<processing_type>, // 4 send_buf_arg
-                                  recv_buf_arg<processing_type>, // 5 recv_buf_arg
-                                  tmp_recv_buf_arg<processing_type>, // 6 tmp_buffer
-                                  right_tmp_recv_buf_arg<processing_type>, // 7 right_temp_buffer
-                                  income_data_flag_arg, // 8 left_wrote_to_me_flag
-                                  ready_to_recv_flag_arg, // 9 i_ready_to_receive_flag
-                                  proxy_size_flag_arg, // 10 proxy_size_flag_arg
-                                  right_income_data_flag_arg, // 11 i_send_to_right_flag
-                                  right_ready_to_recv_flag_arg, // 12 right_ready_to_recv_flag
-                                  right_proxy_size_flag_arg>; // 13 right_proxy_size_flag
-
-    template <class ctx_params_t>
-    void bind_data(const ctx_params_t& out_ctx_params) {
-        // TODO not implemented
-        (void)out_ctx_params;
-        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
-    }
-
-    using base::base;
-};
-
-} // namespace alltoallv
-} // namespace ring
-} // namespace native
diff --git a/src/common/comm/l0/modules/ring/bcast_entry_module.hpp b/src/common/comm/l0/modules/ring/bcast_entry_module.hpp
deleted file mode 100644
index c308d25db..000000000
--- a/src/common/comm/l0/modules/ring/bcast_entry_module.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/modules/ring/bcast_export_functions.hpp"
-#include "common/comm/l0/modules/gpu_typed_module.hpp"
-
-namespace native {
-
-DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module,
-                                 real_gpu_typed_module,
-                                 ccl_coll_bcast,
-                                 ccl::device_topology_type::ring,
-                                 ring::bcast::main_kernel,
-                                 ring::bcast::numa_kernel,
-                                 ring::bcast::scale_out_cpu_gw_kernel);
-
-DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module,
-                                 ipc_gpu_typed_module,
-                                 ccl_coll_bcast,
-                                 ccl::device_topology_type::ring,
-                                 ring::bcast::ipc_kernel,
-                                 ring::bcast::ipc_kernel,
-                                 ring::bcast::ipc_kernel);
-
-DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_bcast,
-                                ccl::device_topology_type::ring,
-                                ring::bcast::main_kernel,
-                                ring::bcast::numa_kernel,
-                                ring::bcast::scale_out_cpu_gw_kernel);
-} // namespace native
diff --git a/src/common/comm/l0/modules/ring/bcast_export_functions.hpp b/src/common/comm/l0/modules/ring/bcast_export_functions.hpp
deleted file mode 100644
index 67bd72301..000000000
--- a/src/common/comm/l0/modules/ring/bcast_export_functions.hpp
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/modules/kernel_functions.hpp"
-
-namespace native {
-
-namespace ring {
-
-namespace bcast {
-
-/**
- * Common args for all kernel types
- */
-
-template <class native_t>
-using buf_arg = thread_exchangable_arg<main_kernel_args::args_start_index, native_t*>;
-
-template <class native_t>
-using right_buf_arg = thread_exchangable_arg<main_kernel_args::args_start_index + 1, native_t*>;
-
-using root_arg = arg<main_kernel_args::args_start_index + 2, size_t>;
-using root_arg_type = typename root_arg::arg_type;
-
-// IMPORTANT: the number and types of arguments must be the same in all classes,
-// excluding arguments specific for numa/scaleout etc.
-struct main_kernel
-        : public execution_kernel<main_kernel, buf_arg<void>, right_buf_arg<void>, root_arg> {
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "bcast_execution";
-    }
-
-    using common_entry_buf_arg = buf_arg<processing_type>;
-
-    using base = execution_kernel<main_kernel,
-                                  buf_arg<processing_type>,
-                                  right_buf_arg<processing_type>,
-                                  root_arg>;
-
-    using base::base;
-};
-
-struct numa_kernel
-        : public execution_kernel<numa_kernel, buf_arg<void>, right_buf_arg<void>, root_arg> {
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "bcast_execution_numa";
-    }
-
-    using common_entry_buf_arg = buf_arg<processing_type>;
-
-    using base = execution_kernel<numa_kernel,
-                                  buf_arg<processing_type>,
-                                  right_buf_arg<processing_type>,
-                                  root_arg>;
-
-    template <class ctx_params_t>
-    void bind_data(const ctx_params_t& out_ctx_params) {
-        // TODO not implemented
-        (void)out_ctx_params;
-        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
-    }
-
-    using base::base;
-};
-
-struct ipc_kernel : public base_ipc_kernel<ipc_kernel,
-                                           buf_arg<void>,
-                                           stub_arg<main_kernel_args::args_start_index + 1>,
-                                           stub_arg<main_kernel_args::args_start_index + 2>> {
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "ring_bcast_ipc";
-    }
-
-    using common_entry_buf_arg = buf_arg<processing_type>;
-
-    using base = base_ipc_kernel<ipc_kernel,
-                                 buf_arg<processing_type>,
-                                 stub_arg<main_kernel_args::args_start_index + 1>,
-                                 stub_arg<main_kernel_args::args_start_index + 2>>;
-
-    template <class ipc_handles_t>
-    void bind_data(const ipc_handles_t& ipc_handles) {
-        auto recv_buf = reinterpret_cast<typename buf_arg<processing_type>::arg_type>(
-            ipc_handles.at(0).get().pointer);
-        this->template set_arg<buf_arg<processing_type>>(recv_buf);
-    }
-
-    using base::base;
-};
-
-struct scale_out_cpu_gw_kernel : public execution_kernel<scale_out_cpu_gw_kernel,
-                                                         buf_arg<void>,
-                                                         right_buf_arg<void>,
-                                                         root_arg> {
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "bcast_execution_scale_out_cpu_gw";
-    }
-
-    using common_entry_buf_arg = buf_arg<processing_type>;
-
-    using base = execution_kernel<scale_out_cpu_gw_kernel,
-                                  buf_arg<processing_type>,
-                                  right_buf_arg<processing_type>,
-                                  root_arg>;
-
-    template <class ctx_params_t>
-    void bind_data(const ctx_params_t& out_ctx_params) {
-        // TODO not implemented
-        (void)out_ctx_params;
-        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
-    }
-
-    using base::base;
-};
-
-} // namespace bcast
-} // namespace ring
-} // namespace native
diff --git a/src/common/comm/l0/modules/ring/reduce_entry_module.hpp b/src/common/comm/l0/modules/ring/reduce_entry_module.hpp
deleted file mode 100644
index 9f3241e33..000000000
--- a/src/common/comm/l0/modules/ring/reduce_entry_module.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/modules/ring/reduce_export_functions.hpp"
-#include "common/comm/l0/modules/gpu_typed_module.hpp"
-
-namespace native {
-
-DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module,
-                                 real_gpu_typed_module,
-                                 ccl_coll_reduce,
-                                 ccl::device_topology_type::ring,
-                                 ring::reduce::main_kernel,
-                                 ring::reduce::numa_kernel,
-                                 ring::reduce::scale_out_cpu_gw_kernel);
-
-DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module,
-                                 ipc_gpu_typed_module,
-                                 ccl_coll_reduce,
-                                 ccl::device_topology_type::ring,
-                                 ring::reduce::ipc_kernel,
-                                 ring::reduce::ipc_kernel,
-                                 ring::reduce::ipc_kernel);
-
-DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_reduce,
-                                ccl::device_topology_type::ring,
-                                ring::reduce::main_kernel,
-                                ring::reduce::numa_kernel,
-                                ring::reduce::scale_out_cpu_gw_kernel);
-} // namespace native
diff --git a/src/common/comm/l0/modules/ring/reduce_export_functions.hpp b/src/common/comm/l0/modules/ring/reduce_export_functions.hpp
deleted file mode 100644
index a07e27087..000000000
--- a/src/common/comm/l0/modules/ring/reduce_export_functions.hpp
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/modules/kernel_functions.hpp"
-
-namespace native {
-
-namespace ring {
-
-namespace reduce {
-
-/**
- * Common args for all kernel types
- */
-
-using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-using send_buf_size_arg_type = typename send_buf_size_arg::arg_type;
-
-template <class native_t>
-using send_buf_arg = arg<main_kernel_args::args_start_index + 1, native_t*>;
-
-template <class native_t>
-using recv_buf_arg = arg<main_kernel_args::args_start_index + 2, native_t*>;
-
-template <class native_t>
-using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 3, native_t*>;
-
-using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 4, int*>;
-using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 5, int*>;
-using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
-using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
-
-template <class native_t>
-using right_tmp_recv_buf_arg =
-    thread_exchangable_arg<main_kernel_args::args_start_index + 7, native_t*>;
-
-using right_income_data_flag_arg =
-    thread_exchangable_arg<main_kernel_args::args_start_index + 8, int*>;
-using right_income_data_flag_arg_type = typename right_income_data_flag_arg::arg_type;
-
-using right_ready_to_recv_flag_arg =
-    thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>;
-using right_ready_to_recv_flag_arg_type = typename right_ready_to_recv_flag_arg::arg_type;
-
-using root_arg = arg<main_kernel_args::args_start_index + 10, size_t>;
-using root_arg_type = typename root_arg::arg_type;
-
-// IMPORTANT: the number and types of arguments must be the same in all classes,
-// excluding arguments specific for numa/scaleout etc.
-struct main_kernel : public execution_kernel<main_kernel,
-                                             send_buf_size_arg,
-                                             send_buf_arg<void>,
-                                             recv_buf_arg<void>,
-                                             tmp_recv_buf_arg<void>,
-                                             income_data_flag_arg,
-                                             ready_to_recv_flag_arg,
-                                             local_barrier_flag_arg,
-                                             right_tmp_recv_buf_arg<void>,
-                                             right_income_data_flag_arg,
-                                             right_ready_to_recv_flag_arg,
-                                             root_arg> {
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "reduce_execution";
-    }
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    using base = execution_kernel<main_kernel,
-                                  send_buf_size_arg,
-                                  send_buf_arg<processing_type>,
-                                  recv_buf_arg<processing_type>,
-                                  tmp_recv_buf_arg<processing_type>,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg<processing_type>,
-                                  right_income_data_flag_arg,
-                                  right_ready_to_recv_flag_arg,
-                                  root_arg>;
-
-    using base::base;
-};
-
-struct numa_kernel
-        : public execution_kernel<numa_kernel,
-                                  send_buf_size_arg,
-                                  send_buf_arg<void>,
-                                  recv_buf_arg<void>,
-                                  tmp_recv_buf_arg<void>,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg<void>,
-                                  right_income_data_flag_arg,
-                                  right_ready_to_recv_flag_arg,
-                                  root_arg,
-
-                                  // numa-specific args
-                                  permanent_arg<main_kernel_args::args_start_index + 11, void*>,
-                                  permanent_arg<main_kernel_args::args_start_index + 12, int*>> {
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "reduce_execution_numa";
-    }
-
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    // event data
-    using event_prod_chunk_mem_arg = permanent_arg<main_kernel_args::args_start_index + 11, void*>;
-    using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type;
-
-    using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 12, int*>;
-    using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type;
-
-    using base = execution_kernel<numa_kernel,
-                                  send_buf_size_arg,
-                                  send_buf_arg<processing_type>,
-                                  recv_buf_arg<processing_type>,
-                                  tmp_recv_buf_arg<processing_type>,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg<processing_type>,
-                                  right_income_data_flag_arg,
-                                  right_ready_to_recv_flag_arg,
-                                  root_arg,
-                                  event_prod_chunk_mem_arg,
-                                  event_prod_bytes_arg>;
-
-    template <class ctx_params_t>
-    void bind_data(const ctx_params_t& out_ctx_params) {
-        // TODO not implemented
-        (void)out_ctx_params;
-        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
-    }
-
-    using base::base;
-};
-
-struct ipc_kernel : public base_ipc_kernel<ipc_kernel,
-                                           stub_arg<main_kernel_args::args_start_index>,
-                                           stub_arg<main_kernel_args::args_start_index + 1>,
-                                           stub_arg<main_kernel_args::args_start_index + 2>,
-                                           tmp_recv_buf_arg<void>,
-                                           income_data_flag_arg,
-                                           ready_to_recv_flag_arg,
-                                           stub_arg<main_kernel_args::args_start_index + 6>,
-                                           stub_arg<main_kernel_args::args_start_index + 7>,
-                                           stub_arg<main_kernel_args::args_start_index + 8>,
-                                           stub_arg<main_kernel_args::args_start_index + 9>,
-                                           stub_arg<main_kernel_args::args_start_index + 10>> {
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "ring_reduce_ipc";
-    }
-
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    using base = base_ipc_kernel<ipc_kernel,
-                                 stub_arg<main_kernel_args::args_start_index>,
-                                 stub_arg<main_kernel_args::args_start_index + 1>,
-                                 stub_arg<main_kernel_args::args_start_index + 2>,
-                                 tmp_recv_buf_arg<processing_type>,
-                                 income_data_flag_arg,
-                                 ready_to_recv_flag_arg,
-                                 stub_arg<main_kernel_args::args_start_index + 6>,
-                                 stub_arg<main_kernel_args::args_start_index + 7>,
-                                 stub_arg<main_kernel_args::args_start_index + 8>,
-                                 stub_arg<main_kernel_args::args_start_index + 9>,
-                                 stub_arg<main_kernel_args::args_start_index + 10>>;
-
-    template <class ipc_handles_t>
-    void bind_data(const ipc_handles_t& ipc_handles) {
-        auto tmp_recv_buf = reinterpret_cast<typename tmp_recv_buf_arg<processing_type>::arg_type>(
-            ipc_handles.at(0).get().pointer);
-        this->template set_arg<tmp_recv_buf_arg<processing_type>>(tmp_recv_buf);
-
-        auto income_data_flag =
-            reinterpret_cast<income_data_flag_arg_type>(ipc_handles.at(1).get().pointer);
-        this->template set_arg<income_data_flag_arg>(income_data_flag);
-
-        auto ready_to_recv_flag =
-            reinterpret_cast<ready_to_recv_flag_arg_type>(ipc_handles.at(2).get().pointer);
-        this->template set_arg<ready_to_recv_flag_arg>(ready_to_recv_flag);
-    }
-
-    using base::base;
-};
-
-struct scale_out_cpu_gw_kernel
-        : public execution_kernel<scale_out_cpu_gw_kernel,
-                                  send_buf_size_arg,
-                                  send_buf_arg<void>,
-                                  recv_buf_arg<void>,
-                                  tmp_recv_buf_arg<void>,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg<void>,
-                                  right_income_data_flag_arg,
-                                  right_ready_to_recv_flag_arg,
-                                  root_arg,
-
-                                  // numa-specific args
-                                  permanent_arg<main_kernel_args::args_start_index + 11, void*>,
-                                  permanent_arg<main_kernel_args::args_start_index + 12, int*>> {
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "reduce_execution_scale_out_cpu_gw";
-    }
-
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    // event data
-    using event_prod_chunk_mem_arg =
-        permanent_arg<main_kernel_args::args_start_index + 11, processing_type*>;
-    using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type;
-
-    using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 12, int*>;
-    using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type;
-
-    using base = execution_kernel<scale_out_cpu_gw_kernel,
-                                  send_buf_size_arg,
-                                  send_buf_arg<processing_type>,
-                                  recv_buf_arg<processing_type>,
-                                  tmp_recv_buf_arg<processing_type>,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  local_barrier_flag_arg,
-                                  right_tmp_recv_buf_arg<processing_type>,
-                                  right_income_data_flag_arg,
-                                  right_ready_to_recv_flag_arg,
-                                  root_arg,
-                                  event_prod_chunk_mem_arg,
-                                  event_prod_bytes_arg>;
-
-    template <class ctx_params_t>
-    void bind_data(const ctx_params_t& out_ctx_params) {
-        // TODO not implemented
-        (void)out_ctx_params;
-        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
-    }
-
-    using base::base;
-};
-
-} // namespace reduce
-} // namespace ring
-} // namespace native
diff --git a/src/common/comm/l0/modules/ring/reduce_scatter_entry_module.hpp b/src/common/comm/l0/modules/ring/reduce_scatter_entry_module.hpp
deleted file mode 100644
index 44ae2a55a..000000000
--- a/src/common/comm/l0/modules/ring/reduce_scatter_entry_module.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/modules/ring/reduce_scatter_export_functions.hpp"
-#include "common/comm/l0/modules/gpu_typed_module.hpp"
-
-namespace native {
-
-DEFINE_SPECIFIC_GPU_MODULE_CLASS(device_coll_module,
-                                 real_gpu_typed_module,
-                                 ccl_coll_reduce_scatter,
-                                 ccl::device_topology_type::ring,
-                                 ring::reduce_scatter::main_kernel,
-                                 ring::reduce_scatter::numa_kernel,
-                                 ring::reduce_scatter::scale_out_cpu_gw_kernel);
-
-DEFINE_SPECIFIC_GPU_MODULE_CLASS(ipc_dst_device_coll_module,
-                                 ipc_gpu_typed_module,
-                                 ccl_coll_reduce_scatter,
-                                 ccl::device_topology_type::ring,
-                                 ring::reduce_scatter::ipc_kernel,
-                                 ring::reduce_scatter::ipc_kernel,
-                                 ring::reduce_scatter::ipc_kernel);
-
-DEFINE_VIRTUAL_GPU_MODULE_CLASS(ccl_coll_reduce_scatter,
-                                ccl::device_topology_type::ring,
-                                ring::reduce_scatter::main_kernel,
-                                ring::reduce_scatter::numa_kernel,
-                                ring::reduce_scatter::scale_out_cpu_gw_kernel);
-} // namespace native
diff --git a/src/common/comm/l0/modules/ring/reduce_scatter_export_functions.hpp b/src/common/comm/l0/modules/ring/reduce_scatter_export_functions.hpp
deleted file mode 100644
index f1f3789ff..000000000
--- a/src/common/comm/l0/modules/ring/reduce_scatter_export_functions.hpp
+++ /dev/null
@@ -1,278 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/modules/kernel_functions.hpp"
-
-namespace native {
-
-namespace ring {
-
-namespace reduce_scatter {
-
-/**
- * Common args for all kernel types
- */
-
-using send_buf_size_arg = arg<main_kernel_args::args_start_index, size_t>;
-
-// TODO: since we use only a single type, remove template parameter here
-template <class native_t>
-using send_buf_arg = arg<main_kernel_args::args_start_index + 1, native_t*>;
-
-template <class native_t>
-using recv_buf_arg = external_arg<main_kernel_args::args_start_index + 2, native_t*>;
-
-template <class native_t>
-using tmp_recv_buf_arg = external_arg<main_kernel_args::args_start_index + 3, native_t*>;
-
-using income_data_flag_arg = external_arg<main_kernel_args::args_start_index + 4, int*>;
-using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
-
-using ready_to_recv_flag_arg = external_arg<main_kernel_args::args_start_index + 5, int*>;
-using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
-
-using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
-using local_barrier_flag_arg_type = typename local_barrier_flag_arg::arg_type;
-
-template <class native_t>
-using right_output_buf_arg =
-    thread_exchangable_arg<main_kernel_args::args_start_index + 7, native_t*>;
-
-template <class native_t>
-using right_tmp_recv_buf_arg =
-    thread_exchangable_arg<main_kernel_args::args_start_index + 8, native_t*>;
-
-using right_income_data_flag_arg =
-    thread_exchangable_arg<main_kernel_args::args_start_index + 9, int*>;
-
-using right_ready_to_recv_flag_arg =
-    thread_exchangable_arg<main_kernel_args::args_start_index + 10, int*>;
-
-// IMPORTANT: the number and types of arguments must be the same in all classes,
-// excluding arguments specific for numa/scaleout etc.
-struct main_kernel
-        : public execution_kernel<main_kernel,
-                                  send_buf_size_arg, // recv_count
-                                  send_buf_arg<void>, // send_buf
-                                  recv_buf_arg<void>, // recv_buf (output_buffer)
-                                  tmp_recv_buf_arg<void>, // tmp_buf
-                                  income_data_flag_arg, // left_wrote_to_me_flag
-                                  ready_to_recv_flag_arg, // i_ready_to_receive_flag
-                                  local_barrier_flag_arg, // local_barrier_flag
-                                  right_output_buf_arg<void>, // right_output_buffer
-                                  right_tmp_recv_buf_arg<void>, // right_temp_buffer
-                                  right_income_data_flag_arg, // i_send_to_right_flag
-                                  right_ready_to_recv_flag_arg> { // right_ready_to_recv_flag
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "reduce_scatter_execution";
-    }
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    using base = execution_kernel<main_kernel,
-                                  send_buf_size_arg,
-                                  send_buf_arg<processing_type>,
-                                  recv_buf_arg<processing_type>,
-                                  tmp_recv_buf_arg<processing_type>,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  local_barrier_flag_arg,
-                                  right_output_buf_arg<processing_type>,
-                                  right_tmp_recv_buf_arg<processing_type>,
-                                  right_income_data_flag_arg,
-                                  right_ready_to_recv_flag_arg>;
-
-    using base::base;
-};
-
-struct numa_kernel
-        : public execution_kernel<numa_kernel,
-                                  send_buf_size_arg, // recv_count
-                                  send_buf_arg<void>, // send_buf
-                                  recv_buf_arg<void>, // recv_buf (output_buffer)
-                                  tmp_recv_buf_arg<void>, // tmp_buf
-                                  income_data_flag_arg, // left_wrote_to_me_flag
-                                  ready_to_recv_flag_arg, // i_ready_to_receive_flag
-                                  local_barrier_flag_arg, // local_barrier_flag
-                                  right_output_buf_arg<void>, // right_output_buffer
-                                  right_tmp_recv_buf_arg<void>, // right_temp_buffer
-                                  right_income_data_flag_arg, // i_send_to_right_flag
-                                  right_ready_to_recv_flag_arg, // right_ready_to_recv_flag
-
-                                  // numa-specific args
-                                  permanent_arg<main_kernel_args::args_start_index + 11, size_t>,
-                                  permanent_arg<main_kernel_args::args_start_index + 12, void*>> {
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "reduce_scatter_execution_numa";
-    }
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    // event data
-    using event_prod_chunk_mem_arg = permanent_arg<main_kernel_args::args_start_index + 11, size_t>;
-    using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type;
-
-    using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 12, void*>;
-    using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type;
-
-    using base = execution_kernel<numa_kernel,
-                                  send_buf_size_arg,
-                                  send_buf_arg<processing_type>,
-                                  recv_buf_arg<processing_type>,
-                                  tmp_recv_buf_arg<processing_type>,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  local_barrier_flag_arg,
-                                  right_output_buf_arg<processing_type>,
-                                  right_tmp_recv_buf_arg<processing_type>,
-                                  right_income_data_flag_arg,
-                                  right_ready_to_recv_flag_arg,
-                                  event_prod_chunk_mem_arg,
-                                  event_prod_bytes_arg>;
-
-    template <class ctx_params_t>
-    void bind_data(const ctx_params_t& out_ctx_params) {
-        // TODO not implemented
-        (void)out_ctx_params;
-        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
-    }
-
-    using base::base;
-};
-
-struct ipc_kernel : public base_ipc_kernel<ipc_kernel,
-                                           stub_arg<main_kernel_args::args_start_index>,
-                                           stub_arg<main_kernel_args::args_start_index + 1>,
-                                           recv_buf_arg<void>, // recv_buf (output_buffer)
-                                           tmp_recv_buf_arg<void>, // tmp_buf
-                                           income_data_flag_arg, // left_wrote_to_me_flag
-                                           ready_to_recv_flag_arg,
-                                           stub_arg<main_kernel_args::args_start_index + 6>,
-                                           stub_arg<main_kernel_args::args_start_index + 7>,
-                                           stub_arg<main_kernel_args::args_start_index + 8>,
-                                           stub_arg<main_kernel_args::args_start_index + 9>,
-                                           stub_arg<main_kernel_args::args_start_index + 10>> {
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "ring_reduce_scatter_ipc";
-    }
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    using base = base_ipc_kernel<ipc_kernel,
-                                 stub_arg<main_kernel_args::args_start_index>,
-                                 stub_arg<main_kernel_args::args_start_index + 1>,
-                                 recv_buf_arg<processing_type>,
-                                 tmp_recv_buf_arg<processing_type>,
-                                 income_data_flag_arg,
-                                 ready_to_recv_flag_arg,
-                                 stub_arg<main_kernel_args::args_start_index + 6>,
-                                 stub_arg<main_kernel_args::args_start_index + 7>,
-                                 stub_arg<main_kernel_args::args_start_index + 8>,
-                                 stub_arg<main_kernel_args::args_start_index + 9>,
-                                 stub_arg<main_kernel_args::args_start_index + 10>>;
-
-    template <class ipc_handles_t>
-    void bind_data(const ipc_handles_t& ipc_handles) {
-        auto recv_buf = reinterpret_cast<typename recv_buf_arg<processing_type>::arg_type>(
-            ipc_handles.at(0).get().pointer);
-        this->template set_arg<recv_buf_arg<processing_type>>(recv_buf);
-
-        auto tmp_recv_buf = reinterpret_cast<typename tmp_recv_buf_arg<processing_type>::arg_type>(
-            ipc_handles.at(1).get().pointer);
-        this->template set_arg<tmp_recv_buf_arg<processing_type>>(tmp_recv_buf);
-
-        auto income_data_flag =
-            reinterpret_cast<income_data_flag_arg_type>(ipc_handles.at(2).get().pointer);
-        this->template set_arg<income_data_flag_arg>(income_data_flag);
-
-        auto ready_to_recv_flag =
-            reinterpret_cast<ready_to_recv_flag_arg_type>(ipc_handles.at(3).get().pointer);
-        this->template set_arg<ready_to_recv_flag_arg>(ready_to_recv_flag);
-    }
-
-    using base::base;
-};
-
-struct scale_out_cpu_gw_kernel
-        : public execution_kernel<scale_out_cpu_gw_kernel,
-                                  send_buf_size_arg, // recv_count
-                                  send_buf_arg<void>, // send_buf
-                                  recv_buf_arg<void>, // recv_buf (output_buffer)
-                                  tmp_recv_buf_arg<void>, // tmp_buf
-                                  income_data_flag_arg, // left_wrote_to_me_flag
-                                  ready_to_recv_flag_arg, // i_ready_to_receive_flag
-                                  local_barrier_flag_arg, // local_barrier_flag
-                                  right_output_buf_arg<void>, // right_output_buffer
-                                  right_tmp_recv_buf_arg<void>, // right_temp_buffer
-                                  right_income_data_flag_arg, // i_send_to_right_flag
-                                  right_ready_to_recv_flag_arg, // right_ready_to_recv_flag
-
-                                  // scaleout-specific args
-                                  permanent_arg<main_kernel_args::args_start_index + 11, size_t>,
-                                  permanent_arg<main_kernel_args::args_start_index + 12, void*>> {
-    using processing_type = void;
-
-    static constexpr const char* specific_name() {
-        return "reduce_scatter_execution_scale_out_cpu_gw";
-    }
-
-    using common_entry_buf_size_arg = send_buf_size_arg;
-    using common_entry_buf_arg = send_buf_arg<processing_type>;
-
-    // event data
-    using event_prod_chunk_mem_arg = permanent_arg<main_kernel_args::args_start_index + 11, size_t>;
-    using event_prod_chunk_mem_arg_type = typename event_prod_chunk_mem_arg::arg_type;
-
-    using event_prod_bytes_arg = permanent_arg<main_kernel_args::args_start_index + 12, void*>;
-    using event_prod_bytes_arg_type = typename event_prod_bytes_arg::arg_type;
-
-    using base = execution_kernel<scale_out_cpu_gw_kernel,
-                                  send_buf_size_arg,
-                                  send_buf_arg<processing_type>,
-                                  recv_buf_arg<processing_type>,
-                                  tmp_recv_buf_arg<processing_type>,
-                                  income_data_flag_arg,
-                                  ready_to_recv_flag_arg,
-                                  local_barrier_flag_arg,
-                                  right_output_buf_arg<processing_type>,
-                                  right_tmp_recv_buf_arg<processing_type>,
-                                  right_income_data_flag_arg,
-                                  right_ready_to_recv_flag_arg,
-                                  event_prod_chunk_mem_arg,
-                                  event_prod_bytes_arg>;
-
-    template <class ctx_params_t>
-    void bind_data(const ctx_params_t& out_ctx_params) {
-        // TODO not implemented
-        (void)out_ctx_params;
-        throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for that kernel type");
-    }
-
-    using base::base;
-};
-
-} // namespace reduce_scatter
-} // namespace ring
-} // namespace native
diff --git a/src/common/comm/l0/modules/specific_modules_source_data.hpp b/src/common/comm/l0/modules/specific_modules_source_data.hpp
deleted file mode 100644
index 34739ab13..000000000
--- a/src/common/comm/l0/modules/specific_modules_source_data.hpp
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "coll/coll.hpp"
-#include "common/comm/l0/modules/modules_source_data.hpp"
-namespace native {
-
-using specific_modules_source_data_storage = modules_src_container<CCL_COLL_TYPE_LIST>;
-}
diff --git a/src/common/comm/l0/modules/supported_modules.hpp b/src/common/comm/l0/modules/supported_modules.hpp
deleted file mode 100644
index 6c6b0348b..000000000
--- a/src/common/comm/l0/modules/supported_modules.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/modules/base_entry_module.hpp"
-
-namespace native {
-// alias for topologies
-template <template <ccl_coll_type, ccl::group_split_type, ccl::device_topology_type>
-          class module_impl,
-          ccl_coll_type type,
-          ccl::group_split_type group_id,
-          ccl::device_topology_type... class_ids>
-using topology_device_classes_modules_for_group_id =
-    std::tuple<std::shared_ptr<module_impl<type, group_id, class_ids>>...>;
-
-template <template <ccl_coll_type, ccl::group_split_type, ccl::device_topology_type>
-          class module_impl,
-          ccl_coll_type type,
-          ccl::group_split_type... top_types>
-using topology_device_group_modules = std::tuple<
-    topology_device_classes_modules_for_group_id<module_impl,
-                                                 type,
-                                                 top_types,
-                                                 SUPPORTED_TOPOLOGY_CLASSES_DECL_LIST>...>;
-
-// alias for coll types
-template <template <ccl_coll_type, ccl::group_split_type, ccl::device_topology_type>
-          class module_impl,
-          ccl_coll_type... types>
-using supported_topology_device_modules = std::tuple<
-    topology_device_group_modules<module_impl, types, SUPPORTED_HW_TOPOLOGIES_DECL_LIST>...>;
-
-// alias for implemented modules
-template <template <ccl_coll_type, ccl::group_split_type, ccl::device_topology_type>
-          class module_impl>
-using supported_device_modules = supported_topology_device_modules<module_impl, CCL_COLL_TYPE_LIST>;
-
-using supported_device_modules1 = std::array<int, 1>;
-} // namespace native
diff --git a/src/common/comm/l0/modules_connector.hpp b/src/common/comm/l0/modules_connector.hpp
deleted file mode 100644
index 4f1199b9b..000000000
--- a/src/common/comm/l0/modules_connector.hpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/base_connector.hpp"
-#include "common/utils/tuple.hpp"
-
-template <class managed_kernel, class entry, class... binded_kernels>
-struct kernel_connector : public base_connector_interface<managed_kernel> {
-    using base = base_connector_interface<managed_kernel>;
-    using binding_type = std::tuple<std::reference_wrapper<binded_kernels>...>;
-    kernel_connector(entry& e, binded_kernels&... args)
-            : base(),
-              executor(e),
-              deferred_kernels(args...) {}
-
-    bool operator()(managed_kernel& kernel_to_connect) override {
-        return connect_impl(
-            kernel_to_connect,
-            typename sequence_generator<std::tuple_size<binding_type>::value>::type());
-    }
-
-private:
-    template <int... connected_arguments_indices>
-    bool connect_impl(managed_kernel& kernel_to_connect,
-                      numeric_sequence<connected_arguments_indices...>) {
-        return executor.execute(kernel_to_connect,
-                                std::get<connected_arguments_indices>(deferred_kernels).get()...);
-    }
-
-    entry& executor;
-    binding_type deferred_kernels;
-};
diff --git a/src/common/comm/l0/scheduler/allied_process_group_scheduler.hpp b/src/common/comm/l0/scheduler/allied_process_group_scheduler.hpp
deleted file mode 100644
index f16c9cb8b..000000000
--- a/src/common/comm/l0/scheduler/allied_process_group_scheduler.hpp
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-
-#pragma once
-#include "common/utils/spinlock.hpp"
-#include "sched/gpu_concurrent_sched.hpp"
-// #include "sched/entry/l0/l0_allreduce_typed_entry.hpp"
-// #include "sched/entry/l0/l0_allgather_handles_entry.hpp"
-#include "sched/entry/factory/entry_factory.hpp"
-#include "common/comm/l0/device_community.hpp"
-#include "common/comm/l0/scheduler/thread_group_scheduler.hpp"
-
-namespace native {
-
-//template<ccl_coll_type entry_type>
-struct allied_process_group_scheduler : public thread_group_scheduler {
-    using base = thread_group_scheduler;
-    using thread_group_scheduler::schedule_ptr;
-    using thread_group_scheduler::thread_schedule_ptr;
-    //create schedule
-    /*
-     static constexpr ccl_coll_type type() noexcept
-    {
-        return entry_type;
-    }
-  */
-    allied_process_group_scheduler(size_t process_count,
-                                   size_t threads_count,
-                                   std::shared_ptr<ccl::host_communicator> communicator,
-                                   device_storage& node_devices)
-            : base(threads_count),
-              ccl_communicator(communicator) /*,
-              node_total_devices(node_devices)*/
-    {}
-
-    template <class EntryType,
-              ccl_sched_add_mode mode,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class device_t,
-              class... Arguments>
-    thread_schedule_ptr submit_entry(size_t process_id,
-                                     size_t thread_id,
-                                     device_community<class_id>& device_topology,
-                                     device_t& device,
-                                     native::ccl_driver_context_ptr ctx,
-                                     Arguments&&... args) {
-        const topology_addr<group_id, class_id>& comm_data =
-            device->template get_comm_data<group_id, class_id>();
-
-        size_t device_group_size =
-            device_topology.template get_device_count<native::ccl_gpu_comm>() +
-            device_topology.template get_device_count<native::ccl_virtual_gpu_comm>() +
-            device_topology
-                .template get_device_count<native::ccl_ipc_source_gpu_comm<ccl_gpu_comm>>() +
-            device_topology
-                .template get_device_count<native::ccl_ipc_source_gpu_comm<ccl_virtual_gpu_comm>>();
-
-        LOG_DEBUG("Process id: ",
-                  process_id,
-                  ", Thread id: ",
-                  thread_id,
-                  " device_group_size: ",
-                  device_group_size,
-                  " comm data: ",
-                  comm_data.to_string());
-
-        //get thread local schedule
-        auto current_thread_schedule = current_schedule->get_gpu_sched(thread_id);
-        if (!current_thread_schedule) {
-            current_thread_schedule = current_schedule->create_gpu_sched(
-                thread_id, device_topology.get_device_storage(), comm_data.size);
-        }
-
-        // create entry
-        auto created_entry =
-            entry_factory::make_ordered_entry<EntryType, mode>(current_thread_schedule.get(),
-                                                               device,
-                                                               device_topology.get_device_storage(),
-                                                               ctx,
-                                                               std::forward<Arguments>(args)...);
-        LOG_DEBUG("do initial entry progress");
-        created_entry->start();
-        current_thread_schedule->set_fence(created_entry->get_fence()); //TODO temporary
-        current_thread_schedule->do_progress();
-
-        LOG_DEBUG("Device group filled for: ",
-                  current_thread_schedule->entries_count(),
-                  "/",
-                  device_group_size);
-        if (current_thread_schedule->entries_count() == device_group_size) {
-            LOG_DEBUG("Device group finalized");
-            current_schedule->create_gpu_sched(
-                thread_id, device_topology.get_device_storage(), comm_data.size);
-            ;
-            return current_thread_schedule;
-        }
-        //if sched is not ready - send NULL
-        return thread_schedule_ptr();
-    }
-
-    template <class EntryType,
-              ccl_sched_add_mode mode,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class device_t,
-              class... Arguments>
-    thread_schedule_ptr submit_entry_ipc(size_t process_id,
-                                         size_t thread_id,
-                                         device_community<class_id>& device_topology,
-                                         device_t& device,
-                                         native::ccl_driver_context_ptr ctx,
-                                         Arguments&&... args) {
-        const topology_addr<group_id, class_id>& comm_data =
-            device->template get_comm_data<group_id, class_id>();
-
-        size_t ipc_source_count =
-            device_topology
-                .template get_device_count<native::ccl_ipc_source_gpu_comm<ccl_gpu_comm>>() +
-            device_topology
-                .template get_device_count<native::ccl_ipc_source_gpu_comm<ccl_virtual_gpu_comm>>();
-
-        size_t device_group_size =
-            device_topology.template get_device_count<native::ccl_gpu_comm>() +
-            device_topology.template get_device_count<native::ccl_virtual_gpu_comm>() +
-            ipc_source_count;
-
-        LOG_DEBUG("Process id: ",
-                  process_id,
-                  ", Thread id: ",
-                  thread_id,
-                  " device_group_size: ",
-                  device_group_size,
-                  " comm data: ",
-                  comm_data.to_string());
-
-        //get thread local schedule
-        auto current_thread_schedule = current_schedule->get_gpu_sched(thread_id);
-        if (!current_thread_schedule) {
-            current_thread_schedule = current_schedule->create_gpu_sched(
-                thread_id, device_topology.get_device_storage(), comm_data.size);
-        }
-
-        return thread_schedule_ptr();
-
-        // create entry
-        /*auto created_entry =
-            entry_factory::make_ordered_entry<EntryType, mode>(current_thread_schedule.get(),
-                                                               device,
-                                                               device_topology.get_device_storage(),
-                                                               ctx,
-                                                               std::forward<Arguments>(args)...);
-
-        auto ipc_allgather_entry =
-            entry_factory::make_ordered_entry<l0_allgather_handles_entry<EntryType>,
-                                              ccl_sched_add_front>(current_thread_schedule.get(),
-                                                                   device,
-                                                                   ccl_communicator,
-                                                                   node_total_devices,
-                                                                   ctx,
-                                                                   created_entry->get_ipc_data());
-
-        LOG_DEBUG("do initial entry progress");
-        ipc_allgather_entry->start();
-        created_entry->start();
-        current_thread_schedule->set_fence(created_entry->get_fence()); //TODO temporary
-        current_thread_schedule->do_progress();
-
-        LOG_DEBUG("Device group filled for: ",
-                  current_thread_schedule->entries_count(),
-                  "/",
-                  device_group_size,
-                  ", ipc sources: ",
-                  ipc_source_count);
-        if (current_thread_schedule->entries_count() == device_group_size + 1) {
-            LOG_DEBUG("Device group finalized with IPC source count: ", ipc_source_count);
-            current_schedule->create_gpu_sched(
-                thread_id, device_topology.get_device_storage(), comm_data.size);
-            ;
-            return current_thread_schedule;
-        }
-        //if sched is not ready - send NULL
-        return thread_schedule_ptr();
-        auto req = submit_entry<EntryType, mode, group_id, class_id>(
-            process_id, thread_id, device_topology, device, ctx, std::forward<Arguments>(args)...);
-        return req;
-        */
-    }
-
-private:
-    std::shared_ptr<ccl::host_communicator> ccl_communicator;
-    // device_storage& node_total_devices;
-};
-
-} // namespace native
diff --git a/src/common/comm/l0/scheduler/device_group_scheduler.hpp b/src/common/comm/l0/scheduler/device_group_scheduler.hpp
deleted file mode 100644
index 3a46fd21a..000000000
--- a/src/common/comm/l0/scheduler/device_group_scheduler.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "sched/gpu_sched.hpp"
-// #include "sched/entry/l0/l0_allreduce_typed_entry.hpp"
-//#include "sched/entry/l0/l0_allgather_handles_entry.hpp"
-#include "sched/entry/factory/entry_factory.hpp"
-
-#include "common/comm/l0/device_community.hpp"
-namespace native {
-
-//template<ccl_coll_type entry_type>
-struct device_group_scheduler {
-    using schedule_ptr = std::unique_ptr<ccl_gpu_sched>;
-    //create schedule
-    /*
-     static constexpr ccl_coll_type type() noexcept
-    {
-        return entry_type;
-    }
-  */
-    template <class EntryType,
-              ccl_sched_add_mode mode,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class device_t,
-              class... Arguments>
-    schedule_ptr submit_entry(device_community<class_id>& device_topology,
-                              device_t& device,
-                              native::ccl_driver_context_ptr ctx,
-                              Arguments&&... args) {
-        //create schedule
-        size_t group_size =
-            device_topology.template get_device_count<native::ccl_gpu_comm>() +
-            device_topology.template get_device_count<native::ccl_virtual_gpu_comm>();
-
-        //make entry
-        if (!current_schedule) {
-            current_schedule.reset(
-                new ccl_gpu_sched(device_topology.get_device_storage(), group_size));
-        }
-
-        auto created_entry =
-            entry_factory::make_ordered_entry<EntryType, mode>(current_schedule.get(),
-                                                               device,
-                                                               device_topology.get_device_storage(),
-                                                               ctx,
-                                                               std::forward<Arguments>(args)...);
-        LOG_DEBUG("do initial progress");
-
-        created_entry->start();
-        current_schedule->set_fence(created_entry->get_fence()); //TODO temporary
-
-        //active_group_sched->add_entry(std::move(created_entry));
-        current_schedule->do_progress();
-
-        LOG_DEBUG("Device group filled for: ", current_schedule->entries_count(), "/", group_size);
-        if (current_schedule->entries_count() == group_size) {
-            LOG_DEBUG("Device group finalized");
-            schedule_ptr ret(new ccl_gpu_sched(device_topology.get_device_storage(), group_size));
-            ret.swap(current_schedule);
-
-            return ret;
-        }
-        //if sched is not ready - send NULL
-        return std::unique_ptr<ccl_gpu_sched>();
-    }
-
-private:
-    schedule_ptr current_schedule;
-};
-
-} // namespace native
diff --git a/src/common/comm/l0/scheduler/thread_group_scheduler.hpp b/src/common/comm/l0/scheduler/thread_group_scheduler.hpp
deleted file mode 100644
index 37c8b3870..000000000
--- a/src/common/comm/l0/scheduler/thread_group_scheduler.hpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/utils/spinlock.hpp"
-#include "sched/gpu_concurrent_sched.hpp"
-// #include "sched/entry/l0/l0_allreduce_typed_entry.hpp"
-// #include "sched/entry/l0/l0_allgatherv_typed_entry.hpp"
-// #include "sched/entry/l0/l0_alltoallv_typed_entry.hpp"
-// #include "sched/entry/l0/l0_bcast_typed_entry.hpp"
-// #include "sched/entry/l0/l0_reduce_typed_entry.hpp"
-// #include "sched/entry/l0/l0_reduce_scatter_typed_entry.hpp"
-// #include "sched/entry/l0/l0_allgatherv_typed_entry.hpp"
-//#include "sched/entry/l0/l0_allgather_handles_entry.hpp"
-#include "sched/entry/factory/entry_factory.hpp"
-#include "common/comm/l0/device_community.hpp"
-
-namespace native {
-
-//template<ccl_coll_type entry_type>
-struct thread_group_scheduler {
-    using schedule_ptr = std::unique_ptr<ccl_gpu_concurrent_sched>;
-    using thread_schedule_ptr = std::shared_ptr<ccl_gpu_sched>;
-    //create schedule
-    /*
-     static constexpr ccl_coll_type type() noexcept
-    {
-        return entry_type;
-    }
-  */
-    thread_group_scheduler(size_t threads_count) : thread_group_size(threads_count) {
-        //make concurrent chedule
-        if (!current_schedule) {
-            current_schedule = ccl_gpu_concurrent_sched::create(thread_group_size);
-        }
-    }
-
-    template <class EntryType,
-              ccl_sched_add_mode mode,
-              ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class device_t,
-              class... Arguments>
-    thread_schedule_ptr submit_entry(size_t thread_id,
-                                     device_community<class_id>& device_topology,
-                                     device_t& device,
-                                     native::ccl_driver_context_ptr ctx,
-                                     Arguments&&... args) {
-        const topology_addr<group_id, class_id>& comm_data =
-            device->template get_comm_data<group_id, class_id>();
-        size_t device_group_size =
-            device_topology.template get_device_count<native::ccl_gpu_comm>() +
-            device_topology.template get_device_count<native::ccl_virtual_gpu_comm>();
-
-        LOG_DEBUG("Thread id: ",
-                  thread_id,
-                  " device_group_size: ",
-                  device_group_size,
-                  " comm data: ",
-                  comm_data.to_string());
-        //get thread local schedule
-        auto current_thread_schedule = current_schedule->get_gpu_sched(thread_id);
-        if (!current_thread_schedule) {
-            current_thread_schedule = current_schedule->create_gpu_sched(
-                thread_id, device_topology.get_device_storage(), comm_data.size);
-        }
-
-        // create entry
-        auto created_entry =
-            entry_factory::make_ordered_entry<EntryType, mode>(current_thread_schedule.get(),
-                                                               device,
-                                                               device_topology.get_device_storage(),
-                                                               ctx,
-                                                               std::forward<Arguments>(args)...);
-        LOG_DEBUG("do initial entry progress");
-        created_entry->start();
-        current_thread_schedule->set_fence(created_entry->get_fence()); //TODO temporary
-        current_thread_schedule->do_progress();
-
-        LOG_DEBUG("Device group filled for: ",
-                  current_thread_schedule->entries_count(),
-                  "/",
-                  device_group_size);
-        if (current_thread_schedule->entries_count() == device_group_size) {
-            LOG_DEBUG("Device group finalized");
-            current_schedule->create_gpu_sched(
-                thread_id, device_topology.get_device_storage(), comm_data.size);
-            ;
-            return current_thread_schedule;
-        }
-        //if sched is not ready - send NULL
-        return thread_schedule_ptr();
-    }
-
-protected:
-    schedule_ptr current_schedule;
-    size_t thread_group_size;
-};
-
-} // namespace native
diff --git a/src/common/comm/l0/topology/cluster_device_utils.hpp b/src/common/comm/l0/topology/cluster_device_utils.hpp
deleted file mode 100644
index edb16fab5..000000000
--- a/src/common/comm/l0/topology/cluster_device_utils.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/topology/topology_construction_utils.hpp"
-namespace native {
-namespace detail {
-namespace cluster_utils {
-
-inline global_sorted_plain_graphs extract_full_node_plain_graphs(
-    std::ostream& out,
-    const ccl::cluster_device_indices_type& cluster_indices,
-    const std::string& hostname,
-    const detail::global_sorted_plain_graphs& cluster_graphs) {
-    detail::global_sorted_plain_graphs ret;
-
-    out << "Find host: " << hostname << " in cluster size: " << cluster_indices.size() << std::endl;
-    auto node_it = cluster_indices.find(hostname);
-    if (node_it == cluster_indices.end()) {
-        out << "Cannot find node with: " << hostname << std::endl;
-        return ret;
-    }
-
-    //iterate over all allied processes on the same host
-    const ccl::process_device_indices_type& processes = node_it->second;
-    out << "Find processes count: " << processes.size() << " on node: " << hostname << std::endl;
-    for (const auto& process_val : processes) {
-        auto process_id = process_val.first;
-        auto process_graph_list_it = cluster_graphs.find(process_id);
-        if (process_graph_list_it == cluster_graphs.end()) {
-            out << "There are cluster topology for process: " << process_id << std::endl;
-            std::stringstream ss;
-            ss << out.rdbuf();
-            throw std::runtime_error(std::string(__FUNCTION__) + " - log:\n" + ss.str());
-        }
-
-        // remember allied process and it topology
-        ret.insert({ process_id, process_graph_list_it->second });
-    }
-
-    return ret;
-}
-} // namespace cluster_utils
-} // namespace detail
-} // namespace native
diff --git a/src/common/comm/l0/topology/ring/cluster_group_device_creator.hpp b/src/common/comm/l0/topology/ring/cluster_group_device_creator.hpp
deleted file mode 100644
index bc9836e61..000000000
--- a/src/common/comm/l0/topology/ring/cluster_group_device_creator.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/topology/topology_construction_utils.hpp"
-
-namespace ccl {
-struct context_comm_addr;
-}
-
-namespace native {
-
-class cluster_group_device_creator {
-    size_t process_index;
-    size_t process_size;
-    process_group_context& context;
-    device_storage& devices_factory;
-
-public:
-    static constexpr ccl::group_split_type group_id() {
-        return ccl::group_split_type::cluster;
-    }
-
-    static constexpr const char* name() {
-        return "cluster_group_device_creator";
-    }
-
-    cluster_group_device_creator(size_t process_idx,
-                                 size_t process_nums,
-                                 process_group_context& ctx,
-                                 device_storage& devs);
-
-    static size_t default_property_p2p_rating_calculator(const ccl_device& lhs,
-                                                         const ccl_device& rhs);
-
-    static detail::adjacency_matrix build_p2p_capability_matrix(
-        std::ostream& out,
-        const ccl::process_device_indices_type& single_node_device_indices,
-        detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
-    bool build_all(std::ostream& out,
-                   const ccl::context_comm_addr& comm_addr,
-                   const ccl::process_device_indices_type& cur_process_per_thread_device_indices,
-                   const detail::adjacency_matrix& single_node_matrix,
-                   detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
-
-    template <ccl::device_topology_type class_id>
-    bool build_impl(
-        std::ostream& out,
-        const ccl::context_comm_addr& comm_addr,
-        const ccl::process_device_indices_type& cur_process_per_thread_device_indices,
-        const detail::adjacency_matrix& single_node_matrix,
-        const std::vector<std::vector<detail::colored_indexed_data<size_t>>>& syntetic_devices,
-        detail::colored_plain_graph_list& graph_list,
-        std::map<size_t, size_t> process_device_rank_offset,
-        size_t cluster_device_total_size,
-        detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
-};
-} // namespace native
diff --git a/src/common/comm/l0/topology/ring/cluster_group_device_creator_impl.hpp b/src/common/comm/l0/topology/ring/cluster_group_device_creator_impl.hpp
deleted file mode 100644
index c0257a52b..000000000
--- a/src/common/comm/l0/topology/ring/cluster_group_device_creator_impl.hpp
+++ /dev/null
@@ -1,781 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include "cluster_group_device_creator.hpp"
-#include "common/comm/l0/topology/ring/ring_construction_utils.hpp"
-#include "common/comm/l0/topology/ring/device_group_ring_creator.hpp"
-#include "native_device_api/compiler_ccl_wrappers_dispatcher.hpp"
-
-#include "common/comm/l0/topology/cluster_device_utils.hpp"
-
-namespace native {
-
-inline cluster_group_device_creator::cluster_group_device_creator(size_t process_idx,
-                                                                  size_t process_nums,
-                                                                  process_group_context& ctx,
-                                                                  device_storage& devs)
-        : process_index(process_idx),
-          process_size(process_nums),
-          context(ctx),
-          devices_factory(devs) {}
-
-inline size_t cluster_group_device_creator::default_property_p2p_rating_calculator(
-    const ccl_device& lhs,
-    const ccl_device& rhs) {
-    return detail::property_p2p_rating_calculator(lhs, rhs, PROCESS_GROUP_WEIGHT);
-}
-
-inline detail::adjacency_matrix cluster_group_device_creator::build_p2p_capability_matrix(
-    std::ostream& out,
-    const ccl::process_device_indices_type& single_node_device_indices,
-    detail::p2p_rating_function ping) {
-    // Build adjacency matrix with P2P capability:
-    // Rows & columnn is a device IDs ( froms 0 to CCL_GPU_DEVICES_AFFINITY_MASK_SIZE)
-    // element values - is a weight of P2P activity: 0 means - devices are not connected
-    // If values is not 0 - than two devies can be combined together
-
-    detail::adjacency_matrix ring_p2p_matrix;
-    if (single_node_device_indices.empty()) {
-        out << "No indices nothing to build" << std::endl;
-        return ring_p2p_matrix;
-    }
-
-    out << "Build adjacency matrix by: " << cluster_group_device_creator::name() << std::endl;
-    out << "Processes count: " << single_node_device_indices.size() << "\t";
-    out << "Delegate to thread group ring, consider 'process' as 'thread'" << std::endl;
-    return thread_group_ring_topology::build_p2p_capability_matrix(
-        out, single_node_device_indices, ping);
-}
-
-inline bool cluster_group_device_creator::build_all(
-    std::ostream& out,
-    const ccl::context_comm_addr& comm_addr,
-    const ccl::process_device_indices_type& cur_process_per_thread_device_indices,
-    const detail::adjacency_matrix& single_node_matrix,
-    detail::p2p_rating_function ping) {
-    out << "\n/************* \"" << cluster_group_device_creator::name()
-        << "\" for threads: " << context.process_device_topology.size() << "*************/\n"
-        << std::endl;
-
-    detail::plain_graph_list my_device_graphs = detail::graph_list_resolver(
-        single_node_matrix, cur_process_per_thread_device_indices, ping);
-
-    size_t size = my_device_graphs.size();
-    out << "Resolved graphs count: " << size << "\n";
-    if (!size) {
-        out << "Cannot build any ring" << std::endl;
-        return false;
-    }
-
-    out << "Transform graph to colored with process color: " << process_index << "\n";
-    detail::colored_plain_graph_list my_colored_graphs =
-        detail::create_colored(my_device_graphs, process_index);
-
-    out << "Process graphs:\n" << detail::to_string(my_colored_graphs) << std::endl;
-
-    detail::global_sorted_colored_plain_graphs global_graphs;
-    context.collect_cluster_colored_plain_graphs(my_colored_graphs, global_graphs);
-
-    //calculate my devicses offset (rank) from cluster devices
-    std::map<size_t, size_t> process_device_rank_offset;
-    size_t accumulated_offset = 0;
-    for (typename detail::global_sorted_colored_plain_graphs::value_type& process_graphs :
-         global_graphs) {
-        size_t process_num = process_graphs.first;
-        const detail::colored_plain_graph_list& proc_graphs = process_graphs.second;
-
-        process_device_rank_offset[process_num] = accumulated_offset; //offset for iter process
-        out << "Process idx: " << process_num << ", rank_offset: " << accumulated_offset
-            << std::endl;
-        for (const detail::colored_plain_graph& graph : proc_graphs) {
-            accumulated_offset += graph.size();
-        }
-    }
-    out << "Cluster device size: " << accumulated_offset << std::endl;
-
-    //check cluster topology on symmetric nodes configurations
-    bool symm_test = true;
-
-    // TODO consider situation, when we have symmetric process configuration
-    // But unsymmetric IPC devices count
-
-    std::vector<size_t> ipc_devices_on_node;
-    std::vector<size_t> processes_on_node;
-    ipc_devices_on_node.reserve(context.cluster_gpu_indices.size());
-    processes_on_node.reserve(context.cluster_gpu_indices.size());
-
-    ccl::device_indices_type shared_ipc_devices_candidates;
-    ccl::device_indices_type ipc_p2p_devices_candidates;
-    for (const auto& node_conf : context.cluster_gpu_indices) {
-        const ccl::host_id& hostname = node_conf.first;
-        const ccl::process_device_indices_type& processes = node_conf.second;
-
-        ccl::device_indices_type node_device_intersection; //shared devics
-
-        // each node should have the same processes count
-        if (!processes_on_node.empty()) {
-            symm_test &= (*processes_on_node.rbegin() == processes.size());
-
-            node_device_intersection = shared_ipc_devices_candidates;
-        }
-        else {
-            node_device_intersection = processes.begin()->second;
-        }
-        processes_on_node.push_back(processes.size());
-
-        //find shared devices for processes on node.
-        for (auto it = processes.begin(); it != processes.end() && symm_test; ++it) {
-            ccl::device_indices_type result_intersection;
-            std::set_intersection(it->second.begin(),
-                                  it->second.end(),
-                                  node_device_intersection.begin(),
-                                  node_device_intersection.end(),
-                                  std::inserter(result_intersection, result_intersection.end()));
-
-            symm_test &= result_intersection.size();
-
-            node_device_intersection.swap(result_intersection);
-        }
-
-        if (hostname == context.get_host_id() && symm_test) {
-            // remember ipc device candidates for my node
-            shared_ipc_devices_candidates = node_device_intersection;
-        }
-        //TODO - make smart logic: access each device to each for processes
-        // because not necesary to have shared device id for both processes
-
-        //common devices for processes on node
-        ipc_devices_on_node.push_back(node_device_intersection.size());
-    }
-
-    out << "Cluster Symmetric Capability:\n";
-    out << "\nNodes in cluster:\t" << context.cluster_gpu_indices.size();
-    out << "\nProcs on nodes:\t";
-    std::copy(processes_on_node.begin(),
-              processes_on_node.end(),
-              std::ostream_iterator<size_t>(out, ","));
-    out << "\nIPCs devices on nodes:\t";
-    std::copy(ipc_devices_on_node.begin(),
-              ipc_devices_on_node.end(),
-              std::ostream_iterator<size_t>(out, ","));
-    out << std::endl;
-
-    // additional device types to inject in a final topology
-    using thread_idx_t = size_t;
-    using colored_device_per_thread = detail::colored_indexed_data<thread_idx_t>;
-
-    std::vector<colored_device_per_thread> shared_ipc_devices;
-    size_t shared_ipc_links_per_proc = 0;
-    std::vector<colored_device_per_thread> scale_up_devices;
-    size_t scale_up_links_per_proc = 0;
-    std::vector<colored_device_per_thread> scale_out_devices;
-
-    //TODO only single thread supported - thread 0
-    thread_idx_t thread_index = 0;
-
-    // calculate scale-out links: by default each process use scale-out
-    size_t scale_out_links_per_proc = process_size;
-
-    // choose last device in my graph for scale-out ( why not?)
-    scale_out_devices.emplace_back(0 /*use default color as host communicator ( all-to-all)*/,
-                                   (my_colored_graphs.begin()->begin()->index),
-                                   thread_index);
-
-    // check topology optimization
-    if (symm_test && not shared_ipc_devices_candidates.empty()) {
-        out << "Symmetric Configuration Detected: ICP for scale-up" << std::endl;
-
-        //TODO schoose the first one
-        shared_ipc_devices.emplace_back(shared_ipc_devices_candidates.size() /*color*/,
-                                        *shared_ipc_devices_candidates.begin() /*device*/,
-                                        thread_index /*thread to insertion*/);
-
-        shared_ipc_links_per_proc = *processes_on_node.begin();
-        scale_out_links_per_proc = process_size / shared_ipc_links_per_proc;
-
-        if (scale_out_links_per_proc == 0 or scale_out_links_per_proc == 1) {
-            scale_out_links_per_proc = 0;
-            scale_out_devices.clear(); //no links, no devices
-        }
-        else {
-            // scale-out links exist, then recalcuate comm color
-            size_t scale_out_color = process_size % shared_ipc_links_per_proc;
-            std::for_each(scale_out_devices.begin(),
-                          scale_out_devices.end(),
-                          [scale_out_color](colored_device_per_thread& idx) {
-                              idx.color = scale_out_color;
-                          });
-        }
-    }
-    else {
-        out << "Unsymmetric IPC Configuration Detected" << std::endl;
-        size_t procs_on_node = *processes_on_node.begin();
-        bool process_symmetric_test =
-            (procs_on_node != 1); //nothing to scale-up for 1 process on node
-        process_symmetric_test &= std::all_of(
-            processes_on_node.begin(), processes_on_node.end(), [procs_on_node](size_t val) {
-                return procs_on_node == val;
-            });
-        if (process_symmetric_test) {
-            out << "Symmetric scale-up Configuration Detected. Build scale-up devices" << std::endl;
-
-            //TODO  assign first device in my graph for scale-up
-            size_t scale_up_color = std::hash<std::string>{}(context.get_host_id());
-            scale_up_devices.emplace_back(
-                scale_up_color, my_colored_graphs.begin()->begin()->index, thread_index);
-
-            scale_up_links_per_proc = procs_on_node;
-            scale_out_links_per_proc = process_size / scale_up_links_per_proc;
-
-            if (scale_out_links_per_proc == 0 or scale_out_links_per_proc == 1) {
-                scale_out_links_per_proc = 0;
-                scale_out_devices.clear(); //no links, no devices
-            }
-            else {
-                // change scale-out color to separate it from scale-up processes to use different communicator
-                size_t scale_out_color = process_size % scale_up_links_per_proc;
-                std::for_each(scale_out_devices.begin(),
-                              scale_out_devices.end(),
-                              [scale_out_color](colored_device_per_thread& idx) {
-                                  idx.color = scale_out_color;
-                              });
-
-                if (scale_out_color == scale_up_color) {
-                    //TODO
-                    out << "UNHANDLED CASE: scale-up & scale-out comm color is the same!\n"
-                        << "Reassign one of them: plus '1', re hash and broad-cast it!"
-                        << std::endl;
-                    abort();
-                }
-            }
-        }
-        else {
-            out << "Each nodes contains different processes. "
-                << "No optimization, use scale-out for all" << std::endl;
-
-            scale_out_links_per_proc = process_size;
-        }
-    }
-
-    out << "Final configuration info:\n";
-    out << "SHARED IPC: ";
-    for (const auto& idx : shared_ipc_devices) {
-        out << idx << ", ";
-    }
-    out << "\nshared ipc comm count: " << shared_ipc_links_per_proc;
-
-    out << "\nScaleUp: ";
-    for (const auto& idx : scale_up_devices) {
-        out << idx << ", ";
-    }
-    out << "\nscale-up comm count: " << scale_up_links_per_proc;
-
-    out << "\nScaleOut: ";
-    for (const auto& idx : scale_out_devices) {
-        out << idx << ", ";
-    }
-    out << "\nscale-out comm count: " << scale_out_links_per_proc << std::endl;
-
-    // enumerate as thread_group_devices, with syntetic device types injection
-    return build_impl<ccl::device_topology_type::ring>(
-        out,
-        comm_addr,
-        cur_process_per_thread_device_indices,
-        single_node_matrix,
-        { shared_ipc_devices, scale_up_devices, scale_out_devices },
-        my_colored_graphs,
-        process_device_rank_offset,
-        accumulated_offset,
-        ping);
-}
-
-template <ccl::device_topology_type class_id>
-inline bool cluster_group_device_creator::build_impl(
-    std::ostream& out,
-    const ccl::context_comm_addr& comm_addr,
-    const ccl::process_device_indices_type& cur_process_per_thread_device_indices,
-    const detail::adjacency_matrix& single_node_matrix,
-    const std::vector<std::vector<detail::colored_indexed_data<size_t>>>& syntetic_devices,
-    detail::colored_plain_graph_list& graph_list,
-    std::map<size_t, size_t> process_device_rank_offset,
-    size_t cluster_device_total_size,
-    detail::p2p_rating_function ping /* = default_property_p2p_rating_calculator*/) {
-    size_t ring_index = 0;
-    out << "Start building topology: " << ::to_string(class_id)
-        << ", for graphs: " << graph_list.size() << "\n"
-        << "ring index: " << ring_index << std::endl;
-    out << detail::to_string(graph_list);
-
-    auto& ctx_per_thread_data = context.process_device_topology;
-    (void)ctx_per_thread_data;
-
-    out << "\nStart indexer:" << std::endl;
-    size_t accumulated_index_offset_for_graph = 0;
-    size_t graph_num = 0;
-    std::map<size_t /*graph_num*/, size_t /*offset*/> index_offset_for_graphs;
-    auto offset_it = process_device_rank_offset.find(process_index);
-    if (offset_it == process_device_rank_offset.end()) {
-        assert(false && "");
-    }
-
-    accumulated_index_offset_for_graph = offset_it->second;
-    out << "My global rank offset: " << accumulated_index_offset_for_graph << std::endl;
-
-    std::set<ccl::device_index_type> created_cpu_context_indices;
-
-    // let's start numa-connector devices search & creation/
-    for (const auto& id_ring : graph_list) {
-        // todo
-        if (graph_list.size() == 1) {
-            // no NUMA in this case
-            break;
-        }
-        for (const auto& per_thread : cur_process_per_thread_device_indices) {
-            size_t thread_id = per_thread.first;
-            std::shared_ptr<specific_plain_device_storage> non_indexed_plain_devices =
-                devices_factory.thread_gpu_comms.find(thread_id)->second;
-            // create device comm wrappers and upgrade last devices in list up to numa type
-            detail::color_t process;
-            (void)process;
-            ccl::device_index_type last_in_graph_index;
-            const auto& tmp = *id_ring.rbegin();
-            process = tmp.color;
-            last_in_graph_index = tmp.index;
-            if (per_thread.second.find(last_in_graph_index) != per_thread.second.end()) {
-                CCL_ASSERT(process == process_index);
-                out << "thread: " << thread_id
-                    << " wants to create numa-proxy device by idx: " << last_in_graph_index
-                    << std::endl;
-                if (created_cpu_context_indices.find(last_in_graph_index) !=
-                    created_cpu_context_indices.end()) {
-                    out << "skip existing numa-proxy device candidate by: " << last_in_graph_index
-                        << std::endl;
-                    continue;
-                }
-
-                size_t inserted_device_type_index = detail::role_mod::inject_numa_device<
-                    group_id(),
-                    class_id,
-                    process_group_context,
-                    ccl_virtual_gpu_comm, /* `virtual` is better candiate*/
-                    ccl_gpu_comm>(
-                    *non_indexed_plain_devices, last_in_graph_index, context, devices_factory);
-                if (inserted_device_type_index == std::numeric_limits<size_t>::max()) {
-                    assert(false && "Unsupported device type in topology creation");
-                    std::ostringstream ss;
-                    ss << out.rdbuf();
-                    throw std::runtime_error(
-                        std::string("Unsupported device type in topology creation. Log:\n") +
-                        ss.str());
-                }
-                out << "Inject numa device by order: " << inserted_device_type_index
-                    << "\nby idx: " << last_in_graph_index << std::endl;
-                created_cpu_context_indices.insert(last_in_graph_index);
-            }
-        }
-    }
-
-    // id_ring - inter-thread ring
-    out << "\nStart indexer:" << std::endl;
-    auto topology_comm_addr = comm_addr;
-    topology_comm_addr.comm_size = cluster_device_total_size;
-    for (auto& id_ring : graph_list) {
-        size_t index_offset = accumulated_index_offset_for_graph;
-
-        for (auto per_thread_it = ctx_per_thread_data.begin();
-             per_thread_it != ctx_per_thread_data.end();
-             ++per_thread_it) {
-            size_t thread_id = per_thread_it->first; //first
-
-            // prepare ropology
-            std::shared_ptr<device_community<class_id>> out_indexed_devices;
-            if (graph_list.size() == 1) {
-                if (context.get_process_topology<class_id>(process_index, thread_id)
-                        .closed_rings.empty()) {
-                    context.get_process_topology<class_id>(process_index, thread_id)
-                        .set_topology(
-                            std::make_shared<device_community<class_id>>(topology_comm_addr));
-                }
-
-                out_indexed_devices =
-                    context.get_process_topology<class_id>(process_index, thread_id)
-                        .get_topology(ring_index);
-            }
-            else {
-                if (context.get_process_topology<class_id>(process_index, thread_id)
-                        .torn_apart_rings.empty()) {
-                    context.get_process_topology<class_id>(process_index, thread_id)
-                        .set_additiona_topology(
-                            std::make_shared<device_community<class_id>>(topology_comm_addr));
-                }
-
-                out_indexed_devices =
-                    context.get_process_topology<class_id>(process_index, thread_id)
-                        .get_additiona_topology(ring_index);
-            }
-
-            out << "\nStart indexer for graph num: " << graph_num << ", thread: " << thread_id
-                << std::endl;
-            std::shared_ptr<specific_plain_device_storage> non_indexed_plain_devices =
-                devices_factory.thread_gpu_comms.find(thread_id)->second;
-
-            // use graph ids to enumerate thread plain list `thread_gpu_comms` into `out_indexed_devices`
-            auto rank_builder =
-                create_device_functor<detail::colored_graph_ring_indexer<group_id(), class_id>>(
-                    id_ring,
-                    thread_id,
-                    process_index,
-                    out_indexed_devices->get_device_storage(),
-                    0,
-                    0,
-                    index_offset);
-
-            ccl_tuple_for_each(*non_indexed_plain_devices, rank_builder);
-
-            // print partial topology enumeration for 'graph' from 'graph_list'
-            detail::printer<group_id(), class_id> p;
-            ccl_tuple_for_each(out_indexed_devices->get_device_storage(), p);
-            out << "Indexer result for devices in thread idx (" << thread_id << "/"
-                << ctx_per_thread_data.size() << "):\n"
-                << p.to_string() << std::endl;
-
-            // remember enumerated (marked) devices fro current thread & current graph
-            // to continue right enumeration order for other graphs & threas
-            accumulated_index_offset_for_graph +=
-                rank_builder.get_functor().get_marked_indices_count();
-            out << "\nIndexer for graph num: " << graph_num << ", finished. marked_indices: "
-                << rank_builder.get_functor().get_marked_indices_count()
-                << ", next rank index: " << accumulated_index_offset_for_graph << "\n";
-        }
-        index_offset_for_graphs[graph_num] = index_offset;
-        graph_num++;
-    }
-
-    out << "\nStart devices builder for graphs count: " << graph_list.size() << std::endl;
-    graph_num = 0;
-    for (const auto& id_ring : graph_list) {
-        out << "\nStart ring builder for graph num: " << graph_num << std::endl;
-        for (size_t current_thread_idx = 0; current_thread_idx < ctx_per_thread_data.size();
-             current_thread_idx++) {
-            std::shared_ptr<device_community<class_id>> community;
-            if (graph_list.size() == 1) {
-                community =
-                    context.get_process_topology<class_id>(process_index, current_thread_idx)
-                        .get_topology(ring_index);
-            }
-            else {
-                community =
-                    context.get_process_topology<class_id>(process_index, current_thread_idx)
-                        .get_additiona_topology(ring_index);
-            }
-
-            auto& indexed_devices_for_current_thread = community->get_device_storage();
-
-            //find max device rank in current thread devices
-            const auto& curr_real =
-                detail::get_device_with_min_rank<ccl_gpu_comm, group_id(), class_id>(
-                    indexed_devices_for_current_thread, id_ring);
-            const auto& curr_virt =
-                detail::get_device_with_min_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
-                    indexed_devices_for_current_thread, id_ring);
-            const auto& curr_scale_real = detail::
-                get_device_with_min_rank<ccl_numa_proxy<ccl_gpu_comm>, group_id(), class_id>(
-                    indexed_devices_for_current_thread, id_ring);
-            const auto& curr_scale_virt =
-                detail::get_device_with_min_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>,
-                                                 group_id(),
-                                                 class_id>(indexed_devices_for_current_thread,
-                                                           id_ring);
-
-            size_t tg_max_rank = std::max({ std::get<0>(curr_real),
-                                            std::get<0>(curr_virt),
-                                            std::get<0>(curr_scale_real),
-                                            std::get<0>(curr_scale_virt) });
-            // find thread, which will connect to current thread max rank with next_rank
-            size_t next_rank = (tg_max_rank + 1) % id_ring.size();
-
-            out << "Current thread: " << current_thread_idx
-                << ", max rank candidates: " << std::get<0>(curr_real) << ", "
-                << std::get<0>(curr_virt) << ", " << std::get<0>(curr_scale_real) << ", "
-                << std::get<0>(curr_scale_virt) << ", selected max rank: " << tg_max_rank
-                << ", expected next_rank: " << next_rank << std::endl;
-
-            for (size_t next_thread_id = 0; next_thread_id < ctx_per_thread_data.size();
-                 next_thread_id++) {
-                if (next_thread_id == current_thread_idx) {
-                    // wrong thread, get next
-                    continue;
-                }
-
-                // search next_rank in that thread
-                std::shared_ptr<device_community<class_id>> community;
-                if (graph_list.size() == 1) {
-                    community =
-                        context.get_process_topology<class_id>(process_index, next_thread_id)
-                            .get_topology(ring_index);
-                }
-                else {
-                    community =
-                        context.get_process_topology<class_id>(process_index, next_thread_id)
-                            .get_additiona_topology(ring_index);
-                }
-                auto& next_thread_ring_topology = community->get_device_storage();
-
-                const auto& real =
-                    detail::get_device_with_max_rank<ccl_gpu_comm, group_id(), class_id>(
-                        next_thread_ring_topology, id_ring);
-                const auto& virt =
-                    detail::get_device_with_max_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
-                        next_thread_ring_topology, id_ring);
-                const auto& scale_real =
-                    detail::get_device_with_max_rank<ccl_numa_proxy<ccl_gpu_comm>,
-                                                     group_id(),
-                                                     class_id>(next_thread_ring_topology, id_ring);
-                const auto& scale_virt =
-                    detail::get_device_with_max_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>,
-                                                     group_id(),
-                                                     class_id>(next_thread_ring_topology, id_ring);
-                if (next_rank != std::min({ std::get<0>(real),
-                                            std::get<0>(virt),
-                                            std::get<0>(scale_real),
-                                            std::get<0>(scale_virt) })) {
-                    // wrong thread, get next
-                    continue;
-                }
-
-                out << "next thread: " << next_thread_id
-                    << ", min rank candidates: " << std::get<0>(real) << ", " << std::get<0>(virt)
-                    << ", " << std::get<0>(scale_real) << ", " << std::get<0>(scale_virt)
-                    << std::endl;
-
-                out << "Lock ring for threads (" << current_thread_idx << " <-> " << next_thread_id
-                    << ")" << std::endl;
-                if (next_rank == std::get<0>(real)) {
-                    auto locker =
-                        detail::add_concurrent_locker_device<ccl_gpu_comm, group_id(), class_id>(
-                            next_rank,
-                            index_offset_for_graphs[graph_num],
-                            real,
-                            devices_factory,
-                            indexed_devices_for_current_thread);
-                    out << "Added real locker by index: "
-                        << index_offset_for_graphs[graph_num] + next_rank
-                        << ", for thread idx: " << current_thread_idx << ":\n"
-                        << locker->to_string() << std::endl;
-                }
-                else if (next_rank == std::get<0>(virt)) {
-                    auto locker = detail::
-                        add_concurrent_locker_device<ccl_virtual_gpu_comm, group_id(), class_id>(
-                            next_rank,
-                            index_offset_for_graphs[graph_num],
-                            virt,
-                            devices_factory,
-                            indexed_devices_for_current_thread);
-                    out << "Added virtual locker by index: "
-                        << index_offset_for_graphs[graph_num] + next_rank
-                        << ", for thread idx: " << current_thread_idx << ":\n"
-                        << locker->to_string() << std::endl;
-                }
-                else if (next_rank == std::get<0>(scale_real)) {
-                    out << "No need to add concurrent proxy for next thread: " << next_thread_id
-                        << " for scaleup  real proxy in current thread: " << current_thread_idx
-                        << std::endl;
-                }
-                else if (next_rank == std::get<0>(scale_virt)) {
-                    out << "No need to add concurrent proxy for next thread: " << next_thread_id
-                        << " for scaleup virtual proxy in current thread: " << current_thread_idx
-                        << std::endl;
-                }
-            }
-        }
-        graph_num++;
-    }
-
-    out << "\nStart gpu types injection for graph list count: " << graph_list.size() << std::endl;
-    size_t syntetic_device_type_index = 0;
-    for (auto& colored_devices : syntetic_devices) {
-        switch (syntetic_device_type_index) {
-            case 0: //IPC device
-            {
-                for (const auto& idx : colored_devices) {
-                    size_t thread_id = idx.get_payload();
-
-                    std::shared_ptr<device_community<class_id>> community;
-                    if (graph_list.size() == 1) {
-                        community = context.get_process_topology<class_id>(process_index, thread_id)
-                                        .get_topology(ring_index);
-                    }
-                    else {
-                        community = context.get_process_topology<class_id>(process_index, thread_id)
-                                        .get_additiona_topology(ring_index);
-                    }
-
-                    auto& out_indexed_devices = community->get_device_storage();
-
-                    size_t inserted_device_type_index =
-                        detail::role_mod::inject_ipc_src_device<group_id(),
-                                                       class_id,
-                                                       process_group_context,
-                                                       ccl_gpu_comm,
-                                                       ccl_virtual_gpu_comm
-                                                       /*,
-                                                        Too complex to support such topology without generic topology builder
-                                                       ccl_numa_proxy<ccl_gpu_comm>,
-                                                       ccl_numa_proxy<ccl_virtual_gpu_comm>
-                                                       */>(
-                            out_indexed_devices, idx.index, context, devices_factory);
-                    if (inserted_device_type_index != std::numeric_limits<size_t>::max()) {
-                        out << "Inject IPC_src device by order: " << inserted_device_type_index
-                            << "\nby idx: " << idx.to_string() << std::endl;
-                    }
-                    else {
-                        abort();
-                        assert(false && "Unsupported device type in topology creation");
-                        std::ostringstream ss;
-                        ss << out.rdbuf();
-                        throw std::runtime_error(
-                            std::string("Unsupported device type in topology creation. Log:\n") +
-                            ss.str());
-                    }
-                }
-                syntetic_device_type_index++;
-                break;
-            }
-            case 1: //scale-up device
-            {
-                for (const auto& idx : colored_devices) {
-                    size_t thread_id = idx.get_payload();
-
-                    std::shared_ptr<device_community<class_id>> community;
-                    if (graph_list.size() == 1) {
-                        community = context.get_process_topology<class_id>(process_index, thread_id)
-                                        .get_topology(ring_index);
-                    }
-                    else {
-                        community = context.get_process_topology<class_id>(process_index, thread_id)
-                                        .get_additiona_topology(ring_index);
-                    }
-
-                    auto& out_indexed_devices = community->get_device_storage();
-
-                    size_t inserted_device_type_index = detail::role_mod::inject_scaleup_device<
-                        group_id(),
-                        class_id,
-                        process_group_context,
-                        ccl_gpu_comm,
-                        ccl_virtual_gpu_comm,
-                        ccl_numa_proxy<ccl_gpu_comm>,
-                        ccl_numa_proxy<ccl_virtual_gpu_comm>>(
-                        out_indexed_devices, idx.index, context, devices_factory);
-                    if (inserted_device_type_index != std::numeric_limits<size_t>::max()) {
-                        out << "Inject scaleUp device by order: " << inserted_device_type_index
-                            << "\nby idx: " << idx.to_string() << std::endl;
-                    }
-                    else {
-                        abort();
-                        assert(false && "Unsupported device type in topology creation");
-                        std::ostringstream ss;
-                        ss << out.rdbuf();
-                        throw std::runtime_error(
-                            std::string("Unsupported device type in topology creation. Log:\n") +
-                            ss.str());
-                    }
-                }
-                syntetic_device_type_index++;
-                break;
-            }
-            case 2: //scale-out device
-            {
-                for (const auto& idx : colored_devices) {
-                    size_t thread_id = idx.get_payload();
-                    std::shared_ptr<device_community<class_id>> community;
-                    if (graph_list.size() == 1) {
-                        community = context.get_process_topology<class_id>(process_index, thread_id)
-                                        .get_topology(ring_index);
-                    }
-                    else {
-                        community = context.get_process_topology<class_id>(process_index, thread_id)
-                                        .get_additiona_topology(ring_index);
-                    }
-
-                    auto& out_indexed_devices = community->get_device_storage();
-
-                    size_t inserted_device_type_index = detail::role_mod::inject_scaleout_device<
-                        group_id(),
-                        class_id,
-                        process_group_context,
-                        ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_gpu_comm>>,
-                        ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_virtual_gpu_comm>>,
-                        ccl_gpu_scaleup_proxy<ccl_gpu_comm>,
-                        ccl_gpu_scaleup_proxy<ccl_virtual_gpu_comm>,
-                        ccl_numa_proxy<ccl_gpu_comm>,
-                        ccl_numa_proxy<ccl_virtual_gpu_comm>,
-                        ccl_gpu_comm,
-                        ccl_virtual_gpu_comm>(
-                        out_indexed_devices, idx.index, context, devices_factory);
-                    if (inserted_device_type_index != std::numeric_limits<size_t>::max()) {
-                        out << "Inject scaleout device by order: " << inserted_device_type_index
-                            << "\nby idx: " << idx.to_string() << std::endl;
-                    }
-                    else {
-                        abort();
-                        assert(false && "Unsupported device type in topology creation");
-                        std::ostringstream ss;
-                        ss << out.rdbuf();
-                        throw std::runtime_error(
-                            std::string("Unsupported device type in topology creation. Log:\n") +
-                            ss.str());
-                    }
-                }
-                syntetic_device_type_index++;
-                break;
-            }
-            default:
-                throw std::runtime_error(std::string(__PRETTY_FUNCTION__) +
-                                         "Unexpected injected device index: " +
-                                         std::to_string(syntetic_device_type_index));
-        }
-    }
-
-    out << "\nFinished building topology: " << ::to_string(class_id) << std::endl;
-    for (auto per_thread_it = ctx_per_thread_data.begin();
-         per_thread_it != ctx_per_thread_data.end();
-         ++per_thread_it) {
-        size_t thread_id = per_thread_it->first;
-
-        detail::printer<group_id(), class_id> p;
-
-        std::shared_ptr<device_community<class_id>> community;
-        if (graph_list.size() == 1) {
-            community = context.get_process_topology<class_id>(process_index, thread_id)
-                            .get_topology(ring_index);
-        }
-        else {
-            community = context.get_process_topology<class_id>(process_index, thread_id)
-                            .get_additiona_topology(ring_index);
-        }
-
-        auto& out_indexed_devices = community->get_device_storage();
-
-        ccl_tuple_for_each(out_indexed_devices, p);
-        out << "\nFinal topology thread: " << thread_id << "\n" << p.to_string();
-    }
-
-    return true;
-}
-
-} // namespace native
diff --git a/src/common/comm/l0/topology/ring/device_group_ring_creator.cpp b/src/common/comm/l0/topology/ring/device_group_ring_creator.cpp
deleted file mode 100644
index eb17683e8..000000000
--- a/src/common/comm/l0/topology/ring/device_group_ring_creator.cpp
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "common/comm/l0/topology/ring/ring_construction_utils.hpp"
-#include "common/comm/l0/topology/ring/device_group_ring_creator.hpp"
-
-namespace native {
-
-device_group_ring_topology::device_group_ring_topology(device_group_context& comm,
-                                                       device_storage& devs)
-        : context(comm),
-          devices_factory(devs) {}
-
-size_t device_group_ring_topology::default_property_p2p_rating_calculator(const ccl_device& lhs,
-                                                                          const ccl_device& rhs) {
-    return detail::property_p2p_rating_calculator(lhs, rhs, DEVICE_GROUP_WEIGHT);
-}
-
-detail::adjacency_matrix device_group_ring_topology::build_p2p_capability_matrix(
-    std::ostream& out,
-    const ccl::device_indices_type& group_device_indices,
-    detail::p2p_rating_function ping) {
-    // Build adjacency matrix between devices using `ping` function:
-    // Default ping function is checking P2P access capabilities in a way:
-    // 1) Rows & columnn is a device IDs ( froms 0 to CCL_GPU_DEVICES_AFFINITY_MASK_SIZE)
-    // 2) Matrix element values - is P2P access score: 0 means - devices are not connected
-    // If values is not 0 - than two devies can access either together
-
-    out << "Build adjacency matrix by: " << device_group_ring_topology::name()
-        << " - group indices count: " << group_device_indices.size() << std::endl;
-
-    //Request for alldevices in all allied processes on the node
-    return get_platform().calculate_device_access_metric(group_device_indices, ping);
-}
-
-detail::adjacency_matrix device_group_ring_topology::build_p2p_capability_matrix(
-    std::ostream& out,
-    const ccl::device_mask_t& group_device_masks,
-    detail::p2p_rating_function ping) {
-    // Build adjacency matrix between devices using `ping` function:
-    // Default ping function is checking P2P access capabilities in a way:
-    // 1) Rows & columnn is a device IDs ( froms 0 to CCL_GPU_DEVICES_AFFINITY_MASK_SIZE)
-    // 2) Matrix element values - is P2P access score: 0 means - devices are not connected
-    // If values is not 0 - than two devies can access either together
-
-    out << "Group mask mask: " << group_device_masks << std::endl;
-    return build_p2p_capability_matrix(
-        out, native::ccl_device_driver::get_device_indices(group_device_masks), ping);
-}
-
-bool device_group_ring_topology::build(std::ostream& out,
-                                       const ccl::context_comm_addr& comm_addr,
-                                       const ccl::device_indices_type& group_device_indices,
-                                       const detail::adjacency_matrix& matrix) {
-    out << "\n/*************\"" << device_group_ring_topology::name() << "\"*************/\n"
-        << std::endl;
-
-    out << "Resolve device graph" << std::endl;
-    detail::plain_graph_list id_rings = graph_list_resolver(matrix, group_device_indices);
-
-    size_t size = id_rings.size();
-    out << "Resolved graphs count: " << size << "\n";
-    if (!size) {
-        out << "Cannot build any ring" << std::endl;
-        return false;
-    }
-    else if (id_rings.size() == 1) // whole ring, each device is accessible, no CPU copy here
-    {
-        return build_specific(out, comm_addr, group_device_indices, *id_rings.begin(), matrix);
-    }
-
-    /* torn-apart ring:
-     * there are inaccessible devices in group - need to insert broadcast devices wrappers for
-     * CPU RAM copying
-     */
-    return build_scale_up_specific(out, comm_addr, group_device_indices, id_rings, matrix);
-}
-
-bool device_group_ring_topology::build(std::ostream& out,
-                                       const ccl::context_comm_addr& comm_addr,
-                                       const ccl::device_mask_t& group_device_masks,
-                                       const detail::adjacency_matrix& matrix) {
-    return build(
-        out, comm_addr, native::ccl_device_driver::get_device_indices(group_device_masks), matrix);
-}
-
-template <ccl::device_topology_type class_id>
-bool device_group_ring_topology::build_specific_topology(
-    std::ostream& out,
-    const ccl::context_comm_addr& comm_addr,
-    const ccl::device_indices_type& group_device_indices,
-    const detail::plain_graph& graph) {
-    out << "Start building topology: " << ::to_string(class_id) << ", for graph:\n";
-    out << detail::to_string(graph);
-
-    size_t thread_id = comm_addr.thread_idx;
-    auto topology_comm_addr = comm_addr;
-    topology_comm_addr.comm_size = graph.size();
-    auto device_topology = std::make_shared<device_community<class_id>>(topology_comm_addr);
-
-    out << "\nStart indexer for thread: " << thread_id << std::endl;
-    detail::id_thread_table assigned_ids;
-    std::vector<detail::marked_idx> marked_id_ring = detail::create_marked(graph);
-    auto rank_builder = create_device_functor<detail::graph_ring_indexer<group_id(), class_id>>(
-        marked_id_ring, assigned_ids, thread_id, device_topology->get_device_storage());
-    std::shared_ptr<specific_plain_device_storage> group_gpu_comms =
-        devices_factory.create_devices_by_indices(thread_id, group_device_indices);
-
-    ccl_tuple_for_each(*group_gpu_comms, rank_builder);
-
-    detail::printer<group_id(), class_id> p;
-    ccl_tuple_for_each(*group_gpu_comms, p);
-    out << "Indexer result: \n" << p.to_string();
-
-    out << "\nFinished building topology: " << ::to_string(class_id) << std::endl;
-
-    //remember
-    context.device_topology.get_community<class_id>().set_topology(device_topology);
-    return true;
-}
-
-bool device_group_ring_topology::build_specific(
-    std::ostream& out,
-    const ccl::context_comm_addr& comm_addr,
-    const ccl::device_indices_type& group_device_indices,
-    const detail::plain_graph& graph,
-    const detail::adjacency_matrix& matrix) {
-    bool result = build_specific_topology<ccl::device_topology_type::ring>(
-        out, comm_addr, group_device_indices, graph);
-    /*
-    // check a2a possibility
-    bool a2a_capable = detail::check_graph_a2a_capable(graph, matrix,out);
-    if (a2a_capable)
-    {
-        // a2a should starts from real device
-        // if do not reset, than it continue creation from existing ring devices
-        devices_factory.reset(comm_addr.thread_idx);      <--- AVOID because affects thread_group
-
-        a2a_capable =
-            build_specific_topology<ccl::group_split_type::a2a_device_group>(out,
-                                                                                 comm_addr,
-                                                                                 group_device_indices,
-                                                                                 graph);
-    }
-
-    return result || a2a_capable;*/
-    return result;
-}
-
-template <ccl::device_topology_type class_id>
-bool device_group_ring_topology::build_scale_up_specific_topology(
-    std::ostream& out,
-    const ccl::context_comm_addr& comm_addr,
-    const ccl::device_indices_type& group_device_indices,
-    const detail::plain_graph_list& graph_list) {
-    out << "Start building topology: " << ::to_string(class_id)
-        << ", for graphs: " << graph_list.size() << "\n";
-    out << detail::to_string(graph_list);
-
-    size_t thread_id = comm_addr.thread_idx;
-    size_t graph_num = 0;
-    size_t index_offset = 0;
-
-    // create all required device wrappers
-    // these wrappers would be used for ALL context at the next iteration
-    ccl::device_indices_type total_device_indices;
-    for (const auto& graph : graph_list) {
-        total_device_indices.insert(graph.begin(), graph.end());
-    }
-    std::shared_ptr<specific_plain_device_storage> initial_group_gpu_comms =
-        devices_factory.create_devices_by_indices(thread_id, total_device_indices);
-    //set lobal devices size to topology
-    auto topology_comm_addr = comm_addr;
-    topology_comm_addr.comm_size = total_device_indices.size();
-    auto device_topology = std::make_shared<device_community<class_id>>(topology_comm_addr);
-
-    // make copy for wrappers, because other context should work with original structure
-    // but current context modified it (transform some wrappers into new scale_up_wrapper type)
-    std::shared_ptr<specific_plain_device_storage> group_gpu_comms =
-        std::make_shared<specific_plain_device_storage>(*initial_group_gpu_comms);
-    for (const auto& graph : graph_list) {
-        out << "\nStart indexer for graph num: " << graph_num << ", thread: " << thread_id
-            << std::endl;
-
-        detail::id_thread_table assigned_ids;
-        std::vector<detail::marked_idx> marked_id_ring = detail::create_marked(graph);
-        auto rank_builder =
-            create_device_functor<detail::graph_ring_indexer_unique_index<group_id(), class_id>>(
-                marked_id_ring,
-                assigned_ids,
-                thread_id,
-                device_topology->get_device_storage(),
-                index_offset,
-                0,
-                0);
-        // promote real-virtual device (right corner devices) in graphs up to scale_up_proxy type
-        // all local group devices in different graph would be linked by scale_up_proxy
-        // each local group ( in graph) must have at least one scale_up_proxy device
-        const ccl::device_index_type& last_in_graph_index = *graph.rbegin();
-        size_t inserted_device_type_index = detail::role_mod::inject_numa_device<
-            group_id(),
-            class_id,
-            device_group_context,
-            ccl_virtual_gpu_comm, /* `virtual` is better candiate*/
-            ccl_gpu_comm>(*group_gpu_comms, last_in_graph_index, context, devices_factory);
-        if (inserted_device_type_index == std::numeric_limits<size_t>::max()) {
-            assert(false && "Unsupported device type in topology creation");
-            std::ostringstream ss;
-            ss << out.rdbuf();
-            throw std::runtime_error(
-                std::string("Unsupported device type in topology creation. Log:\n") + ss.str());
-        }
-        out << "Inject numa device by order: " << inserted_device_type_index
-            << "\nby idx: " << last_in_graph_index << std::endl;
-        /* use plain (non-indexed) device wrapper list, which is allocated from device_storage
-         * in the following way:
-         *
-         * Try to iterate over all wrappers in that list for devices allocated for that device_group.
-         * Find wrapper by device_id in each 'graph' list.
-         * Offset for founded device from graph beginning give us 'rank' for founded device.
-         * By 'rank' - means logical rank in local device group ( local for ring kernel execution)
-         * Need to remember about previous graphs sizes, when calculate offset... total offset for founded * device in total graphs is a 'user rank' for device in process/cluster
-         */
-        ccl_tuple_for_each(*group_gpu_comms, rank_builder);
-
-        // just print partial topology progress for current 'graph'
-        detail::printer<group_id(), class_id> p;
-        ccl_tuple_for_each(device_topology->get_device_storage(), p);
-        out << "\nIndexer for graph num: " << graph_num++ << ", result: \n" << p.to_string();
-
-        index_offset += graph.size();
-    }
-
-    out << "\nFinished building topology: " << ::to_string(class_id) << std::endl;
-
-    // remember constructed topology
-    context.device_topology.get_community<class_id>().set_additiona_topology(device_topology);
-
-    detail::printer<group_id(), class_id> p;
-    ccl_tuple_for_each(device_topology->get_device_storage(), p);
-    out << "\nFinal topology: \n" << p.to_string();
-    return true;
-}
-
-bool device_group_ring_topology::build_scale_up_specific(
-    std::ostream& out,
-    const ccl::context_comm_addr& comm_addr,
-    const ccl::device_indices_type& group_device_indices,
-    const detail::plain_graph_list& graph_list,
-    const detail::adjacency_matrix& matrix) {
-    bool result = build_scale_up_specific_topology<ccl::device_topology_type::ring>(
-        out, comm_addr, group_device_indices, graph_list);
-    /*
-    // check a2a possibility
-    bool a2a_capable = true;
-    for (const auto& graph : graph_list)
-    {
-        a2a_capable &= detail::check_graph_a2a_capable(graph, matrix, out);
-    }
-
-    if (a2a_capable)
-    {
-        // a2a should starts from real device
-        // if do not reset, than it continue creation from existing ring devices
-        devices_factory.reset(comm_addr.thread_idx);  <--- AVOID because affects thread_group
-
-        a2a_capable =
-            build_scale_up_specific_topology<ccl::group_split_type::a2a_device_group>(
-                                                out,
-                                                comm_addr,
-                                                group_device_indices,
-                                                graph_list);
-    }
-
-    return result || a2a_capable;
-    */
-    return result;
-}
-} // namespace native
diff --git a/src/common/comm/l0/topology/ring/device_group_ring_creator.hpp b/src/common/comm/l0/topology/ring/device_group_ring_creator.hpp
deleted file mode 100644
index 26962f1ba..000000000
--- a/src/common/comm/l0/topology/ring/device_group_ring_creator.hpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/topology/topology_construction_utils.hpp"
-
-namespace ccl {
-struct context_comm_addr;
-}
-
-namespace native {
-
-class device_group_ring_topology {
-    device_group_context& context;
-    device_storage& devices_factory;
-
-public:
-    device_group_ring_topology(device_group_context& comm, device_storage& devs);
-
-    static constexpr const char* name() {
-        return "device_group_ring_creator";
-    }
-
-    static constexpr ccl::group_split_type group_id() {
-        return ccl::group_split_type::thread;
-    }
-
-    static size_t default_property_p2p_rating_calculator(const ccl_device& lhs,
-                                                         const ccl_device& rhs);
-    static detail::adjacency_matrix build_p2p_capability_matrix(
-        std::ostream& out,
-        const ccl::device_indices_type& group_device_indices,
-        detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
-
-    static detail::adjacency_matrix build_p2p_capability_matrix(
-        std::ostream& out,
-        const ccl::device_mask_t& group_device_masks,
-        detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
-    bool build(std::ostream& out,
-               const ccl::context_comm_addr& comm_addr,
-               const ccl::device_mask_t& group_device_masks,
-               const detail::adjacency_matrix& matrix);
-    bool build(std::ostream& out,
-               const ccl::context_comm_addr& comm_addr,
-               const ccl::device_indices_type& group_device_indices,
-               const detail::adjacency_matrix& matrix);
-
-private:
-    bool build_specific(std::ostream& out,
-                        const ccl::context_comm_addr& comm_addr,
-                        const ccl::device_indices_type& group_device_indices,
-                        const detail::plain_graph& graph,
-                        const detail::adjacency_matrix& matrix);
-
-    template <ccl::device_topology_type topology_type>
-    bool build_specific_topology(std::ostream& out,
-                                 const ccl::context_comm_addr& comm_addr,
-                                 const ccl::device_indices_type& group_device_indices,
-                                 const detail::plain_graph& graph);
-
-    bool build_scale_up_specific(std::ostream& out,
-                                 const ccl::context_comm_addr& comm_addr,
-                                 const ccl::device_indices_type& group_device_indices,
-                                 const detail::plain_graph_list& graph_list,
-                                 const detail::adjacency_matrix& matrix);
-
-    template <ccl::device_topology_type topology_type>
-    bool build_scale_up_specific_topology(std::ostream& out,
-                                          const ccl::context_comm_addr& comm_addr,
-                                          const ccl::device_indices_type& group_device_indices,
-                                          const detail::plain_graph_list& graph);
-};
-} // namespace native
diff --git a/src/common/comm/l0/topology/ring/process_group_ring_creator.cpp b/src/common/comm/l0/topology/ring/process_group_ring_creator.cpp
deleted file mode 100644
index 9f0334b4c..000000000
--- a/src/common/comm/l0/topology/ring/process_group_ring_creator.cpp
+++ /dev/null
@@ -1,2573 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "common/comm/l0/topology/ring/ring_construction_utils.hpp"
-#include "common/comm/l0/topology/ring/device_group_ring_creator.hpp"
-#include "native_device_api/compiler_ccl_wrappers_dispatcher.hpp"
-
-#include "common/comm/l0/topology/topology_serializer.hpp"
-#include "common/comm/host_communicator/host_communicator.hpp"
-
-namespace native {
-
-allied_process_group_ring_topology::allied_process_group_ring_topology(
-    size_t process_idx,
-    size_t process_nums,
-    process_group_context& ctx,
-    device_storage& devs,
-    size_t cluster_rank_offset,
-    size_t cluster_size,
-    const ccl::context_comm_addr& comm_addr)
-        : process_index(process_idx),
-          process_count(process_nums),
-          context(ctx),
-          devices(devs),
-          device_cluster_rank_offset(cluster_rank_offset),
-          device_cluster_size(cluster_size),
-          ctx_comm_addr(comm_addr) {}
-
-size_t allied_process_group_ring_topology::default_property_p2p_rating_calculator(
-    const ccl_device& lhs,
-    const ccl_device& rhs) {
-    return detail::property_p2p_rating_calculator(lhs, rhs, PROCESS_GROUP_WEIGHT);
-}
-
-std::pair<size_t, size_t> allied_process_group_ring_topology::calculate_rank_offset_with_size(
-    size_t process_id,
-    const std::string& host_id,
-    const ccl::cluster_aggregated_device_mask_t& cluster_affinity_mask) {
-    auto from_begin = [](const ccl::process_aggregated_device_mask_t& processes) ->
-        typename ccl::process_aggregated_device_mask_t::const_iterator {
-            return processes.begin();
-        };
-    auto from_my_rank = [process_id](const ccl::process_aggregated_device_mask_t& processes) ->
-        typename ccl::process_aggregated_device_mask_t::const_iterator {
-            return processes.lower_bound(process_id);
-        };
-
-    auto till_my_rank = from_my_rank;
-    auto till_end = [](const ccl::process_aggregated_device_mask_t& processes) ->
-        typename ccl::process_aggregated_device_mask_t::const_iterator {
-            return processes.end();
-        };
-
-    auto device_summator =
-        [](size_t part_sum,
-           const ccl::process_aggregated_device_mask_t::value_type& mask) -> size_t {
-        return part_sum + mask.second.count();
-    };
-
-    auto left_rank_summator =
-        [from_begin, till_my_rank, device_summator](
-            size_t part_sum,
-            const typename ccl::cluster_aggregated_device_mask_t::value_type& processes_pair)
-        -> size_t {
-        return std::accumulate(from_begin(processes_pair.second),
-                               till_my_rank(processes_pair.second),
-                               part_sum,
-                               device_summator);
-    };
-    auto right_rank_summator =
-        [from_my_rank, till_end, device_summator](
-            size_t part_sum,
-            const typename ccl::cluster_aggregated_device_mask_t::value_type& processes_pair)
-        -> size_t {
-        return std::accumulate(from_my_rank(processes_pair.second),
-                               till_end(processes_pair.second),
-                               part_sum,
-                               device_summator);
-    };
-    auto rank_summator =
-        [from_begin, till_end, device_summator](
-            size_t part_sum,
-            const typename ccl::cluster_aggregated_device_mask_t::value_type& processes_pair)
-        -> size_t {
-        return std::accumulate(from_begin(processes_pair.second),
-                               till_end(processes_pair.second),
-                               part_sum,
-                               device_summator);
-    };
-
-    //calculate ranks offset: summ of devices for each process for each node
-    //TODO node sorted by lexicographic comparison
-    auto my_node_it = cluster_affinity_mask.find(host_id);
-
-    size_t my_node_rank_devices_offset =
-        std::accumulate(cluster_affinity_mask.begin(), my_node_it, 0, rank_summator);
-    my_node_rank_devices_offset = std::accumulate(
-        my_node_it, std::next(my_node_it), my_node_rank_devices_offset, left_rank_summator);
-
-    size_t cluster_devices_count = std::accumulate(
-        my_node_it, std::next(my_node_it), my_node_rank_devices_offset, right_rank_summator);
-    cluster_devices_count = std::accumulate(
-        my_node_it, cluster_affinity_mask.end(), cluster_devices_count, rank_summator);
-    return { my_node_rank_devices_offset, cluster_devices_count };
-}
-
-detail::adjacency_matrix allied_process_group_ring_topology::build_p2p_capability_matrix(
-    std::ostream& out,
-    const ccl::process_aggregated_device_mask_t& node_device_masks,
-    detail::p2p_rating_function ping) {
-    ccl::process_device_indices_type per_process_device_indices;
-    for (const auto& mask : node_device_masks) {
-        per_process_device_indices.insert(
-            { mask.first, ccl_device_driver::get_device_indices(mask.second) });
-    }
-
-    return build_p2p_capability_matrix(out, per_process_device_indices, ping);
-}
-
-detail::adjacency_matrix allied_process_group_ring_topology::build_p2p_capability_matrix(
-    std::ostream& out,
-    const ccl::process_device_indices_type& node_device_indices,
-    detail::p2p_rating_function ping) {
-    // Build adjacency matrix with P2P capability:
-    // Rows & columnn is a device IDs ( froms 0 to CCL_GPU_DEVICES_AFFINITY_MASK_SIZE)
-    // element values - is a weight of P2P activity: 0 means - devices are not connected
-    // If values is not 0 - than two devies can be combined together
-
-    detail::adjacency_matrix ring_p2p_matrix;
-    if (node_device_indices.empty()) {
-        out << "No indices nothing to build" << std::endl;
-        return ring_p2p_matrix;
-    }
-
-    out << "Build adjacency matrix by: " << allied_process_group_ring_topology::name() << std::endl;
-    out << "Processes count: " << node_device_indices.size() << "\t";
-    out << "Delegate to thread group ring" << std::endl;
-    return thread_group_ring_topology::build_p2p_capability_matrix(out, node_device_indices, ping);
-}
-
-bool allied_process_group_ring_topology::build_all(
-    std::ostream& out,
-    const ccl::process_device_indices_type& per_thread_device_indices,
-    const detail::adjacency_matrix& matrix,
-    detail::p2p_rating_function ping) {
-    const std::string& threads_indices_str = ccl::to_string(per_thread_device_indices);
-    LOG_DEBUG("\n/************* \"",
-              allied_process_group_ring_topology::name(),
-              "\" for threads: ",
-              context.process_device_topology.size(),
-              "*************/\n");
-
-    LOG_DEBUG("Process: ", process_index, ", threads indices: ", threads_indices_str);
-    out << "Build process group device graphs, from threads: " << per_thread_device_indices.size()
-        << ", threads indices: \n"
-        << threads_indices_str << std::endl;
-
-    detail::plain_graph_list my_rings =
-        create_my_process_graphs(per_thread_device_indices, matrix, ping);
-    size_t size = my_rings.size();
-    LOG_DEBUG("Resolved graphs count: ", size, ", process_index: ", process_index);
-    if (!size) {
-        out << "Cannot build any ring" << std::endl;
-        return false;
-    }
-
-    {
-        const std::string& graph_to_str = detail::to_string(my_rings);
-        out << "Graph for process: " << process_index << "\n";
-        out << graph_to_str << std::endl;
-
-        LOG_DEBUG("Graph for process: ", process_index, " resolved:\n", graph_to_str);
-    }
-
-    out << "Transform graph to colored with process color: " << process_index << "\n";
-    detail::colored_plain_graph_list my_colored_ring =
-        detail::create_colored(my_rings, process_index);
-
-    detail::global_sorted_colored_plain_graphs global_graphs;
-    context.collect_cluster_colored_plain_graphs(my_colored_ring, global_graphs);
-
-    std::map<size_t, size_t> process_device_rank_offset;
-    size_t accumulated_offset = 0;
-
-    out << "Print ranks offset in cluster for global graphs: " << global_graphs.size() << std::endl;
-    for (typename detail::global_sorted_colored_plain_graphs::value_type& process_graphs :
-         global_graphs) {
-        size_t process_num = process_graphs.first;
-        const detail::colored_plain_graph_list& proc_graphs = process_graphs.second;
-
-        process_device_rank_offset[process_num] = accumulated_offset; //offset for iter process
-        out << "Process idx: " << process_num << ", rank_offset: " << accumulated_offset
-            << std::endl;
-        for (const detail::colored_plain_graph& graph : proc_graphs) {
-            accumulated_offset += graph.size();
-        }
-    }
-
-    out << "Cluster device size: " << accumulated_offset << std::endl;
-    detail::global_colored_plain_graphs merged_cluster_graphs =
-        merge_allied_nodes_in_colored_plain_graphs(
-            out, context.cluster_gpu_indices, process_index, process_count, global_graphs, ping);
-
-    const std::string& merged_cluster_graphs_str = detail::to_string(merged_cluster_graphs);
-    LOG_INFO("Cluster merged graphs process idx: ",
-             process_index,
-             " result:\n",
-             merged_cluster_graphs_str);
-    out << "Cluster merged graphs result on process idx: " << process_index << std::endl;
-    out << merged_cluster_graphs_str << std::endl;
-
-    detail::colored_plain_graph_list my_merged_rings = resize_merged_colored_graphs_for_process(
-        process_index, process_count, merged_cluster_graphs, my_colored_ring, out);
-
-    const std::string& my_merged_rings_str = detail::to_string(my_merged_rings);
-    LOG_INFO("Resized merged graph list on process idx: ",
-             process_index,
-             " result:\n",
-             my_merged_rings_str);
-    out << "Resized merged graph list on process idx: " << process_index << std::endl;
-
-    out << "Notify merged graphs changes for cluster\n";
-    detail::global_sorted_colored_plain_graphs global_merged_graphs;
-    context.collect_cluster_colored_plain_graphs(my_merged_rings, global_merged_graphs);
-
-    ccl::process_device_indices_type scaleout_devices =
-        create_scaleout_devices_in_colored_graphs_for_process(
-            process_index, process_count, global_merged_graphs, global_graphs, out);
-    const std::string& scaleout_devices_str = ccl::to_string(scaleout_devices);
-    LOG_INFO("Collected scaleout devices on process: ",
-             process_index,
-             " result:\n",
-             scaleout_devices_str);
-    out << "Collected scaleout_devices: \n";
-    out << scaleout_devices_str << std::endl;
-
-    ccl::process_device_indices_type ipc_devices = create_ipc_devices_in_colored_graphs_for_process(
-        process_index, process_count, global_merged_graphs, global_graphs, out);
-    const std::string& ipc_devices_str = ccl::to_string(ipc_devices);
-    LOG_INFO("Collected IPC devices on process: ", process_index, " result:\n", ipc_devices_str);
-    out << "Collected ipc_devices: \n";
-    out << ipc_devices_str << std::endl;
-
-    // enumerate as usual
-    if (scaleout_devices.empty()) {
-        size_t size = my_merged_rings.size();
-        out << "Resolved graphs count: " << size << "\n";
-        if (!size) {
-            out << "Cannot build any ring" << std::endl;
-            return false;
-        }
-        else if (size == 1) // whole ring
-        {
-            return build_specific_colored(out,
-                                          per_thread_device_indices,
-                                          ipc_devices,
-                                          *my_merged_rings.begin(),
-                                          process_device_rank_offset);
-        }
-        //torn-apart ring
-        return build_specific_scale_up(out,
-                                       per_thread_device_indices,
-                                       ipc_devices,
-                                       my_merged_rings,
-                                       process_device_rank_offset);
-    }
-    else if (ipc_devices.empty()) {
-        //pure scale-out
-        return build_specific_scale_out_only(out,
-                                             per_thread_device_indices,
-                                             scaleout_devices,
-                                             my_merged_rings,
-                                             process_device_rank_offset);
-    }
-    else {
-        throw std::runtime_error(
-            "torn-apart ring with scaleout\n"
-            "return build_specific_scale_up_out(out, per_thread_device_indices,\n"
-            "scaleout_devices, ipc_devices,\n"
-            "my_merged_rings, process_device_rank_offset)\n"
-            "UNSUPPORTED");
-    }
-    return false;
-}
-
-detail::plain_graph_list allied_process_group_ring_topology::create_my_process_graphs(
-    const ccl::process_device_indices_type& per_thread_device_indices,
-    const detail::adjacency_matrix& matrix,
-    detail::p2p_rating_function ping) {
-    return detail::graph_list_resolver(matrix, per_thread_device_indices, ping);
-}
-detail::global_sorted_plain_graphs allied_process_group_ring_topology::collect_cluster_plain_graphs(
-    std::ostream& out,
-    std::shared_ptr<ccl::host_communicator> comm,
-    size_t process_index,
-    const detail::plain_graph_list& my_process_graph) {
-    using namespace detail::serialize;
-
-    out << "Collect cluster plain graphs, my process index: " << process_index
-        << ", graphs count: " << my_process_graph.size() << std::endl;
-
-    std::vector<size_t> recv_process_indices_counts(comm->size(), 1);
-    device_path_serializable::raw_data_t my_serialized_graph =
-        device_path_serializer::serialize_indices(my_process_graph);
-
-    size_t send_count = my_serialized_graph.size();
-    std::vector<size_t> receive_process_graph_sizes(comm->size());
-
-    out << "Ask graph lists sizes by process index: " << process_index
-        << ", serialized size: " << send_count << std::endl;
-    ccl::stream::impl_value_t empty_stream{};
-    auto req = comm->allgatherv_impl(&send_count,
-                                     1,
-                                     receive_process_graph_sizes.data(),
-                                     recv_process_indices_counts,
-                                     empty_stream,
-                                     ccl::default_allgatherv_attr,
-                                     {});
-
-    req.wait();
-    size_t global_graph_data_size =
-        std::accumulate(receive_process_graph_sizes.begin(), receive_process_graph_sizes.end(), 0);
-
-    device_path_serializable::raw_data_t global_serialized_graph;
-    try {
-        out << "Send graph list by process index: " << process_index
-            << ", serialized size: " << send_count << std::endl;
-
-        global_serialized_graph.resize(global_graph_data_size);
-        req = comm->allgatherv_impl(reinterpret_cast<void*>(my_serialized_graph.data()),
-                                    send_count,
-                                    reinterpret_cast<void*>(global_serialized_graph.data()),
-                                    receive_process_graph_sizes,
-                                    ccl::datatype::int8,
-                                    empty_stream,
-                                    ccl::default_allgatherv_attr,
-                                    {});
-        req.wait();
-    }
-    catch (const std::exception& ex) {
-        out << "Cannot submit global-serialized-graph requests " << ex.what() << std::endl;
-        out << "Memory required for hostnames size: " << global_graph_data_size << " bytes\n";
-        abort();
-    }
-
-    size_t deserialized_bytes = 0;
-    size_t offset_bytes = 0;
-    detail::global_sorted_plain_graphs global_ret;
-
-    out << "Deserialize graph_lists" << std::endl;
-    for (size_t i = 0; i < static_cast<size_t>(comm->size()); i++) {
-        detail::plain_graph_list graph = device_path_deserializer::deserialize_graph_list_indices(
-            global_serialized_graph, deserialized_bytes, offset_bytes);
-        out << "Process index: " << i << ", deserialized bytes: " << deserialized_bytes
-            << ", by offset: " << offset_bytes << std::endl;
-
-        global_ret.emplace(i, std::move(graph));
-    }
-
-    out << "Global graph deserialized on process: " << process_index << std::endl;
-    return global_ret;
-}
-
-detail::global_sorted_colored_plain_graphs
-allied_process_group_ring_topology::collect_cluster_colored_plain_graphs(
-    std::ostream& out,
-    std::shared_ptr<ccl::host_communicator> comm,
-    size_t process_index,
-    const detail::colored_plain_graph_list& my_process_graph) {
-    using namespace detail::serialize;
-
-    out << "Collect cluster colored plain graphs, my process index: " << process_index
-        << ", graphs count: " << my_process_graph.size() << std::endl;
-
-    std::vector<size_t> recv_process_indices_counts(comm->size(), 1);
-    device_path_serializable::raw_data_t my_serialized_graph =
-        device_path_serializer::serialize_indices(my_process_graph);
-
-    size_t send_count = my_serialized_graph.size();
-    std::vector<size_t> receive_process_graph_sizes(comm->size());
-
-    out << "Ask graph lists sizes by process index: " << process_index
-        << ", serialized size: " << send_count << std::endl;
-    ccl::stream::impl_value_t empty_stream{};
-    auto req = comm->allgatherv_impl(&send_count,
-                                     1,
-                                     receive_process_graph_sizes.data(),
-                                     recv_process_indices_counts,
-                                     empty_stream,
-                                     ccl::default_allgatherv_attr,
-                                     {});
-
-    req.wait();
-    size_t global_graph_data_size =
-        std::accumulate(receive_process_graph_sizes.begin(), receive_process_graph_sizes.end(), 0);
-
-    device_path_serializable::raw_data_t global_serialized_graph;
-    try {
-        out << "Send graph list by process index: " << process_index
-            << ", serialized size: " << send_count << std::endl;
-
-        global_serialized_graph.resize(global_graph_data_size);
-        req = comm->allgatherv_impl(reinterpret_cast<void*>(my_serialized_graph.data()),
-                                    send_count,
-                                    reinterpret_cast<void*>(global_serialized_graph.data()),
-                                    receive_process_graph_sizes,
-                                    ccl::datatype::int8,
-                                    empty_stream,
-                                    ccl::default_allgatherv_attr,
-                                    {});
-        req.wait();
-    }
-    catch (const std::exception& ex) {
-        out << "Cannot submit global-serialized-graph requests " << ex.what() << std::endl;
-        out << "Memory required for hostnames size: " << global_graph_data_size << " bytes\n";
-        abort();
-    }
-
-    size_t deserialized_bytes = 0;
-    size_t offset_bytes = 0;
-    detail::global_sorted_colored_plain_graphs global_ret;
-
-    out << "Deserialize colored_graph_lists" << std::endl;
-    for (size_t i = 0; i < static_cast<size_t>(comm->size()); i++) {
-        detail::colored_plain_graph_list graph =
-            device_path_deserializer::deserialize_colored_graph_list_indices(
-                global_serialized_graph, deserialized_bytes, offset_bytes);
-        out << "Process index: " << i << ", deserialized bytes: " << deserialized_bytes
-            << ", by offset: " << offset_bytes << std::endl;
-
-        global_ret.emplace(i, std::move(graph));
-    }
-
-    out << "Global colored_graph deserialized on process: " << process_index << std::endl;
-    return global_ret;
-}
-
-detail::global_plain_graphs allied_process_group_ring_topology::merge_allied_nodes_plain_graphs(
-    std::ostream& out,
-    const ccl::cluster_device_indices_type& cluster_indices,
-    size_t process_index,
-    const detail::global_sorted_plain_graphs& cluster_graphs,
-    detail::p2p_rating_function ping) {
-    out << "Merge global graphs from processes: " << cluster_graphs.size() << std::endl;
-    detail::global_plain_graphs ret;
-    for (const auto& host_process_id_pair : cluster_indices) {
-        const ccl::host_id& hostname = host_process_id_pair.first;
-
-        //iterate over all allied processes on the same host
-        const ccl::process_device_indices_type& processes = host_process_id_pair.second;
-        out << "Try to merge graphs for host: " << hostname
-            << ", allied processes count: " << processes.size() << std::endl;
-
-        //collect graphs for all allied processes in lists for merge trying
-        std::list<detail::plain_graph_list> tmp_allied_processes_graphs;
-        for (const auto& process_val : processes) {
-            auto process_id = process_val.first;
-            auto process_graph_list_it = cluster_graphs.find(process_id);
-            if (process_graph_list_it == cluster_graphs.end()) {
-                out << "Cannot find process id: " << process_id << ", for hostname: " << hostname
-                    << ", in cluster graphs\n";
-                std::stringstream ss;
-                ss << out.rdbuf();
-                throw std::runtime_error(std::string("Cannot merge custer graphs. Log:\n") +
-                                         ss.str());
-            }
-            tmp_allied_processes_graphs.emplace_back(process_graph_list_it->second);
-        }
-
-        //merge and set result for all allied processes
-        for (const auto& process_val : processes) {
-            //merge_lists is stable, let's my process graph list at first in merge result
-            std::list<detail::plain_graph_list> rotated = tmp_allied_processes_graphs;
-            /* TODO rotate ? */
-            auto process_index = process_val.first;
-
-            auto new_begin_it = rotated.begin();
-            std::advance(new_begin_it, process_index);
-            std::rotate(rotated.begin(), new_begin_it, rotated.end());
-
-            ret.push_back(
-                std::make_pair(process_val.first, detail::merge_graph_lists_stable(rotated, ping)));
-        }
-
-        out << "graph merged into list, size: " << ret.size() << std::endl;
-    }
-    return ret;
-}
-
-detail::global_colored_plain_graphs
-allied_process_group_ring_topology::merge_allied_nodes_in_colored_plain_graphs(
-    std::ostream& out,
-    const ccl::cluster_device_indices_type& cluster_indices,
-    size_t process_index,
-    size_t process_count,
-    const detail::global_sorted_colored_plain_graphs& cluster_graphs,
-    detail::p2p_rating_function ping) {
-    out << "Merge global colored graphs from processes: " << cluster_graphs.size() << std::endl;
-    detail::global_colored_plain_graphs ret;
-    for (const auto& host_process_id_pair : cluster_indices) {
-        const ccl::host_id& hostname = host_process_id_pair.first;
-
-        //iterate over all allied processes on the same host
-        const ccl::process_device_indices_type& processes = host_process_id_pair.second;
-        out << "Try to merge colored graphs for host: " << hostname
-            << ", allied processes count: " << processes.size() << std::endl;
-
-        //collect graphs for all allied processes in lists for merge trying
-        std::list<detail::colored_plain_graph_list> tmp_allied_processes_graphs;
-
-        size_t terminator_process_index = 0; // TODO LIMITATION on MAX PROCESSES COUNT
-        for (const auto& process_val : processes) {
-            auto process_id = process_val.first;
-            auto process_graph_list_it = cluster_graphs.find(process_id);
-            if (process_graph_list_it == cluster_graphs.end()) {
-                out << "Cannot find process id: " << process_id << ", for hostname: " << hostname
-                    << ", in cluster graphs\n";
-                std::stringstream ss;
-                ss << out.rdbuf();
-
-                assert(false);
-                throw std::runtime_error(std::string("Cannot merge colored custer graphs. Log:\n") +
-                                         ss.str());
-            }
-            tmp_allied_processes_graphs.emplace_back(process_graph_list_it->second);
-
-            terminator_process_index = std::max(process_val.first, terminator_process_index);
-        }
-
-        terminator_process_index++;
-        out << "terminator_process_index: " << terminator_process_index;
-
-        //merge and set result for all allied processes
-        for (const auto& process_val : processes) {
-            //merge_lists is stable, let's my process graph list at first in merge result
-            auto process_index = process_val.first;
-
-            //turn right
-            auto new_begin_it = tmp_allied_processes_graphs.begin();
-            std::advance(new_begin_it, process_index);
-            std::list<detail::colored_plain_graph_list> to_right_part(
-                new_begin_it, tmp_allied_processes_graphs.end());
-
-            //use terminator!
-            if (processes.size() != 1) {
-                if (process_index == processes.size() - 1) {
-                    //set terminator for right side
-                    detail::colored_plain_graph_list terminated_list =
-                        *tmp_allied_processes_graphs.begin();
-                    reset_color(terminated_list, terminator_process_index);
-                    to_right_part.push_back(std::move(terminated_list));
-                }
-            }
-
-            size_t merged_from_right = 0;
-            detail::colored_plain_graph_list to_right =
-                detail::merge_graph_lists_stable_for_process(
-                    to_right_part, ping, true, merged_from_right);
-            if (to_right.empty()) //i am the rightest process
-            {
-                to_right = *new_begin_it;
-            }
-
-            //turn left
-            size_t merged_from_left = 0;
-            auto new_end_it = tmp_allied_processes_graphs.begin();
-            std::advance(new_end_it, process_index + 1);
-            std::list<detail::colored_plain_graph_list> to_left_part(
-                tmp_allied_processes_graphs.begin(), new_end_it);
-            std::reverse(to_left_part.begin(), to_left_part.end());
-            if (to_left_part.empty()) {
-                to_left_part.push_back(to_right);
-            }
-            else {
-                *to_left_part.begin() = to_right;
-            }
-
-            //use terminator!
-            if (processes.size() != 1) {
-                if (process_index == 0) {
-                    //set terminator for right side
-                    detail::colored_plain_graph_list terminated_list =
-                        *tmp_allied_processes_graphs.rbegin();
-                    reset_color(terminated_list, terminator_process_index);
-                    to_left_part.push_back(std::move(terminated_list));
-                }
-            }
-            for (auto& graph : to_left_part) {
-                std::reverse(graph.begin(), graph.end());
-            }
-            *to_left_part.begin() = to_right;
-
-            detail::colored_plain_graph_list to_left_right =
-                detail::merge_graph_lists_stable_for_process(
-                    to_left_part, ping, false, merged_from_left);
-            ret.push_back(std::make_pair(process_val.first, to_left_right));
-        }
-
-        out << "colored graph merged into list, size: " << ret.size() << std::endl;
-    }
-    return ret;
-}
-
-detail::plain_graph_list allied_process_group_ring_topology::resize_merged_graphs_for_process(
-    size_t process_index,
-    const detail::global_plain_graphs& merged_cluster_graphs,
-    const detail::plain_graph_list& original_graph_list,
-    std::ostream& out) {
-    out << "remove foreign chains from my merged graphs for process idx: " << process_index << "\n";
-    auto it =
-        std::find_if(merged_cluster_graphs.begin(),
-                     merged_cluster_graphs.end(),
-                     [process_index](const typename detail::global_plain_graphs::value_type& val) {
-                         return val.first == process_index;
-                     });
-    if (it == merged_cluster_graphs.end()) {
-        out << "Cannot find process: " << process_index
-            << " in merged_cluster_graphs with size: " << merged_cluster_graphs.size() << std::endl;
-        std::stringstream ss;
-        ss << out.rdbuf();
-        assert(false);
-        throw std::runtime_error(std::string("Cannot resize custer graphs. Log:\n") + ss.str());
-    }
-
-    detail::plain_graph_list my_merged_rings_copy = it->second;
-    {
-        size_t new_size = my_merged_rings_copy.size();
-        size_t old_size = original_graph_list.size();
-
-        out << "Check ring sizes, before: " << old_size << ", after: " << new_size << std::endl;
-        if (old_size > new_size) {
-            abort();
-        }
-
-        auto merged_erased_range_it = my_merged_rings_copy.begin();
-        std::advance(merged_erased_range_it, old_size);
-        my_merged_rings_copy.erase(merged_erased_range_it, my_merged_rings_copy.end());
-    }
-    return my_merged_rings_copy;
-}
-
-detail::colored_plain_graph_list
-allied_process_group_ring_topology::resize_merged_colored_graphs_for_process(
-    size_t process_index,
-    size_t process_size,
-    const detail::global_colored_plain_graphs& merged_cluster_graphs,
-    const detail::colored_plain_graph_list& original_graph_list,
-    std::ostream& out) {
-    out << "remove foreign chains from my colored merged graphs for process idx: " << process_index
-        << "\n";
-    auto it = std::find_if(
-        merged_cluster_graphs.begin(),
-        merged_cluster_graphs.end(),
-        [process_index](const typename detail::global_colored_plain_graphs::value_type& val) {
-            return val.first == process_index;
-        });
-    if (it == merged_cluster_graphs.end()) {
-        out << "Cannot find process: " << process_index
-            << " in merged_cluster_graphs with size: " << merged_cluster_graphs.size() << std::endl;
-        std::stringstream ss;
-        ss << out.rdbuf();
-        throw std::runtime_error(std::string("Cannot resize colored custer graphs. Log:\n") +
-                                 ss.str());
-    }
-
-    detail::colored_plain_graph_list my_merged_rings_copy = it->second;
-    {
-        size_t new_size = my_merged_rings_copy.size();
-        size_t old_size = original_graph_list.size();
-
-        out << "Check ring sizes, before: " << old_size << ", after: " << new_size << std::endl;
-        if (old_size > new_size) {
-            abort();
-        }
-
-        auto merged_erased_range_it = my_merged_rings_copy.begin();
-        std::advance(merged_erased_range_it, old_size);
-        my_merged_rings_copy.erase(merged_erased_range_it, my_merged_rings_copy.end());
-    }
-
-    //sort graphs by process id
-    /*
-    for(auto& graph : my_merged_rings_copy)
-    {
-        std::stable_sort(graph.begin(), graph.end(), [process_index, process_size]
-                                                        (const detail::colored_idx& lhs,
-                                                         const detail::colored_idx& rhs)
-        {
-            //size_t right_index = (process_index + 1 ) % process_size;
-            //size_t left_index = ( process_index == 0 ?  process_size : process_index - 1);
-            return (lhs.first < rhs.first); //stable sort by color!
-        });
-    }
-*/
-    return my_merged_rings_copy;
-}
-
-ccl::process_device_indices_type
-allied_process_group_ring_topology::create_scaleout_devices_in_graphs_for_process(
-    size_t process_idx,
-    size_t cluster_size,
-    detail::global_sorted_plain_graphs& cluster_graphs,
-    std::ostream& out) {
-    size_t left_process_idx = (process_idx == 0 ? cluster_size - 1 : process_idx - 1);
-    size_t right_process_idx = ((process_idx + 1) % cluster_size);
-
-    out << "Create scaleout devices for process: (" << process_idx << "/" << cluster_size << ")"
-        << ", left_process_idx: " << left_process_idx
-        << ", right_process_idx: " << right_process_idx << std::endl;
-
-    ccl::process_device_indices_type scaleout_devices;
-    auto me = cluster_graphs.find(process_idx)->second;
-
-    if (process_idx > left_process_idx) {
-        auto lhs = cluster_graphs.find(left_process_idx)->second;
-        auto find_shared_graph_it = std::find(lhs.begin(), lhs.end(), *me.begin());
-        if (find_shared_graph_it == lhs.end()) {
-            const ccl::device_index_type& scaleout = *(lhs.rbegin()->rbegin());
-            out << "scaleout candidate from Lhs: " << scaleout << std::endl;
-            me.insert(me.begin(), { { scaleout } });
-            scaleout_devices[left_process_idx] = { scaleout };
-        }
-    }
-
-    if (process_idx < right_process_idx) {
-        auto rhs = cluster_graphs.find(right_process_idx)->second;
-        auto find_shared_graph_it = std::find(rhs.begin(), rhs.end(), *me.rbegin());
-        if (find_shared_graph_it == rhs.end()) {
-            const ccl::device_index_type& scaleout = *(rhs.begin()->begin());
-            out << "scaleout candidate from Rhs: " << scaleout << std::endl;
-            me.insert(me.end(), { { scaleout } });
-            scaleout_devices[right_process_idx] = { scaleout };
-        }
-    }
-
-    return scaleout_devices;
-}
-
-ccl::process_device_indices_type
-allied_process_group_ring_topology::create_scaleout_devices_in_colored_graphs_for_process(
-    size_t process_idx,
-    size_t cluster_size,
-    detail::global_sorted_colored_plain_graphs& cluster_graphs,
-    detail::global_sorted_colored_plain_graphs& initial_cluster_graphs,
-    std::ostream& out)
-
-{
-    using optional_process = std::pair<bool, size_t>;
-
-    optional_process left_process_idx =
-        std::make_pair(true, (process_idx == 0 ? cluster_size - 1 : process_idx - 1));
-    optional_process right_process_idx = std::make_pair(true, ((process_idx + 1) % cluster_size));
-
-    out << "Create scaleout devices for process: (" << process_idx << "/" << cluster_size << ")"
-        << ", left_process_idx: " << left_process_idx.second
-        << ", right_process_idx: " << right_process_idx.second << std::endl;
-
-    ccl::process_device_indices_type scaleout_devices;
-    // process corner cases
-    if (left_process_idx == right_process_idx) {
-        //two processes
-        if (process_idx > left_process_idx.second) {
-            left_process_idx.first = false; //do not process left
-        }
-        else {
-            right_process_idx.first = false; //do not process right
-        }
-    }
-
-    if (left_process_idx.second == process_idx and process_idx == right_process_idx.second) {
-        return scaleout_devices; //nothing to scaleout
-    }
-
-    auto& me = cluster_graphs.find(process_idx)->second;
-
-    std::unique_ptr<size_t> color_to_find(new size_t);
-    auto find_in_list_by_color =
-        [&color_to_find](const detail::colored_plain_graph& graph) -> bool {
-        auto it = std::find_if(
-            graph.begin(), graph.end(), [&color_to_find](const detail::colored_idx& idx) {
-                return (idx.color == *color_to_find);
-            });
-        return it != graph.end();
-    };
-
-    if (left_process_idx.first) {
-        // find lhs in my graphs
-        *color_to_find = left_process_idx.second;
-        if (process_idx == 0) {
-            //use terminate
-            *color_to_find = cluster_size;
-        }
-
-        if (std::find_if(me.begin(), me.end(), find_in_list_by_color) == me.end()) {
-            //add scaleout device
-            auto lhs_it = initial_cluster_graphs.find(left_process_idx.second);
-            if (lhs_it == initial_cluster_graphs.end()) {
-                assert(false && "lhs process doesn't exist");
-                throw std::runtime_error(std::string(__FUNCTION__) +
-                                         " - invalid cluster_graph: " + "no process by id: " +
-                                         std::to_string(left_process_idx.second));
-            }
-
-            const auto& lhs = lhs_it->second;
-            if (lhs.empty()) {
-                assert(false && "lhs process graph is empty ");
-                throw std::runtime_error(
-                    std::string(__FUNCTION__) + " - invalid cluster_graph: empty list " +
-                    "for process by id: " + std::to_string(left_process_idx.second));
-            }
-            const ccl::device_index_type& scaleout = (lhs.rbegin()->rbegin())->index;
-            out << "scaleout candidate from Lhs: " << scaleout << std::endl;
-            //me.insert(me.begin(), { {left_process_idx.second, scaleout}});
-            scaleout_devices[left_process_idx.second] = { scaleout };
-        }
-    }
-
-    if (right_process_idx.first) {
-        // find rhs in my graphs
-        *color_to_find = right_process_idx.second;
-        if (process_idx == cluster_size - 1) {
-            //use terminate
-            *color_to_find = cluster_size;
-        }
-
-        if (std::find_if(me.begin(), me.end(), find_in_list_by_color) == me.end()) {
-            //add scaleout device
-            auto rhs_it = initial_cluster_graphs.find(right_process_idx.second);
-            if (rhs_it == initial_cluster_graphs.end()) {
-                assert(false && "rhs process doesn't exist");
-                throw std::runtime_error(std::string(__FUNCTION__) +
-                                         " - invalid cluster_graph: " + "no process by id: " +
-                                         std::to_string(right_process_idx.second));
-            }
-
-            const auto& rhs = rhs_it->second;
-            if (rhs.empty()) {
-                assert(false && "rhs process graph is empty ");
-                throw std::runtime_error(
-                    std::string(__FUNCTION__) + " - invalid cluster_graph: empty list " +
-                    "for process by id: " + std::to_string(right_process_idx.second));
-            }
-            const ccl::device_index_type& scaleout = (rhs.begin()->begin())->index;
-            out << "scaleout candidate from Lhs: " << scaleout << std::endl;
-            //me.insert(me.end(), {{right_process_idx.second, scaleout}});
-            scaleout_devices[right_process_idx.second] = { scaleout };
-        }
-    }
-
-    return scaleout_devices;
-}
-
-ccl::process_device_indices_type
-allied_process_group_ring_topology::create_ipc_devices_in_colored_graphs_for_process(
-    size_t process_idx,
-    size_t cluster_size,
-    detail::global_sorted_colored_plain_graphs& cluster_graphs,
-    detail::global_sorted_colored_plain_graphs& initial_cluster_graphs,
-    std::ostream& out) {
-    (void)initial_cluster_graphs;
-
-    using optional_process = std::pair<bool, size_t>;
-
-    optional_process left_process_idx =
-        std::make_pair(true, (process_idx == 0 ? cluster_size /* - 1 */ : process_idx - 1));
-    optional_process right_process_idx =
-        std::make_pair(true, process_idx + 1 /*((process_idx + 1) % cluster_size)*/);
-
-    out << "Create IPC devices for process: (" << process_idx << "/" << cluster_size << ")"
-        << ", left_process_idx: " << left_process_idx.second
-        << ", right_process_idx: " << right_process_idx.second << std::endl;
-
-    ccl::process_device_indices_type ipc_devices;
-    // process corner cases
-    /*
-    if (left_process_idx == right_process_idx) {
-        //two processes
-        if (process_idx > left_process_idx.second) {
-            left_process_idx.first = false; //do not process left
-        }
-        else {
-            right_process_idx.first = false; //do not process right
-        }
-    }
-    */
-    if (left_process_idx.second == process_idx and process_idx == right_process_idx.second) {
-        return ipc_devices; //nothing to ipc
-    }
-
-    auto& me = cluster_graphs.find(process_idx)->second;
-
-    std::unique_ptr<size_t> color_to_find(new size_t);
-    std::vector<detail::colored_idx> devices_to_remember;
-
-    //TODO limitation: all graphs ipc devices would be merged into one vector
-    auto filter_list_by_color =
-        [&color_to_find, &devices_to_remember](const detail::colored_plain_graph& graph) -> void {
-        std::copy_if(graph.begin(),
-                     graph.end(),
-                     std::back_inserter(devices_to_remember),
-                     [&color_to_find](const detail::colored_idx& idx) {
-                         return (idx.color == *color_to_find);
-                     });
-    };
-
-    if (left_process_idx.first) {
-        // find lhs color in my graphs
-        *color_to_find = left_process_idx.second;
-        devices_to_remember.clear();
-        if (process_idx == 0) {
-            //use terminate
-            *color_to_find = cluster_size;
-        }
-
-        //fill ipc devices candidates in devices_to_remember
-        std::for_each(me.begin(), me.end(), filter_list_by_color);
-        if (!devices_to_remember.empty()) {
-            const ccl::device_index_type& ipc = devices_to_remember.rbegin()->index;
-            out << "ipc candidate from LHS: " << ipc << ", color: " << left_process_idx.second
-                << std::endl;
-            ipc_devices[left_process_idx.second] = { ipc };
-        }
-    }
-
-    if (right_process_idx.first) {
-        // find rhs in my graphs
-        *color_to_find = right_process_idx.second;
-        devices_to_remember.clear();
-        if (process_idx == cluster_size - 1) {
-            //use terminate
-            *color_to_find = cluster_size;
-        }
-
-        //fill ipc devices candidates in devices_to_remember
-        std::for_each(me.begin(), me.end(), filter_list_by_color);
-        if (!devices_to_remember.empty()) {
-            const ccl::device_index_type& ipc = devices_to_remember.begin()->index;
-            out << "ipc candidate from RHS: " << ipc << ", color: " << right_process_idx.second
-                << std::endl;
-            ipc_devices[right_process_idx.second] = { ipc };
-        }
-    }
-
-    return ipc_devices;
-}
-
-// Well tested topology creator
-bool allied_process_group_ring_topology::build_specific_colored(
-    std::ostream& out,
-    const ccl::process_device_indices_type& per_thread_device_indices,
-    const ccl::process_device_indices_type& ipc_device_indices,
-    detail::colored_plain_graph& id_ring,
-    const std::map<size_t, size_t>& process_device_rank_offset) {
-    //continuous ring, without scale-up devices
-    //processes connected using IPC devices
-    //Rank = Index
-    constexpr ccl::device_topology_type topology_type = ccl::device_topology_type::ring;
-
-    out << "Start building topology: " << ::to_string(topology_type) << ", for colored graph:\n"
-        << detail::to_string(id_ring) << std::endl;
-
-    // id_ring - inter-thread ring
-    out << "\nStart indexer:" << std::endl;
-
-    // get in-process devices rank offset in cluster map
-    auto offset_it = process_device_rank_offset.find(process_index);
-    if (offset_it == process_device_rank_offset.end()) {
-        assert(false && "");
-    }
-
-    size_t device_rank_offset = offset_it->second;
-    ////////////////////////////////////////////////////////////////////////////////////////////////////
-    // let's start IPC devices search & creation
-    // TODO
-    // We need upgrade algo for detection IPC destination devices, which belong to specific thread in process
-    // Currently the final thread in list owns IPC device
-    /////////////////////////////////////////////////////////////////////////////////////////////////////
-    out << "global rank offset: " << device_rank_offset << std::endl;
-    auto& ctx_per_thread_data = context.process_device_topology;
-    auto topology_comm_addr = ctx_comm_addr;
-    topology_comm_addr.comm_size = device_cluster_size;
-
-    // remember ring first position, which has not termination color, but actual process id
-    // It's because after merge id_rings routine we got merged id_rings
-    // `merged` means:
-    // from left side of list we have IPC devices for left process relation to current process
-    // from right side of list we also have IPC devices for right process relation to current process
-    // SO, ir_rings starts NOT from existing process devices
-    auto local_proc_ring_start =
-        std::find_if(id_ring.begin(), id_ring.end(), [this](native::detail::colored_idx& val) {
-            //return (val.color != process_count); / / first not terminator index
-            return (val.color == process_index); // first not terminator index
-        });
-    auto id_ring_begin = id_ring.begin();
-    size_t distance = std::distance(id_ring_begin, local_proc_ring_start);
-
-    LOG_DEBUG("apply index builder for local thread context, threads count: ",
-              ctx_per_thread_data.size(),
-              ", process indices ring offset: ",
-              distance);
-    for (auto per_thread_it = ctx_per_thread_data.begin();
-         per_thread_it != ctx_per_thread_data.end();
-         ++per_thread_it) {
-        size_t thread_id = per_thread_it->first; // first
-        const auto& thread_dev_indices = per_thread_device_indices.find(thread_id)->second;
-
-        /**Initialize empty topologies**/
-        if (context.get_process_topology<topology_type>(process_index, thread_id)
-                .closed_rings.empty()) {
-            context.get_process_topology<topology_type>(process_index, thread_id)
-                .set_topology(
-                    std::make_shared<device_community<topology_type>>(topology_comm_addr));
-        }
-
-        // Get reference on OUT-enumerated devices array
-        auto& out_indexed_devices =
-            context.get_process_topology<topology_type>(process_index,
-                                                        thread_id)
-                .get_topology()
-                ->get_device_storage(); // just second
-
-        // Get IN-non-enumerated devices for current thread
-        std::shared_ptr<specific_plain_device_storage> non_indexed_plain_devices =
-            devices.thread_gpu_comms.find(thread_id)->second;
-
-        //allocate IPC devices pool(if needed)
-        LOG_DEBUG("LIMITATION: Allocate IPC pool for the LAST thread: ",
-                  thread_id,
-                  ", ipc_device_indices cound: ",
-                  ipc_device_indices.size());
-        detail::cluster_ipc_devices_pool ipc_comms;
-        if (thread_id ==
-            ctx_per_thread_data.size() - 1) //TODO only final thread owns IPC devies at now
-        {
-            ipc_comms =
-                detail::create_filtered_ipc_destination_gpu_comms<group_id(), topology_type>(
-                    id_ring,
-                    ipc_device_indices,
-                    process_index,
-                    process_count,
-                    context,
-                    devices,
-                    *non_indexed_plain_devices);
-        }
-        LOG_DEBUG("Create indexer builder for process index: ",
-                  process_index,
-                  ", process_count: ",
-                  process_count,
-                  ", device rank offset: ",
-                  device_rank_offset,
-                  ", thread devices: ",
-                  thread_dev_indices.size(),
-                  ", ipc_device_indices count: ",
-                  ipc_device_indices.size());
-
-        // Rank builder operaes on IN-non-enumarated devices:
-        // 1) find its position (using ID comparison & color) in id_ring
-        // 2) calculate operational rank as position offset  in id_ring
-        // 3) calcuate operations size as whole id_ring size
-        // 4) adjust ranks & size to actual using `device_rank_offset` specifically for process
-        // 5) put enumerated device into OUT-enumerated devices array `out_indexed_devices`
-        auto rank_builder = create_device_functor<
-            detail::smart_ring_indexer<group_id(), topology_type, process_group_context>>(
-            id_ring,
-            process_index,
-            process_count,
-            device_rank_offset,
-            ipc_comms.size()
-                ? 1
-                : 0, /* TODO self closed ring - only one id_ring for all:   prev_proc: me_proc: next_proc   - prev = next, exclude one*/
-            devices,
-            out_indexed_devices,
-            ipc_device_indices,
-            ccl::process_device_indices_type{},
-            local_proc_ring_start,
-            context);
-        //start indexer
-        ccl_tuple_for_each(*non_indexed_plain_devices, rank_builder);
-
-        detail::printer<group_id(), topology_type> p;
-        ccl_tuple_for_each(out_indexed_devices, p);
-
-        {
-            std::stringstream ss;
-            ss << "Indexer result for devices in thread idx (" << thread_id << "/"
-               << ctx_per_thread_data.size() << "):\n"
-               << p.to_string() << std::endl;
-            const std::string& str = ss.str();
-            LOG_DEBUG(str);
-            out << str;
-        }
-    }
-
-    out << "\nStart ring builder" << std::endl;
-    LOG_DEBUG("Start ring builder for threads: ", ctx_per_thread_data.size());
-    for (size_t current_thread_idx = 0; current_thread_idx < ctx_per_thread_data.size();
-         current_thread_idx++) {
-        // find max rank in current thread device list
-        auto& indexed_devices_for_current_thread =
-            context.get_process_topology<topology_type>(process_index, current_thread_idx)
-                .get_topology()
-                ->get_device_storage();
-        const auto& curr_real =
-            detail::get_device_with_min_rank<ccl_gpu_comm, group_id(), topology_type>(
-                indexed_devices_for_current_thread, id_ring);
-        const auto& curr_virt =
-            detail::get_device_with_min_rank<ccl_virtual_gpu_comm, group_id(), topology_type>(
-                indexed_devices_for_current_thread, id_ring);
-
-        size_t tg_max_rank = std::max({ std::get<0>(curr_real), std::get<0>(curr_virt) });
-
-        // find thread, which will connect to current thread max rank with next_rank
-        size_t next_rank = (tg_max_rank + 1) % id_ring.size();
-
-        {
-            std::stringstream ss;
-
-            ss << "Current thread: " << current_thread_idx
-               << ", max rank candidates: " << std::get<0>(curr_real) << ", "
-               << std::get<0>(curr_virt) << ", selected max rank: " << tg_max_rank
-               << ", expected next_rank: " << next_rank << std::endl;
-            const std::string str = ss.str();
-            LOG_DEBUG(str);
-            out << str;
-        }
-
-        //Find in local threads at first
-        bool find_in_current_process = false;
-        for (size_t next_thread_id = 0; next_thread_id < ctx_per_thread_data.size();
-             next_thread_id++) {
-            if (next_thread_id == current_thread_idx) {
-                // wrong thread, get next
-                continue;
-            }
-
-            // search next_rank in that thread
-            auto& next_thread_ring_topology =
-                context.get_process_topology<topology_type>(process_index, next_thread_id)
-                    .get_topology()
-                    ->get_device_storage();
-            const auto& real =
-                detail::get_device_with_max_rank<ccl_gpu_comm, group_id(), topology_type>(
-                    next_thread_ring_topology, id_ring);
-            const auto& virt =
-                detail::get_device_with_max_rank<ccl_virtual_gpu_comm, group_id(), topology_type>(
-                    next_thread_ring_topology, id_ring);
-
-            if (next_rank != std::min({ std::get<0>(real), std::get<0>(virt) })) {
-                // wrong thread, get next
-                continue;
-            }
-
-            {
-                std::stringstream ss;
-                ss << "next thread: " << next_thread_id
-                   << ", min rank candidates: " << std::get<0>(real) << ", " << std::get<0>(virt)
-                   << std::endl;
-
-                const std::string str = ss.str();
-                LOG_DEBUG(str);
-                out << str;
-            }
-
-            find_in_current_process = true;
-            out << "Lock ring for threads (" << current_thread_idx << " <-> " << next_thread_id
-                << ")" << std::endl;
-            if (next_rank == std::get<0>(real)) {
-                auto locker =
-                    detail::add_concurrent_locker_device<ccl_gpu_comm, group_id(), topology_type>(
-                        next_rank, 0, real, devices, indexed_devices_for_current_thread);
-                out << "Added real locker by index: " << next_rank
-                    << ", for thread idx: " << current_thread_idx << ":\n"
-                    << locker->to_string() << std::endl;
-            }
-            else if (next_rank == std::get<0>(virt)) {
-                auto locker = detail::
-                    add_concurrent_locker_device<ccl_virtual_gpu_comm, group_id(), topology_type>(
-                        next_rank, 0, virt, devices, indexed_devices_for_current_thread);
-                out << "Added virtual locker by index: " << next_rank
-                    << ", for thread idx: " << current_thread_idx << ":\n"
-                    << locker->to_string() << std::endl;
-            }
-            else {
-                assert(false && "unknown device type");
-                std::ostringstream ss;
-                ss << out.rdbuf();
-                throw std::runtime_error(std::string(__FUNCTION__) +
-                                         " - unknown device type. Log:\n" + ss.str());
-            }
-        }
-
-        /*-S-
-        if (!find_in_current_process)
-        {
-            abort();
-        }*/
-        (void)find_in_current_process;
-        /*//if not find in process local threads - use IPC to find
-        if (!find_in_current_process and !ipc_comms.empty())
-        {
-            out << "Find IPC device\n";
-            bool find = false;
-            for (const auto& process_ipc_comms : ipc_comms)
-            {
-                indexed_device_container<ccl_ipc_gpu_comm>& curr_locker_map =
-                        std::get<ccl_ipc_gpu_comm::type_idx()>(*indexed_devices_for_current_thread);
-                auto ipc_it = process_ipc_comms.second.find(next_rank);
-                if(ipc_it == process_ipc_comms.second.end())
-                {
-                    out << "skip process index: " << process_ipc_comms.first << std::endl;
-                    continue;
-                }
-                find = true;
-                out << "Lock IPC ring for threads (" << current_thread_idx << " <-> xxx\")" << std::endl;
-                const auto& comm_addr = ipc_it->second->template get_comm_data<type(), topology_type>();
-                curr_locker_map.insert({comm_addr.rank, ipc_it->second});
-                out << "Added locker for thread idx: " << current_thread_idx  <<":\n" << ipc_it->second->to_string() << std::endl;
-            }
-            if (!find)
-            {
-                std::stringstream ss;
-                ss << out.rdbuf();
-                std::cerr << "Cannot find IPC deice by rank: " << next_rank << "\nPrevious log:\n" << ss.str() <<"\nAbort Program" << std::endl;
-                abort();
-            }
-            //upgrade left gpu device to IPC SOURCE type
-            if (!ipc_comms.empty()/ *has another IPC Device* / and current_thread_idx == 0 / * left comm is IPC comm for last process* / )
-            {
-                const auto& real = detail::get_device_with_max_rank<ccl_gpu_comm, type(), topology_type>(*indexed_devices_for_current_thread, id_ring);
-                const auto& virt = detail::get_device_with_max_rank<ccl_virtual_gpu_comm, type(), topology_type>(*indexed_devices_for_current_thread, id_ring);
-                size_t left_ipc_source_rank = std::min({std::get<0>(real), std::get<0>(virt)});
-                out << "Upgrade thread id: " << current_thread_idx
-                    << " GPU by rank: " << left_ipc_source_rank
-                    << " to IPC SOURCE GPU" << std::endl;
-                if(left_ipc_source_rank == std::get<0>(real))
-                {
-                    auto locker =
-                            detail::add_ipc_source_locker_device<ccl_gpu_comm,
-                                                                  type(), topology_type>(next_rank,
-                                                                                 0,
-                                                                                 real,
-                                                                                 devices,
-                                                                                 *indexed_devices_for_current_thread);
-                    out << "Upgrage REAL to IPC_REAL_SOURCE locker by rank: " << next_rank
-                        << ", for thread idx: " << current_thread_idx  <<":\n"
-                        << locker->to_string() << std::endl;
-                }
-                else if (left_ipc_source_rank == std::get<0>(virt))
-                {
-                    auto locker =
-                            detail::add_ipc_source_locker_device<ccl_virtual_gpu_comm,
-                                                                  type(), topology_type>(next_rank,
-                                                                                 0,
-                                                                                 virt,
-                                                                                 devices,
-                                                                                 *indexed_devices_for_current_thread);
-                    out << "Upgrage VIRTUAL to IPC_VIRT_SOURCE locker by rank: " << next_rank
-                        << ", for thread idx: " << current_thread_idx  <<":\n"
-                        << locker->to_string() << std::endl;
-                }
-            }
-        }
-        */
-    }
-
-    {
-        //print topology
-        for (size_t current_thread_idx = 0; current_thread_idx < ctx_per_thread_data.size();
-             current_thread_idx++) {
-            const auto& indexed_devices_for_current_thread =
-                context.get_process_topology<topology_type>(process_index, current_thread_idx)
-                    .get_topology()
-                    ->get_device_storage();
-
-            detail::printer<group_id(), topology_type> p;
-            ccl_tuple_for_each(indexed_devices_for_current_thread, p);
-            std::stringstream ss;
-            ss << "Builder result for devices in thread idx (" << current_thread_idx << "/"
-               << ctx_per_thread_data.size() << "):\n"
-               << p.to_string() << std::endl;
-            const std::string& str = ss.str();
-            LOG_DEBUG(str);
-            out << str;
-        }
-    }
-    return true;
-}
-
-bool allied_process_group_ring_topology::build_specific_scale_up(
-    std::ostream& out,
-    const ccl::process_device_indices_type& per_thread_device_indices,
-    const ccl::process_device_indices_type& ipc_device_indices,
-    detail::colored_plain_graph_list& graph_list,
-    const std::map<size_t, size_t>& process_device_rank_offset) {
-    constexpr ccl::device_topology_type class_id = ccl::device_topology_type::ring;
-
-    out << "Start building topology: " << ::to_string(group_id())
-        << ", for colored graphs: " << graph_list.size() << "\n"
-        << detail::to_string(graph_list) << std::endl;
-
-    auto& ctx_per_thread_data = context.process_device_topology;
-    out << "\nStart gpu comm transformation scaling role for graph list count: "
-        << graph_list.size() << std::endl;
-    std::set<ccl::device_index_type> created_scaleup_indices;
-
-    // allocate IPC devices pool(by demand)
-    detail::cluster_ipc_devices_pool ipc_comms;
-    size_t ring_index = 0;
-
-    // let's start scaling devices search & creation
-    for (auto id_ring_it = graph_list.begin(); id_ring_it != graph_list.end(); ++id_ring_it) {
-        const auto& id_ring = *id_ring_it;
-        for (const auto& per_thread : per_thread_device_indices) {
-            size_t thread_id = per_thread.first;
-            std::shared_ptr<specific_plain_device_storage> non_indexed_plain_devices =
-                devices.thread_gpu_comms.find(thread_id)->second;
-
-            // create device comm wrappers
-            // 1) upgrade last devices in list up to scaling proxy type: numa
-            auto last_graph_item = id_ring.rbegin();
-            for (; last_graph_item != id_ring.rend(); ++last_graph_item) {
-                detail::color_t process = last_graph_item->color;
-                ccl::device_index_type last_in_graph_index = last_graph_item->index;
-
-                if (process != process_index) {
-                    out << "thread: " << thread_id
-                        << " detect device wit foreign color: " << *last_graph_item << std::endl;
-                    continue;
-                }
-                if (per_thread.second.find(last_in_graph_index) != per_thread.second.end()) {
-                    out << "thread: " << thread_id
-                        << " wants to create scaling device by idx: " << last_in_graph_index
-                        << std::endl;
-                    if (created_scaleup_indices.find(last_in_graph_index) !=
-                        created_scaleup_indices.end()) {
-                        out << "skip existing scaling device candidate by: " << last_in_graph_index
-                            << std::endl;
-                        continue;
-                    }
-
-                    size_t inserted_device_type_index = detail::role_mod::inject_numa_device<
-                        group_id(),
-                        class_id,
-                        process_group_context,
-                        ccl_virtual_gpu_comm, /* `virtual` is better candiate */
-                        ccl_gpu_comm>(
-                        *non_indexed_plain_devices, last_in_graph_index, context, devices);
-                    if (inserted_device_type_index == std::numeric_limits<size_t>::max()) {
-                        assert(false && "Unsupported device type in topology creation");
-                        std::ostringstream ss;
-                        ss << out.rdbuf();
-                        throw std::runtime_error(
-                            std::string("Unsupported device type in topology creation. Log:\n") +
-                            ss.str());
-                    }
-
-                    out << "Inject numa device by order: " << inserted_device_type_index
-                        << "\nby idx: " << last_in_graph_index << std::endl;
-                    created_scaleup_indices.insert(last_in_graph_index);
-
-                    break;
-                }
-            }
-
-            //2) create IPC wrappers
-            //TODO THE last id_ring from graph_list AND the last thread should process id_ring for IPC device creation here
-            //BUT we cannot determine 'last' ring for 'last thread' here in pretty way.
-            //So We need to extend 'color' by process_id and thread_id together instead process_id single one
-            if (std::next(id_ring_it, 1) == graph_list.end()) {
-                if (thread_id ==
-                    ctx_per_thread_data.size() - 1) //TODO only final thread owns IPC devies at now
-                {
-                    ipc_comms =
-                        detail::create_filtered_ipc_destination_gpu_comms<group_id(), class_id>(
-                            *id_ring_it,
-                            ipc_device_indices,
-                            process_index,
-                            process_count,
-                            context,
-                            devices,
-                            *non_indexed_plain_devices);
-                }
-            }
-        }
-    }
-
-    // id_ring - inter-thread ring
-    out << "\nStart indexer:" << std::endl;
-    size_t accumulated_index_offset_for_graph = 0;
-    size_t graph_num = 0;
-    std::map<size_t /*graph_num*/, size_t /*offset*/> index_offset_for_graphs;
-    auto offset_it = process_device_rank_offset.find(process_index);
-    if (offset_it == process_device_rank_offset.end()) {
-        assert(false && "");
-    }
-
-    accumulated_index_offset_for_graph = offset_it->second;
-    auto topology_comm_addr = ctx_comm_addr;
-    topology_comm_addr.comm_size = device_cluster_size;
-
-    out << "global rank offset: " << accumulated_index_offset_for_graph << std::endl;
-
-    for (auto& id_ring : graph_list) {
-        auto local_proc_ring_start =
-            std::find_if(id_ring.begin(), id_ring.end(), [this](native::detail::colored_idx& val) {
-                //return (val.color != process_count && val.color != native::detail::marked_color); / / first not terminator index
-                return val.color == process_index;
-            });
-
-        if (local_proc_ring_start == id_ring.end()) {
-            out << "graph fully processes: " << detail::to_string(id_ring) << ", take next"
-                << std::endl;
-            continue;
-        }
-
-        size_t index_offset = accumulated_index_offset_for_graph;
-        for (auto per_thread_it = ctx_per_thread_data.begin();
-             per_thread_it != ctx_per_thread_data.end();
-             ++per_thread_it) {
-            size_t thread_id = per_thread_it->first; //first
-
-            /** Initialize empty context**/
-            std::shared_ptr<device_community<class_id>> out_indexed_devices;
-            if (graph_list.size() == 1) {
-                if (context.get_process_topology<class_id>(process_index, thread_id)
-                        .closed_rings.empty()) {
-                    context.get_process_topology<class_id>(process_index, thread_id)
-                        .set_topology(
-                            std::make_shared<device_community<class_id>>(topology_comm_addr));
-                }
-
-                out_indexed_devices =
-                    context.get_process_topology<class_id>(process_index, thread_id)
-                        .get_topology(ring_index);
-            }
-            else {
-                if (context.get_process_topology<class_id>(process_index, thread_id)
-                        .torn_apart_rings.empty()) {
-                    context.get_process_topology<class_id>(process_index, thread_id)
-                        .set_additiona_topology(
-                            std::make_shared<device_community<class_id>>(topology_comm_addr));
-                }
-
-                out_indexed_devices =
-                    context.get_process_topology<class_id>(process_index, thread_id)
-                        .get_additiona_topology(ring_index);
-            }
-
-            out << "\nStart indexer for graph num: " << graph_num << ", thread: " << thread_id
-                << ", index offset: " << index_offset << std::endl;
-            std::shared_ptr<specific_plain_device_storage> non_indexed_plain_devices =
-                devices.thread_gpu_comms.find(thread_id)->second;
-
-            auto rank_builder = create_device_functor<
-                detail::smart_ring_indexer<group_id(), class_id, process_group_context>>(
-                id_ring,
-                process_index,
-                process_count,
-                index_offset,
-                0,
-                devices,
-                out_indexed_devices->get_device_storage(),
-                ipc_device_indices,
-                ccl::process_device_indices_type{},
-                local_proc_ring_start,
-                context);
-
-            ccl_tuple_for_each(*non_indexed_plain_devices, rank_builder);
-
-            detail::printer<group_id(), class_id> p;
-            ccl_tuple_for_each(out_indexed_devices->get_device_storage(), p);
-            out << "Indexer result for devices in thread idx (" << thread_id << "/"
-                << ctx_per_thread_data.size() << "):\n"
-                << p.to_string() << std::endl;
-
-            accumulated_index_offset_for_graph +=
-                rank_builder.get_functor().get_marked_indices_count();
-            out << "\nIndexer for graph num: " << graph_num
-                << ", finished. imarked_indices: " << accumulated_index_offset_for_graph << "\n";
-        }
-        index_offset_for_graphs[graph_num] = index_offset;
-        graph_num++;
-    }
-
-    out << "Created IPC devices for processes: " << ipc_comms.size()
-        << ", for cluster_size: " << device_cluster_size
-        << ", with device_cluster_rank_offset: " << device_cluster_rank_offset << "\n";
-    for (const auto& process_ipc : ipc_comms) {
-        out << "prx: " << process_ipc.first << std::endl;
-        for (const auto& ipc : process_ipc.second) {
-            out << "{ rank: " << ipc.first << ", comm: " << ipc.second->to_string() << "}\n";
-        }
-        out << std::endl;
-    }
-
-    out << "\nStart ring builder for graphs count: " << graph_list.size() << std::endl;
-    graph_num = 0;
-    for (const auto& id_ring : graph_list) {
-        out << "\nStart ring builder for graph num: " << graph_num << std::endl;
-        for (size_t current_thread_idx = 0; current_thread_idx < ctx_per_thread_data.size();
-             current_thread_idx++) {
-            // find max rank in current thread device list
-            std::shared_ptr<device_community<class_id>> indexed_topology;
-            if (graph_list.size() == 1) {
-                indexed_topology =
-                    context.get_process_topology<class_id>(process_index, current_thread_idx)
-                        .get_topology(ring_index);
-            }
-            else {
-                indexed_topology =
-                    context.get_process_topology<class_id>(process_index, current_thread_idx)
-                        .get_additiona_topology(ring_index);
-            }
-
-            auto& indexed_devices_for_current_thread = indexed_topology->get_device_storage();
-            const auto& curr_real =
-                detail::get_device_with_min_rank<ccl_gpu_comm, group_id(), class_id>(
-                    indexed_devices_for_current_thread, id_ring);
-            const auto& curr_virt =
-                detail::get_device_with_min_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
-                    indexed_devices_for_current_thread, id_ring);
-            const auto& curr_scale_real = detail::
-                get_device_with_min_rank<ccl_numa_proxy<ccl_gpu_comm>, group_id(), class_id>(
-                    indexed_devices_for_current_thread, id_ring);
-            const auto& curr_scale_virt =
-                detail::get_device_with_min_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>,
-                                                 group_id(),
-                                                 class_id>(indexed_devices_for_current_thread,
-                                                           id_ring);
-
-            size_t tg_max_rank = std::max({ std::get<0>(curr_real),
-                                            std::get<0>(curr_virt),
-                                            std::get<0>(curr_scale_real),
-                                            std::get<0>(curr_scale_virt) });
-
-            // find thread, which will connect to current thread max rank with next_rank
-            size_t next_rank = (tg_max_rank + 1) % id_ring.size();
-            out << "Current thread: " << current_thread_idx
-                << ", max rank candidates: " << std::get<0>(curr_real) << ", "
-                << std::get<0>(curr_virt) << ", " << std::get<0>(curr_scale_real) << ", "
-                << std::get<0>(curr_scale_virt) << ", selected max rank: " << tg_max_rank
-                << ", expected next_rank: " << next_rank << std::endl;
-
-            //Find in local threads at first
-            for (size_t next_thread_id = 0; next_thread_id < ctx_per_thread_data.size();
-                 next_thread_id++) {
-                if (next_thread_id == current_thread_idx) {
-                    // wrong thread, get next
-                    continue;
-                }
-
-                // search next_rank in that thread
-                std::shared_ptr<device_community<class_id>> next_indexed_topology;
-                if (graph_list.size() == 1) {
-                    next_indexed_topology =
-                        context.get_process_topology<class_id>(process_index, next_thread_id)
-                            .get_topology(ring_index);
-                }
-                else {
-                    next_indexed_topology =
-                        context.get_process_topology<class_id>(process_index, next_thread_id)
-                            .get_additiona_topology(ring_index);
-                }
-
-                auto& next_thread_ring_topology = next_indexed_topology->get_device_storage();
-                const auto& real =
-                    detail::get_device_with_max_rank<ccl_gpu_comm, group_id(), class_id>(
-                        next_thread_ring_topology, id_ring);
-                const auto& virt =
-                    detail::get_device_with_max_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
-                        next_thread_ring_topology, id_ring);
-                const auto& scale_real =
-                    detail::get_device_with_max_rank<ccl_numa_proxy<ccl_gpu_comm>,
-                                                     group_id(),
-                                                     class_id>(next_thread_ring_topology, id_ring);
-                const auto& scale_virt =
-                    detail::get_device_with_max_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>,
-                                                     group_id(),
-                                                     class_id>(next_thread_ring_topology, id_ring);
-                if (next_rank != std::min({ std::get<0>(real),
-                                            std::get<0>(virt),
-                                            std::get<0>(scale_real),
-                                            std::get<0>(scale_virt) })) {
-                    // wrong thread, get next
-                    continue;
-                }
-
-                out << "next thread: " << next_thread_id
-                    << ", min rank candidates: " << std::get<0>(real) << ", " << std::get<0>(virt)
-                    << ", " << std::get<0>(scale_real) << ", " << std::get<0>(scale_virt)
-                    << std::endl;
-
-                out << "Lock ring for threads (" << current_thread_idx << " <-> " << next_thread_id
-                    << ")" << std::endl;
-
-                if (next_rank == std::get<0>(real)) {
-                    auto locker =
-                        detail::add_concurrent_locker_device<ccl_gpu_comm, group_id(), class_id>(
-                            next_rank, 0, real, devices, indexed_devices_for_current_thread);
-                    out << "Added real locker by index: " << next_rank
-                        << ", for thread idx: " << current_thread_idx << ":\n"
-                        << locker->to_string() << std::endl;
-                }
-                else if (next_rank == std::get<0>(virt)) {
-                    auto locker = detail::
-                        add_concurrent_locker_device<ccl_virtual_gpu_comm, group_id(), class_id>(
-                            next_rank, 0, virt, devices, indexed_devices_for_current_thread);
-                    out << "Added virtual locker by index: " << next_rank
-                        << ", for thread idx: " << current_thread_idx << ":\n"
-                        << locker->to_string() << std::endl;
-                }
-                else if (next_rank == std::get<0>(scale_real)) {
-                    out << "No need to add concurrent proxy for next thread: " << next_thread_id
-                        << " for scaleup  real proxy in current thread: " << current_thread_idx
-                        << std::endl;
-                }
-                else if (next_rank == std::get<0>(scale_virt)) {
-                    out << "No need to add concurrent proxy for next thread: " << next_thread_id
-                        << " for scaleup virtual proxy in current thread: " << current_thread_idx
-                        << std::endl;
-                }
-                /*else
-                {
-                    assert(false && "unknown device type");
-                    std::ostringstream ss;
-                    ss << out.rdbuf();
-                    throw std::runtime_error(std::string(__FUNCTION__) + " - unknown device type. Log:\n" +
-                                             ss.str());
-                }*/
-            }
-
-            //if not find in process local threads - use IPC to find
-            /*if (!find_in_current_process and !ipc_comms.empty())
-            {
-                out << "Find IPC device\n";
-                bool find = false;
-                for (const auto& process_ipc_comms : ipc_comms)
-                {
-                    indexed_device_container<ccl_ipc_gpu_comm>& curr_locker_map =
-                            std::get<ccl_ipc_gpu_comm::type_idx()>(indexed_devices_for_current_thread);
-
-                    auto ipc_it = process_ipc_comms.second.find(next_rank);
-                    if(ipc_it == process_ipc_comms.second.end())
-                    {
-                        out << "skip process index: " << process_ipc_comms.first << std::endl;
-                        continue;
-                    }
-                    find = true;
-                    out << "Lock IPC ring for threads (" << current_thread_idx << " <-> xxx\")" << std::endl;
-                    const auto& comm_addr = ipc_it->second->template get_comm_data<type(), group_id()>();
-                    curr_locker_map.insert({comm_addr.rank, ipc_it->second});
-                    out << "Added locker for thread idx: " << current_thread_idx  <<":\n" << ipc_it->second->to_string() << std::endl;
-                }
-
-                if (!find)
-                {
-                    std::stringstream ss;
-                    ss << out.rdbuf();
-                    std::cerr << "Cannot find IPC deice by rank: " << next_rank << "\nPrevious log:\n" << ss.str() <<"\nAbort Program" << std::endl;
-                    abort();
-                }
-
-                //upgrade left gpu device to IPC SOURCE type
-                if ( current_thread_idx == 0 / * left comm is IPC comm for last process* / )
-                {
-                    const auto& real = detail::get_device_with_max_rank<ccl_gpu_comm, type(), group_id()>(indexed_devices_for_current_thread, id_ring);
-                    const auto& virt = detail::get_device_with_max_rank<ccl_virtual_gpu_comm, type(), group_id()>(indexed_devices_for_current_thread, id_ring);
-
-                    size_t left_ipc_source_rank = std::min({std::get<0>(real), std::get<0>(virt)});
-                    out << "Upgrade thread id: " << current_thread_idx
-                        << " GPU by rank: " << left_ipc_source_rank
-                        << " to IPC SOURCE GPU" << std::endl;
-
-                    if(left_ipc_source_rank == std::get<0>(real))
-                    {
-                        auto locker =
-                                    detail::add_ipc_source_locker_device<ccl_gpu_comm,
-                                                                        type(), group_id()>(next_rank,
-                                                                                   0,
-                                                                                   real,
-                                                                                   devices,indexed_devices_for_current_thread);
-                        out << "Upgrage REAL to IPC_REAL_SOURCE locker by rank: " << next_rank
-                            << ", for thread idx: " << current_thread_idx  <<":\n"
-                            << locker->to_string() << std::endl;
-                    }
-                    else if (left_ipc_source_rank == std::get<0>(virt))
-                    {
-                        auto locker =
-                                detail::add_ipc_source_locker_device<ccl_virtual_gpu_comm,
-                                                                  type(), group_id()>(next_rank,
-                                                                                 0,
-                                                                                 virt,
-                                                                                 devices,indexed_devices_for_current_thread);
-                        out << "Upgrage VIRTUAL to IPC_VIRT_SOURCE locker by rank: " << next_rank
-                            << ", for thread idx: " << current_thread_idx  <<":\n"
-                            << locker->to_string() << std::endl;
-                    }
-                }
-            }
-            */
-        }
-        graph_num++;
-    }
-
-    {
-        //print topology
-        for (size_t current_thread_idx = 0; current_thread_idx < ctx_per_thread_data.size();
-             current_thread_idx++) {
-            std::shared_ptr<device_community<class_id>> indexed_topology;
-            if (graph_list.size() == 1) {
-                indexed_topology =
-                    context.get_process_topology<class_id>(process_index, current_thread_idx)
-                        .get_topology(ring_index);
-            }
-            else {
-                indexed_topology =
-                    context.get_process_topology<class_id>(process_index, current_thread_idx)
-                        .get_additiona_topology(ring_index);
-            }
-
-            detail::printer<group_id(), class_id> p;
-            ccl_tuple_for_each(indexed_topology->get_device_storage(), p);
-            std::stringstream ss;
-            ss << "Builder result for devices in thread idx (" << current_thread_idx << "/"
-               << ctx_per_thread_data.size() << "):\n"
-               << p.to_string() << std::endl;
-            const std::string& str = ss.str();
-            LOG_DEBUG(str);
-            out << str;
-        }
-    }
-    return true;
-}
-
-bool allied_process_group_ring_topology::build_specific_scale_out_only(
-    std::ostream& out,
-    const ccl::process_device_indices_type& per_thread_device_indices,
-    const ccl::process_device_indices_type& scaleout_device_indices,
-    detail::colored_plain_graph_list& graph_list,
-    const std::map<size_t, size_t>& process_device_rank_offset) {
-    constexpr ccl::device_topology_type class_id = ccl::device_topology_type::ring;
-
-    out << "Start building topology: " << ::to_string(group_id())
-        << ", for colored graphs: " << graph_list.size() << "\n"
-        << detail::to_string(graph_list) << std::endl;
-
-    auto& ctx_per_thread_data = context.process_device_topology;
-    out << "\nStart gpu comm transformation scaling role for graph list count: "
-        << graph_list.size() << std::endl;
-    std::set<ccl::device_index_type> created_numa_indices;
-    size_t ring_index = 0;
-
-    // let's start scaling devices search & creation
-    for (auto id_ring_it = graph_list.begin(); id_ring_it != graph_list.end(); ++id_ring_it) {
-        const auto& id_ring = *id_ring_it;
-        for (const auto& per_thread : per_thread_device_indices) {
-            size_t thread_id = per_thread.first;
-            std::shared_ptr<specific_plain_device_storage> non_indexed_plain_devices =
-                devices.thread_gpu_comms.find(thread_id)->second;
-
-            // create device comm wrappers
-            // 1) upgrade last devices in list up to scaling proxy type: numa
-            if (graph_list.size() == 1) {
-                //no numa for single graph
-                break;
-            }
-
-            auto last_graph_item = id_ring.rbegin();
-            for (; last_graph_item != id_ring.rend(); ++last_graph_item) {
-                detail::color_t process = last_graph_item->color;
-                ccl::device_index_type last_in_graph_index = last_graph_item->index;
-
-                if (process != process_index) {
-                    out << "thread: " << thread_id
-                        << " detect device wit foreign color: " << *last_graph_item << std::endl;
-                    continue;
-                }
-                if (per_thread.second.find(last_in_graph_index) != per_thread.second.end()) {
-                    out << "thread: " << thread_id
-                        << " wants to create scaling device by idx: " << last_in_graph_index
-                        << std::endl;
-                    if (created_numa_indices.find(last_in_graph_index) !=
-                        created_numa_indices.end()) {
-                        out << "skip existing scaling device candidate by: " << last_in_graph_index
-                            << std::endl;
-                        continue;
-                    }
-
-                    size_t inserted_device_type_index = detail::role_mod::inject_numa_device<
-                        group_id(),
-                        class_id,
-                        process_group_context,
-                        ccl_virtual_gpu_comm, /* `virtual` is better candiate */
-                        ccl_gpu_comm>(
-                        *non_indexed_plain_devices, last_in_graph_index, context, devices);
-                    if (inserted_device_type_index == std::numeric_limits<size_t>::max()) {
-                        assert(false && "Unsupported device type in topology creation");
-                        std::ostringstream ss;
-                        ss << out.rdbuf();
-                        throw std::runtime_error(
-                            std::string("Unsupported device type in topology creation. Log:\n") +
-                            ss.str());
-                    }
-
-                    out << "Inject numa device by order: " << inserted_device_type_index
-                        << "\nby idx: " << last_in_graph_index << std::endl;
-                    created_numa_indices.insert(last_in_graph_index);
-
-                    break;
-                }
-            }
-
-            //TODO No IPC devices
-        }
-    }
-
-    // id_ring - inter-thread ring
-    out << "\nStart indexer:" << std::endl;
-    size_t accumulated_index_offset_for_graph = 0;
-    size_t graph_num = 0;
-    std::map<size_t /*graph_num*/, size_t /*offset*/> index_offset_for_graphs;
-    auto offset_it = process_device_rank_offset.find(process_index);
-    if (offset_it == process_device_rank_offset.end()) {
-        assert(false && "");
-    }
-
-    accumulated_index_offset_for_graph = offset_it->second;
-    auto topology_comm_addr = ctx_comm_addr;
-    topology_comm_addr.comm_size = device_cluster_size;
-
-    out << "global rank offset: " << accumulated_index_offset_for_graph << std::endl;
-
-    for (auto id_ring_it = graph_list.begin(); id_ring_it != graph_list.end(); ++id_ring_it) {
-        auto& id_ring = *id_ring_it;
-        auto local_proc_ring_start = std::find_if(
-            id_ring.begin(), id_ring.end(), [this](const native::detail::colored_idx& val) {
-                //return (val.color != process_count && val.color != native::detail::marked_color); / / first not terminator index
-                return val.color == process_index;
-            });
-
-        if (local_proc_ring_start == id_ring.end()) {
-            out << "graph fully processes: " << detail::to_string(id_ring) << ", take next"
-                << std::endl;
-            continue;
-        }
-
-        size_t index_offset = accumulated_index_offset_for_graph;
-        for (auto per_thread_it = ctx_per_thread_data.begin();
-             per_thread_it != ctx_per_thread_data.end();
-             ++per_thread_it) {
-            size_t thread_id = per_thread_it->first; //first
-
-            /** Initialize empty context**/
-            std::shared_ptr<device_community<class_id>> out_indexed_devices;
-            if (context.get_process_topology<class_id>(process_index, thread_id)
-                    .torn_apart_rings.empty()) {
-                context.get_process_topology<class_id>(process_index, thread_id)
-                    .set_additiona_topology(
-                        std::make_shared<device_community<class_id>>(topology_comm_addr));
-            }
-
-            out_indexed_devices = context.get_process_topology<class_id>(process_index, thread_id)
-                                      .get_additiona_topology(ring_index);
-
-            out << "\nStart indexer for graph num: " << graph_num << ", thread: " << thread_id
-                << ", index offset: " << index_offset << std::endl;
-            std::shared_ptr<specific_plain_device_storage> non_indexed_plain_devices =
-                devices.thread_gpu_comms.find(thread_id)->second;
-
-            auto rank_builder = create_device_functor<
-                detail::smart_ring_indexer<group_id(), class_id, process_group_context>>(
-                id_ring,
-                process_index,
-                process_count,
-                index_offset,
-                0,
-                devices,
-                out_indexed_devices->get_device_storage(),
-                ccl::process_device_indices_type{},
-                ccl::process_device_indices_type{},
-                local_proc_ring_start,
-                context);
-
-            ccl_tuple_for_each(*non_indexed_plain_devices, rank_builder);
-
-            // Inject Scale out devices for the last thread
-            if (std::next(id_ring_it, 1) == graph_list.end()) {
-                if (thread_id ==
-                    ctx_per_thread_data.size() - 1) //TODO only final thread owns IPC devies at now
-                {
-                    size_t inserted_device_type_index = detail::role_mod::inject_scaleout_device<
-                        group_id(),
-                        class_id,
-                        process_group_context,
-                        ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_gpu_comm>>,
-                        ccl_gpu_scaleup_proxy<ccl_numa_proxy<ccl_virtual_gpu_comm>>,
-                        ccl_gpu_scaleup_proxy<ccl_gpu_comm>,
-                        ccl_gpu_scaleup_proxy<ccl_virtual_gpu_comm>,
-                        ccl_numa_proxy<ccl_gpu_comm>,
-                        ccl_numa_proxy<ccl_virtual_gpu_comm>,
-                        ccl_virtual_gpu_comm,
-                        ccl_gpu_comm>(out_indexed_devices->get_device_storage(),
-                                      id_ring_it->begin()->index,
-                                      context,
-                                      devices);
-                    if (inserted_device_type_index != std::numeric_limits<size_t>::max()) {
-                        out << "Inject scaleout device by order: " << inserted_device_type_index
-                            << "\nby idx: " << id_ring_it->begin()->to_string() << std::endl;
-                    }
-                }
-            }
-
-            detail::printer<group_id(), class_id> p;
-            ccl_tuple_for_each(out_indexed_devices->get_device_storage(), p);
-            out << "Indexer result for devices in thread idx (" << thread_id << "/"
-                << ctx_per_thread_data.size() << "):\n"
-                << p.to_string() << std::endl;
-
-            accumulated_index_offset_for_graph +=
-                rank_builder.get_functor().get_marked_indices_count();
-            out << "\nIndexer for graph num: " << graph_num
-                << ", finished. imarked_indices: " << accumulated_index_offset_for_graph << "\n";
-        }
-        index_offset_for_graphs[graph_num] = index_offset;
-        graph_num++;
-    }
-
-    out << "\nStart ring builder for graphs count: " << graph_list.size() << std::endl;
-    graph_num = 0;
-    for (const auto& id_ring : graph_list) {
-        out << "\nStart ring builder for graph num: " << graph_num << std::endl;
-        for (size_t current_thread_idx = 0; current_thread_idx < ctx_per_thread_data.size();
-             current_thread_idx++) {
-            // find max rank in current thread device list
-            std::shared_ptr<device_community<class_id>> indexed_topology;
-            indexed_topology =
-                context.get_process_topology<class_id>(process_index, current_thread_idx)
-                    .get_additiona_topology(ring_index);
-
-            auto& indexed_devices_for_current_thread = indexed_topology->get_device_storage();
-            const auto& curr_real =
-                detail::get_device_with_min_rank<ccl_gpu_comm, group_id(), class_id>(
-                    indexed_devices_for_current_thread, id_ring);
-            const auto& curr_virt =
-                detail::get_device_with_min_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
-                    indexed_devices_for_current_thread, id_ring);
-            const auto& curr_scale_real = detail::
-                get_device_with_min_rank<ccl_numa_proxy<ccl_gpu_comm>, group_id(), class_id>(
-                    indexed_devices_for_current_thread, id_ring);
-            const auto& curr_scale_virt =
-                detail::get_device_with_min_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>,
-                                                 group_id(),
-                                                 class_id>(indexed_devices_for_current_thread,
-                                                           id_ring);
-
-            size_t tg_max_rank = std::max({ std::get<0>(curr_real),
-                                            std::get<0>(curr_virt),
-                                            std::get<0>(curr_scale_real),
-                                            std::get<0>(curr_scale_virt) });
-
-            // find thread, which will connect to current thread max rank with next_rank
-            size_t next_rank = (tg_max_rank + 1) % id_ring.size();
-            out << "Current thread: " << current_thread_idx
-                << ", max rank candidates: " << std::get<0>(curr_real) << ", "
-                << std::get<0>(curr_virt) << ", " << std::get<0>(curr_scale_real) << ", "
-                << std::get<0>(curr_scale_virt) << ", selected max rank: " << tg_max_rank
-                << ", expected next_rank: " << next_rank << std::endl;
-
-            //Find in local threads at first
-            for (size_t next_thread_id = 0; next_thread_id < ctx_per_thread_data.size();
-                 next_thread_id++) {
-                if (next_thread_id == current_thread_idx) {
-                    // wrong thread, get next
-                    continue;
-                }
-
-                // search next_rank in that thread
-                std::shared_ptr<device_community<class_id>> next_indexed_topology;
-                next_indexed_topology =
-                    context.get_process_topology<class_id>(process_index, next_thread_id)
-                        .get_additiona_topology(ring_index);
-
-                auto& next_thread_ring_topology = next_indexed_topology->get_device_storage();
-                const auto& real =
-                    detail::get_device_with_max_rank<ccl_gpu_comm, group_id(), class_id>(
-                        next_thread_ring_topology, id_ring);
-                const auto& virt =
-                    detail::get_device_with_max_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
-                        next_thread_ring_topology, id_ring);
-                const auto& scale_real =
-                    detail::get_device_with_max_rank<ccl_numa_proxy<ccl_gpu_comm>,
-                                                     group_id(),
-                                                     class_id>(next_thread_ring_topology, id_ring);
-                const auto& scale_virt =
-                    detail::get_device_with_max_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>,
-                                                     group_id(),
-                                                     class_id>(next_thread_ring_topology, id_ring);
-                if (next_rank != std::min({ std::get<0>(real),
-                                            std::get<0>(virt),
-                                            std::get<0>(scale_real),
-                                            std::get<0>(scale_virt) })) {
-                    // wrong thread, get next
-                    continue;
-                }
-
-                out << "next thread: " << next_thread_id
-                    << ", min rank candidates: " << std::get<0>(real) << ", " << std::get<0>(virt)
-                    << ", " << std::get<0>(scale_real) << ", " << std::get<0>(scale_virt)
-                    << std::endl;
-
-                out << "Lock ring for threads (" << current_thread_idx << " <-> " << next_thread_id
-                    << ")" << std::endl;
-
-                if (next_rank == std::get<0>(real)) {
-                    auto locker =
-                        detail::add_concurrent_locker_device<ccl_gpu_comm, group_id(), class_id>(
-                            next_rank, 0, real, devices, indexed_devices_for_current_thread);
-                    out << "Added real locker by index: " << next_rank
-                        << ", for thread idx: " << current_thread_idx << ":\n"
-                        << locker->to_string() << std::endl;
-                }
-                else if (next_rank == std::get<0>(virt)) {
-                    auto locker = detail::
-                        add_concurrent_locker_device<ccl_virtual_gpu_comm, group_id(), class_id>(
-                            next_rank, 0, virt, devices, indexed_devices_for_current_thread);
-                    out << "Added virtual locker by index: " << next_rank
-                        << ", for thread idx: " << current_thread_idx << ":\n"
-                        << locker->to_string() << std::endl;
-                }
-                else if (next_rank == std::get<0>(scale_real)) {
-                    out << "No need to add concurrent proxy for next thread: " << next_thread_id
-                        << " for scaleup  real proxy in current thread: " << current_thread_idx
-                        << std::endl;
-                }
-                else if (next_rank == std::get<0>(scale_virt)) {
-                    out << "No need to add concurrent proxy for next thread: " << next_thread_id
-                        << " for scaleup virtual proxy in current thread: " << current_thread_idx
-                        << std::endl;
-                }
-                /*else
-                {
-                    assert(false && "unknown device type");
-                    std::ostringstream ss;
-                    ss << out.rdbuf();
-                    throw std::runtime_error(std::string(__FUNCTION__) + " - unknown device type. Log:\n" +
-                                             ss.str());
-                }*/
-            }
-
-            //if not find in process local threads - use IPC to find
-            /*if (!find_in_current_process and !ipc_comms.empty())
-            {
-                out << "Find IPC device\n";
-                bool find = false;
-                for (const auto& process_ipc_comms : ipc_comms)
-                {
-                    indexed_device_container<ccl_ipc_gpu_comm>& curr_locker_map =
-                            std::get<ccl_ipc_gpu_comm::type_idx()>(indexed_devices_for_current_thread);
-
-                    auto ipc_it = process_ipc_comms.second.find(next_rank);
-                    if(ipc_it == process_ipc_comms.second.end())
-                    {
-                        out << "skip process index: " << process_ipc_comms.first << std::endl;
-                        continue;
-                    }
-                    find = true;
-                    out << "Lock IPC ring for threads (" << current_thread_idx << " <-> xxx\")" << std::endl;
-                    const auto& comm_addr = ipc_it->second->template get_comm_data<type(), group_id()>();
-                    curr_locker_map.insert({comm_addr.rank, ipc_it->second});
-                    out << "Added locker for thread idx: " << current_thread_idx  <<":\n" << ipc_it->second->to_string() << std::endl;
-                }
-
-                if (!find)
-                {
-                    std::stringstream ss;
-                    ss << out.rdbuf();
-                    std::cerr << "Cannot find IPC deice by rank: " << next_rank << "\nPrevious log:\n" << ss.str() <<"\nAbort Program" << std::endl;
-                    abort();
-                }
-
-                //upgrade left gpu device to IPC SOURCE type
-                if ( current_thread_idx == 0 / * left comm is IPC comm for last process* / )
-                {
-                    const auto& real = detail::get_device_with_max_rank<ccl_gpu_comm, type(), group_id()>(indexed_devices_for_current_thread, id_ring);
-                    const auto& virt = detail::get_device_with_max_rank<ccl_virtual_gpu_comm, type(), group_id()>(indexed_devices_for_current_thread, id_ring);
-
-                    size_t left_ipc_source_rank = std::min({std::get<0>(real), std::get<0>(virt)});
-                    out << "Upgrade thread id: " << current_thread_idx
-                        << " GPU by rank: " << left_ipc_source_rank
-                        << " to IPC SOURCE GPU" << std::endl;
-
-                    if(left_ipc_source_rank == std::get<0>(real))
-                    {
-                        auto locker =
-                                    detail::add_ipc_source_locker_device<ccl_gpu_comm,
-                                                                        type(), group_id()>(next_rank,
-                                                                                   0,
-                                                                                   real,
-                                                                                   devices,indexed_devices_for_current_thread);
-                        out << "Upgrage REAL to IPC_REAL_SOURCE locker by rank: " << next_rank
-                            << ", for thread idx: " << current_thread_idx  <<":\n"
-                            << locker->to_string() << std::endl;
-                    }
-                    else if (left_ipc_source_rank == std::get<0>(virt))
-                    {
-                        auto locker =
-                                detail::add_ipc_source_locker_device<ccl_virtual_gpu_comm,
-                                                                  type(), group_id()>(next_rank,
-                                                                                 0,
-                                                                                 virt,
-                                                                                 devices,indexed_devices_for_current_thread);
-                        out << "Upgrage VIRTUAL to IPC_VIRT_SOURCE locker by rank: " << next_rank
-                            << ", for thread idx: " << current_thread_idx  <<":\n"
-                            << locker->to_string() << std::endl;
-                    }
-                }
-            }
-            */
-        }
-        graph_num++;
-    }
-
-    {
-        //print topology
-        for (size_t current_thread_idx = 0; current_thread_idx < ctx_per_thread_data.size();
-             current_thread_idx++) {
-            std::shared_ptr<device_community<class_id>> indexed_topology;
-            indexed_topology =
-                context.get_process_topology<class_id>(process_index, current_thread_idx)
-                    .get_additiona_topology(ring_index);
-
-            detail::printer<group_id(), class_id> p;
-            ccl_tuple_for_each(indexed_topology->get_device_storage(), p);
-            std::stringstream ss;
-            ss << "Builder result for devices in thread idx (" << current_thread_idx << "/"
-               << ctx_per_thread_data.size() << "):\n"
-               << p.to_string() << std::endl;
-            const std::string& str = ss.str();
-            LOG_DEBUG(str);
-            out << str;
-        }
-    }
-    return true;
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if 0
-bool allied_process_group_ring_topology::build_specific(std::ostream& out,
-                                                        const ccl::process_device_indices_type& per_thread_device_indices,
-                                                        const detail::plain_graph_list& graph_list)
-{
-     constexpr ccl::group_split_type topology_type =
-                                        ccl::group_split_type::process_group_torn_apart_ring;
-
-    out << "Start building topology: " << ::to_string(topology_type)
-        << ", for graphs: " << graph_list.size() << "\n";
-    for (const auto& graph : graph_list)
-    {
-        out << "\n\t{";
-        for(const auto& id : graph)
-        {
-            out << id << ", ";
-        }
-        out << "},";
-    }
-
-    auto& ctx_per_thread_data = context.process_device_topology;
-    out << "\nStart gpu comm transformation scael-up for graph list count: "
-        << graph_list.size() << std::endl;
-    std::set<ccl::device_index_type> created_scaleup_indices;
-
-    // let's start scale-up devices search & creation
-    for (const auto& id_ring : graph_list)
-    {
-        for(const auto& per_thread : per_thread_device_indices)
-        {
-            size_t thread_id = per_thread.first;
-            std::shared_ptr<specific_plain_device_storage> non_indexed_plain_devices =
-                                                devices.thread_gpu_comms.find(thread_id)->second;
-            // create device comm wrappers and upgrade last devices in list up to scale_up_proxy type
-            const ccl::device_index_type& last_in_graph_index = *id_ring.rbegin();
-            if (per_thread.second.find(last_in_graph_index) != per_thread.second.end())
-            {
-                out << "thread: " << thread_id << " wants to create scale_up device by idx: "
-                    << last_in_graph_index << std::endl;
-                if (created_scaleup_indices.find(last_in_graph_index) != created_scaleup_indices.end())
-                {
-                    out << "skip existing scale_up device candidate by: " << last_in_graph_index << std::endl;
-                    continue;
-                }
-
-                auto scale_virt = detail::add_numa_proxy_device<ccl_virtual_gpu_comm, topology_type>(
-                                                                        *non_indexed_plain_devices,
-                                                                        last_in_graph_index,
-                                                                        context,
-                                                                        devices);
-                if (scale_virt)
-                {
-                    created_scaleup_indices.insert(last_in_graph_index);
-                    out << "added scaleup virtual device: " << scale_virt->to_string()
-                        << ", by idx: " << last_in_graph_index << std::endl;
-                }
-                else
-                {
-                    auto scale_real = detail::add_numa_proxy_device<ccl_gpu_comm, topology_type>(
-                                                                        *non_indexed_plain_devices,
-                                                                        last_in_graph_index,
-                                                                        context,
-                                                                        devices);
-                    if (scale_real)
-                    {
-                        created_scaleup_indices.insert(last_in_graph_index);
-                        out << "added scaleup real device: " << scale_real->to_string()
-                            << ", by idx: " << last_in_graph_index << std::endl;
-                    }
-                    else
-                    {
-                        assert(false && "Unsupported device type in torn-apart ring creation");
-                        std::ostringstream ss;
-                        ss << out.rdbuf();
-                        throw std::runtime_error(std::string("Unsupported device type in torn-apart ring creation. Log:\n") +
-                                                 ss.str());
-                    }
-                }
-            }
-        }
-    }
-
-    // id_ring - inter-thread ring
-    out << "\nStart indexer:" << std::endl;
-    detail::ipc_devices_pool ipc_comms;
-    size_t accumulated_index_offset_for_graph = 0;
-    size_t graph_num = 0;
-    std::map<size_t/*graph_num*/, size_t /*offset*/> index_offset_for_graphs;
-    for (const auto& id_ring : graph_list)
-    {
-        detail::id_thread_table assigned_ids;  //device_id -> thread_id
-
-        std::vector<detail::marked_idx> marked_id_ring = detail::create_marked(id_ring);  // marked graph
-
-        size_t index_offset = accumulated_index_offset_for_graph;
-        for (auto per_thread_it = ctx_per_thread_data.begin(); per_thread_it != ctx_per_thread_data.end();
-            ++per_thread_it)
-        {
-            size_t thread_id = per_thread_it->first;        //first
-            /** Initialize empty context**/
-            std::shared_ptr<device_community<topology_type>> out_indexed_devices;
-            if (graph_list.size() == 1) {
-                if (context.get_process_topology<topology_type>(process_index, thread_id)
-                        .closed_rings.empty()) {
-                    context.get_process_topology<topology_type>(process_index, thread_id)
-                        .set_topology(
-                            std::make_shared<device_community<topology_type>>(topology_comm_addr));
-                }
-
-                out_indexed_devices =
-                    context.get_process_topology<topology_type>(process_index, thread_id)
-                        .get_topology(ring_index);
-            }
-            else {
-                if (context.get_process_topology<topology_type>(process_index, thread_id)
-                        .torn_apart_rings.empty()) {
-                    context.get_process_topology<topology_type>(process_index, thread_id)
-                        .set_additiona_topology(
-                            std::make_shared<device_community<topology_type>>(topology_comm_addr));
-                }
-
-                out_indexed_devices =
-                    context.get_process_topology<topology_type>(process_index, thread_id)
-                        .get_additiona_topology(ring_index);
-            }
-
-
-
-            out << "\nStart indexer for graph num: " << graph_num << ", thread: " << thread_id << std::endl;
-            std::shared_ptr<specific_plain_device_storage> non_indexed_plain_devices =
-                                                    devices.thread_gpu_comms.find(thread_id)->second;
-
-            // use graph ids to enumerate thread plain list `thread_gpu_comms` into `out_indexed_devices`
-            auto rank_builder =
-                    create_device_functor<detail::graph_ring_indexer_unique_index_ext<topology_type>>(marked_id_ring,
-                                                                                      assigned_ids,
-                                                                                      thread_id,
-                                                                                      out_indexed_devices,
-                                                                                      index_offset + device_cluster_rank_offset,
-                                                                                      0,
-                                                                                      0);
-//                                                                                    device_cluster_size
-
-            ccl_tuple_for_each(*non_indexed_plain_devices, rank_builder);
-
-            detail::printer<type(), topology_type> p;
-            ccl_tuple_for_each(out_indexed_devices, p);
-            out << "Indexer result for devices in thread idx ("
-                << thread_id << "/" << ctx_per_thread_data.size() << "):\n"
-                << p.to_string() << std::endl;
-
-            accumulated_index_offset_for_graph += rank_builder.get_functor().get_marked_indices_count();
-            out << "\nIndexer for graph num: " << graph_num
-                << ", finished. imarked_indices: " << accumulated_index_offset_for_graph <<"\n";
-        }
-        index_offset_for_graphs[graph_num] = index_offset;
-
-
-        out << "\nStart gpu comm transformation ipc for graph num: "
-            << graph_num << std::endl;
-
-        //allocate IPC devices pool with rank from unassigned IDs
-        detail::ipc_devices_pool tmp_ipc_comms =
-                        detail::create_ipc_gpu_comms<type(), topology_type>(assigned_ids, id_ring, devices,
-                                                                     device_cluster_size,
-                                                                     device_cluster_rank_offset);
-        out << "Created Tmp IPC devices: " << tmp_ipc_comms.size()
-            << ", for cluster_size: " << device_cluster_size
-            << ", with device_cluster_rank_offset: " << device_cluster_rank_offset << "\n";
-        for (const auto& ipc : tmp_ipc_comms)
-        {
-            out << "{ rank: " << ipc.first << ", comm: " << ipc.second->to_string() << "}\n";
-        }
-        ipc_comms.insert(tmp_ipc_comms.begin(), tmp_ipc_comms.end());
-        graph_num ++;
-    }
-
-
-
-    out << "\nStart ring builder for graphs count: " << graph_list.size() << std::endl;
-    graph_num = 0;
-    for (const auto& id_ring : graph_list)
-    {
-        out << "\nStart ring builder for graph num: " << graph_num << std::endl;
-        for(size_t current_thread_idx = 0; current_thread_idx < ctx_per_thread_data.size(); current_thread_idx++)
-        {
-            // find max rank in current thread device list
-            auto& indexed_devices_for_current_thread =
-                    context.get_process_topology<topology_type>(process_index,
-                                                                current_thread_idx).get_topology()->get_device_storage();
-            const auto& curr_real =
-                    detail::get_device_with_min_rank<ccl_gpu_comm, type(), topology_type>(
-                                                    indexed_devices_for_current_thread, id_ring);
-            const auto& curr_virt =
-                    detail::get_device_with_min_rank<ccl_virtual_gpu_comm, type(), topology_type>(
-                                                    indexed_devices_for_current_thread, id_ring);
-            const auto& curr_scale_real =
-                    detail::get_device_with_min_rank<ccl_numa_proxy<ccl_gpu_comm>, type(), topology_type>(
-                                                    indexed_devices_for_current_thread, id_ring);
-            const auto& curr_scale_virt =
-                    detail::get_device_with_min_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>, type(), topology_type>(
-                                                    indexed_devices_for_current_thread, id_ring);
-
-            size_t tg_max_rank = std::max({std::get<0>(curr_real), std::get<0>(curr_virt),
-                                           std::get<0>(curr_scale_real), std::get<0>(curr_scale_virt)});
-
-            // find thread, which will connect to current thread max rank with next_rank
-            size_t next_rank = (tg_max_rank + 1 ) % id_ring.size();
-            out << "Current thread: " << current_thread_idx << ", max rank candidates: "
-                << std::get<0>(curr_real) << ", " << std::get<0>(curr_virt) << ", "
-                << std::get<0>(curr_scale_real) << ", " << std::get<0>(curr_scale_virt)
-                << ", selected max rank: " << tg_max_rank
-                << ", expected next_rank: " << next_rank << std::endl;
-
-            //Find in local threads at first
-            bool find_in_current_process = false;
-            for(size_t next_thread_id = 0; next_thread_id < ctx_per_thread_data.size(); next_thread_id++)
-            {
-                if( next_thread_id == current_thread_idx)
-                {
-                    // wrong thread, get next
-                    continue;
-                }
-
-                // search next_rank in that thread
-                auto& next_thread_ring_topology =
-                        context.get_process_topology<topology_type>(process_index,
-                                                                    next_thread_id)->get_device_storage();
-                const auto& real =
-                        detail::get_device_with_max_rank<ccl_gpu_comm, type(), topology_type>(
-                                                            next_thread_ring_topology, id_ring);
-                const auto& virt =
-                        detail::get_device_with_max_rank<ccl_virtual_gpu_comm, type(), topology_type>(
-                                                            next_thread_ring_topology, id_ring);
-                const auto& scale_real =
-                        detail::get_device_with_max_rank<ccl_numa_proxy<ccl_gpu_comm>, type(), topology_type>(
-                                                            next_thread_ring_topology, id_ring);
-                const auto& scale_virt =
-                        detail::get_device_with_max_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>, type(), topology_type>(
-                                                            next_thread_ring_topology, id_ring);
-                if (next_rank != std::min({std::get<0>(real), std::get<0>(virt),
-                                           std::get<0>(scale_real), std::get<0>(scale_virt)}))
-                {
-                    // wrong thread, get next
-                    continue;
-                }
-
-                out << "next thread: " << next_thread_id << ", min rank candidates: "
-                    << std::get<0>(real) << ", " << std::get<0>(virt) << ", "
-                    << std::get<0>(scale_real) << ", " << std::get<0>(scale_virt) << std::endl;
-
-                find_in_current_process = true;
-                out << "Lock ring for threads ("
-                    << current_thread_idx << " <-> "<< next_thread_id << ")" << std::endl;
-
-                if (next_rank == std::get<0>(real))
-                {
-                    auto locker =
-                        detail::add_concurrent_locker_device<ccl_gpu_comm, type(), topology_type>(next_rank,
-                                                                                       0,
-                                                                                       real,
-                                                                                       devices,indexed_devices_for_current_thread);
-                    out << "Added real locker by index: " << next_rank
-                        << ", for thread idx: " << current_thread_idx  <<":\n"
-                        << locker->to_string() << std::endl;
-                }
-                else if (next_rank == std::get<0>(virt))
-                {
-                    auto locker =
-                        detail::add_concurrent_locker_device<ccl_virtual_gpu_comm, type(), topology_type>(next_rank,
-                                                                                               0,
-                                                                                               virt,
-                                                                                               devices,indexed_devices_for_current_thread);
-                    out << "Added virtual locker by index: " << next_rank
-                        << ", for thread idx: " << current_thread_idx  <<":\n"
-                        << locker->to_string() << std::endl;
-                }
-                else if (next_rank == std::get<0>(scale_real))
-                {
-                    out << "No need to add concurrent proxy for next thread: " << next_thread_id
-                        << " for scaleup  real proxy in current thread: " << current_thread_idx << std::endl;
-                }
-                else if (next_rank == std::get<0>(scale_virt))
-                {
-                    out << "No need to add concurrent proxy for next thread: " << next_thread_id
-                        << " for scaleup virtual proxy in current thread: " << current_thread_idx << std::endl;
-                }
-                /*else
-                {
-                    assert(false && "unknown device type");
-                    std::ostringstream ss;
-                    ss << out.rdbuf();
-                    throw std::runtime_error(std::string(__FUNCTION__) + " - unknown device type. Log:\n" +
-                                             ss.str());
-                }*/
-            }
-
-            //if not find in process local threads - use IPC to find
-            if (!find_in_current_process and !ipc_comms.empty())
-            {
-                indexed_device_container<ccl_ipc_gpu_comm>& curr_locker_map =
-                            std::get<ccl_ipc_gpu_comm::type_idx()>(indexed_devices_for_current_thread);
-
-                out << "Lock IPC ring for threads (" << current_thread_idx << " <-> xxx\")" << std::endl;
-                auto ipc_it = ipc_comms.find(next_rank);
-                if(ipc_it == ipc_comms.end())
-                {
-                    std::stringstream ss;
-                    ss << out.rdbuf();
-                    std::cerr << "Cannot find IPC deice by rank: " << next_rank << "\nPrevious log:\n" << ss.str() <<"\nAbort Program" << std::endl;
-                    abort();
-                }
-                const auto& comm_addr = ipc_it->second->template get_comm_data<type(), topology_type>();
-                curr_locker_map.insert({comm_addr.rank, ipc_it->second});
-                out << "Added locker for thread idx: " << current_thread_idx  <<":\n" << ipc_it->second->to_string() << std::endl;
-            }
-
-            //upgrade left gpu device to IPC SOURCE type
-            if (!ipc_comms.empty() /*has another IPC Device*/ and current_thread_idx == 0 /* left comm is IPC comm for last process*/ )
-            {
-                const auto& real = detail::get_device_with_max_rank<ccl_gpu_comm, type(), topology_type>(indexed_devices_for_current_thread, id_ring);
-                const auto& virt = detail::get_device_with_max_rank<ccl_virtual_gpu_comm, type(), topology_type>(indexed_devices_for_current_thread, id_ring);
-
-                size_t left_ipc_source_rank = std::min({std::get<0>(real), std::get<0>(virt)});
-                out << "Upgrade thread id: " << current_thread_idx
-                    << " GPU by rank: " << left_ipc_source_rank
-                    << " to IPC SOURCE GPU" << std::endl;
-
-                if(left_ipc_source_rank == std::get<0>(real))
-                {
-                    auto locker =
-                                detail::add_ipc_source_locker_device<ccl_gpu_comm,
-                                                                    type(), topology_type>(next_rank,
-                                                                                   0,
-                                                                                   real,
-                                                                                   devices,indexed_devices_for_current_thread);
-                    out << "Upgrage REAL to IPC_REAL_SOURCE locker by rank: " << next_rank
-                        << ", for thread idx: " << current_thread_idx  <<":\n"
-                        << locker->to_string() << std::endl;
-                }
-                else if (left_ipc_source_rank == std::get<0>(virt))
-                {
-                    auto locker =
-                                detail::add_ipc_source_locker_device<ccl_virtual_gpu_comm,
-                                                                  type(), topology_type>(next_rank,
-                                                                                 0,
-                                                                                 virt,
-                                                                                 devices,indexed_devices_for_current_thread);
-                    out << "Upgrage VIRTUAL to IPC_VIRT_SOURCE locker by rank: " << next_rank
-                        << ", for thread idx: " << current_thread_idx  <<":\n"
-                        << locker->to_string() << std::endl;
-                }
-            }
-            graph_num++;
-        }
-    }
-    return true;
-}
-
-bool allied_process_group_ring_topology::build_specific(std::ostream& out,
-                                                        const ccl::process_device_indices_type& per_thread_device_indices,
-                                                        const ccl::device_indices_type& scaleout_device_indices,
-                                                        const detail::plain_graph_list& graph_list)
-{
-    out << "TODO: Not implemented";
-    return false;
-}
-
-bool allied_process_group_ring_topology::build_specific_scale_up_out(
-                        std::ostream& out,
-                        const ccl::process_device_indices_type& per_thread_device_indices,
-                        const ccl::process_device_indices_type& scaleout_device_indices,
-                        const ccl::process_device_indices_type& ipc_device_indices,
-                        detail::colored_plain_graph_list& graph_list,
-                        const std::map<size_t, size_t>& process_device_rank_offset)
-{
-    out << "TODO: Not implemented";
-    return false;
-}
-detail::global_sorted_plain_graphs
-        allied_process_group_ring_topology::global_graph_list_resolver(
-                                const detail::adjacency_matrix& matrix,
-                                const ccl::process_device_indices_type& per_process_device_indexes,
-                                const ccl::process_device_indices_type& foreign_processes_device_indexes,
-                                detail::p2p_rating_function ping)
-{
-    detail::global_sorted_plain_graphs global_graph_list;
-
-    {
-        detail::plain_graph_list my_process_list = detail::graph_list_resolver(matrix,
-                                                                                 per_process_device_indexes,
-                                                                                 ping);
-        global_graph_list.emplace(process_index, std::move(my_process_list));
-    }
-
-    /*                        my_process_list
-     *  <---unknown ring------> [ <> <> ] <---unknown ring---->
-     *
-     * [------------------------------------------------------]
-                        - global size-
-     *
-     * [<><><>]                 [ <> <> ]               [<><><>]
-     *
-     * left_index:[<><><>]    my_index:[ <> <> ]    right_index:[<><><>]
-     *                ||                  |   |                   | |
-     * >______________||__________________|   |___________________| |_____>
-     *                   local_comm_group_1     local_comm_group_2     local_i
-     */
-
-
-
-    return global_graph_list;
-}
-#endif
-} // namespace native
diff --git a/src/common/comm/l0/topology/ring/process_group_ring_creator.hpp b/src/common/comm/l0/topology/ring/process_group_ring_creator.hpp
deleted file mode 100644
index 2c77c54e0..000000000
--- a/src/common/comm/l0/topology/ring/process_group_ring_creator.hpp
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/topology/topology_construction_utils.hpp"
-
-namespace native {
-// First aggregate all devices into from different threads into plain vector represent devices from one host
-// take action for simple enumerator on this plain vector
-// check against connectivity from plain vector from other processes
-// add IPC wrapper!
-
-//use collcected devices count from cluster per hostname & processes to take offset for current devices!!!!
-class allied_process_group_ring_topology {
-private:
-    size_t process_index;
-    size_t process_count;
-    process_group_context& context;
-    device_storage& devices;
-    size_t device_cluster_rank_offset;
-    size_t device_cluster_size;
-    ccl::context_comm_addr ctx_comm_addr;
-
-public:
-    static constexpr const char* name() {
-        return "process_group_ring_creator";
-    }
-
-    static constexpr ccl::group_split_type group_id() {
-        return ccl::group_split_type::cluster;
-    }
-
-    allied_process_group_ring_topology(size_t process_idx,
-                                       size_t process_nums,
-                                       process_group_context& ctx,
-                                       device_storage& devs,
-                                       size_t cluster_rank_offset,
-                                       size_t cluster_size,
-                                       const ccl::context_comm_addr& comm_addr = {});
-    virtual ~allied_process_group_ring_topology() = default;
-    static std::pair<size_t, size_t> calculate_rank_offset_with_size(
-        size_t process_id,
-        const std::string& host_id,
-        const ccl::cluster_aggregated_device_mask_t& cluster_affinity_mask);
-
-    static size_t default_property_p2p_rating_calculator(const ccl_device& lhs,
-                                                         const ccl_device& rhs);
-    static detail::adjacency_matrix build_p2p_capability_matrix(
-        std::ostream& out,
-        const ccl::process_aggregated_device_mask_t& node_device_masks,
-        detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
-    static detail::adjacency_matrix build_p2p_capability_matrix(
-        std::ostream& out,
-        const ccl::process_device_indices_type& node_device_indices,
-        detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
-
-    bool build_all(std::ostream& out,
-                   const ccl::process_device_indices_type& per_thread_device_indices,
-                   const detail::adjacency_matrix& matrix,
-                   detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
-
-private:
-    bool build_specific_colored(std::ostream& out,
-                                const ccl::process_device_indices_type& per_thread_device_indices,
-                                const ccl::process_device_indices_type& ipc_device_indices,
-                                detail::colored_plain_graph& graph,
-                                const std::map<size_t, size_t>& process_device_rank_offset);
-
-    bool build_specific_scale_up(std::ostream& out,
-                                 const ccl::process_device_indices_type& per_thread_device_indices,
-                                 const ccl::process_device_indices_type& ipc_device_indices,
-                                 detail::colored_plain_graph_list& graph_list,
-                                 const std::map<size_t, size_t>& process_device_rank_offset);
-
-    bool build_specific_scale_out_only(
-        std::ostream& out,
-        const ccl::process_device_indices_type& per_thread_device_indices,
-        const ccl::process_device_indices_type& scaleout_device_indices,
-        detail::colored_plain_graph_list& graph_list,
-        const std::map<size_t, size_t>& process_device_rank_offset);
-    /*
-    bool build_specific_scale_up_out(std::ostream& out,
-                        const ccl::process_device_indices_type& per_thread_device_indices,
-                        const ccl::process_device_indices_type& scaleout_device_indices,
-                        const ccl::process_device_indices_type& ipc_device_indices,
-                        detail::colored_plain_graph_list& graph_list,
-                        const std::map<size_t, size_t>& process_device_rank_offset);
-*/
-    detail::plain_graph_list create_my_process_graphs(
-        const ccl::process_device_indices_type& per_thread_device_indices,
-        const detail::adjacency_matrix& matrix,
-        detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
-
-    detail::global_sorted_plain_graphs collect_cluster_plain_graphs(
-        std::ostream& out,
-        std::shared_ptr<ccl::host_communicator> comm,
-        size_t process_index,
-        const detail::plain_graph_list& my_process_graph);
-    detail::global_sorted_colored_plain_graphs collect_cluster_colored_plain_graphs(
-        std::ostream& out,
-        std::shared_ptr<ccl::host_communicator> comm,
-        size_t process_index,
-        const detail::colored_plain_graph_list& my_process_graph);
-
-    virtual detail::global_plain_graphs merge_allied_nodes_plain_graphs(
-        std::ostream& out,
-        const ccl::cluster_device_indices_type& cluster_indices,
-        size_t process_index,
-        const detail::global_sorted_plain_graphs& cluster_graphs,
-        detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
-    virtual detail::global_colored_plain_graphs merge_allied_nodes_in_colored_plain_graphs(
-        std::ostream& out,
-        const ccl::cluster_device_indices_type& cluster_indices,
-        size_t process_index,
-        size_t process_count,
-        const detail::global_sorted_colored_plain_graphs& cluster_graphs,
-        detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
-
-    detail::plain_graph_list resize_merged_graphs_for_process(
-        size_t process_index,
-        const detail::global_plain_graphs& merged_cluster_graphs,
-        const detail::plain_graph_list& original_graph_list,
-        std::ostream& out);
-    detail::colored_plain_graph_list resize_merged_colored_graphs_for_process(
-        size_t process_index,
-        size_t process_count,
-        const detail::global_colored_plain_graphs& merged_cluster_graphs,
-        const detail::colored_plain_graph_list& original_graph_list,
-        std::ostream& out);
-
-    virtual ccl::process_device_indices_type create_scaleout_devices_in_graphs_for_process(
-        size_t process_index,
-        size_t cluster_size,
-        detail::global_sorted_plain_graphs& cluster_graphs,
-        std::ostream& out);
-    virtual ccl::process_device_indices_type create_scaleout_devices_in_colored_graphs_for_process(
-        size_t process_index,
-        size_t cluster_size,
-        detail::global_sorted_colored_plain_graphs& cluster_graphs,
-        detail::global_sorted_colored_plain_graphs& initial_cluster_graphs,
-        std::ostream& out);
-    virtual ccl::process_device_indices_type create_ipc_devices_in_colored_graphs_for_process(
-        size_t process_idx,
-        size_t cluster_size,
-        detail::global_sorted_colored_plain_graphs& cluster_graphs,
-        detail::global_sorted_colored_plain_graphs& initial_cluster_graphs,
-        std::ostream& out);
-
-    detail::global_sorted_plain_graphs global_graph_list_resolver(
-        const detail::adjacency_matrix& matrix,
-        const ccl::process_device_indices_type& per_process_device_indexes,
-        const ccl::process_device_indices_type& foreign_processes_device_indexes,
-        detail::p2p_rating_function ping);
-};
-} // namespace native
diff --git a/src/common/comm/l0/topology/ring/ring_construction_utils.hpp b/src/common/comm/l0/topology/ring/ring_construction_utils.hpp
deleted file mode 100644
index 612a62306..000000000
--- a/src/common/comm/l0/topology/ring/ring_construction_utils.hpp
+++ /dev/null
@@ -1,1448 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <map>
-#include <memory>
-
-#include "common/comm/l0/topology/topology_construction_utils.hpp"
-#include "oneapi/ccl/config.h"
-
-#include "native_device_api/compiler_ccl_wrappers_dispatcher.hpp"
-#include "common/comm/l0/devices/devices_declaration.hpp"
-#include "common/comm/l0/topology/ring_topology.hpp"
-#include "common/comm/l0/device_community.hpp"
-#include "common/comm/l0/context/thread_group_ctx.hpp"
-#include "common/comm/l0/context/process_group_ctx.hpp"
-
-#include "common/comm/l0/context/device_storage.hpp"
-
-/*REFACTORING*/
-#include "common/comm/l0/context/scale/numa/numa_ctx_impl.hpp"
-#include "common/comm/l0/context/scale/scale_up/scale_up_ctx_impl.hpp"
-#include "common/comm/l0/context/scale/scale_out/scale_out_ctx_impl.hpp"
-#include "common/comm/l0/context/scale/ipc/ipc_ctx_impl.hpp"
-/*REFACTORING*/
-
-namespace native {
-
-namespace detail {
-
-namespace helper {
-
-template <class device_t,
-          ccl::group_split_type group_id,
-          ccl::device_topology_type class_id,
-          class context>
-device_t_ptr<ccl_numa_proxy<device_t>> add_numa_proxy_device(
-    specific_plain_device_storage& plain_storage,
-    const ccl::device_index_type& index,
-    context& context_to_register,
-    device_storage& device_factory) {
-    device_t_ptr<ccl_numa_proxy<device_t>> ret;
-    plain_device_container<device_t>& container = std::get<device_t::type_idx()>(plain_storage);
-    for (auto it = container.begin(); it != container.end(); ++it) {
-        // find device candidate
-        if ((*it)->get_device().get_device_path() == index) {
-            // promote device candidate to scaleup type
-            device_t_ptr<device_t> device = *it;
-            container.erase(it);
-
-            ret = device_factory.create_gpu_device<ccl_numa_proxy<device_t>>(
-                device->get_device(), container.size(), *device);
-            ret->template assign<group_id, class_id>(context_to_register,
-                                                     context_to_register.get_numa_ctx());
-            std::get<ccl_numa_proxy<device_t>::type_idx()>(plain_storage).push_back(ret);
-            break;
-        }
-    }
-    return ret;
-}
-
-template <class device_t,
-          ccl::group_split_type group_id,
-          ccl::device_topology_type class_id,
-          class context,
-          class = typename std::enable_if<
-              not std::is_same<device_t, ccl_thread_comm<typename device_t::impl_t>>::value>::type>
-device_t_ptr<ccl_gpu_scaleup_proxy<device_t>> add_scaleup_device(
-    specific_indexed_device_storage& storage,
-    const ccl::device_index_type& index,
-    context& context_to_register,
-    device_storage& device_factory) {
-    device_t_ptr<ccl_gpu_scaleup_proxy<device_t>> ret;
-    indexed_device_container<device_t>& container = std::get<device_t::type_idx()>(storage);
-    for (auto it = container.begin(); it != container.end(); ++it) {
-        // find device candidate
-        if (it->second->get_device().get_device_path() == index) {
-            // promote device candidate to scaleup type
-            size_t index = it->first;
-            device_t_ptr<device_t> device = it->second;
-            container.erase(it);
-
-            ret = device_factory.create_gpu_device<ccl_gpu_scaleup_proxy<device_t>>(
-                device->get_device(), container.size(), *device);
-            ret->template assign<group_id, class_id>(context_to_register,
-                                                     context_to_register.get_scaleup_ctx());
-            auto inserted =
-                std::get<ccl_gpu_scaleup_proxy<device_t>::type_idx()>(storage).emplace(index, ret);
-
-            if (!inserted.second) {
-                throw std::runtime_error(
-                    std::string(__PRETTY_FUNCTION__) + " - cannot promoted device wrapper: " +
-                    device->to_string() + " by index: " + std::to_string(index) +
-                    " - to scaleup, because it exist already: " +
-                    inserted.first->second->to_string());
-            }
-            break;
-        }
-    }
-    return ret;
-}
-
-template <class device_t,
-          ccl::group_split_type group_id,
-          ccl::device_topology_type class_id,
-          class context,
-          typename std::enable_if<
-              std::is_same<device_t, ccl_thread_comm<typename device_t::impl_t>>::value,
-              int>::type = 0>
-device_t_ptr<ccl_gpu_scaleup_proxy<device_t>> add_scaleup_device(
-    specific_indexed_device_storage& storage,
-    const ccl::device_index_type& index,
-    context& context_to_register,
-    device_storage& device_factory) {
-    using impl_device_t = typename device_t::impl_t;
-    device_t_ptr<ccl_gpu_scaleup_proxy<impl_device_t>> ret;
-    indexed_device_container<device_t>& container = std::get<device_t::type_idx()>(storage);
-    for (auto it = container.begin(); it != container.end(); ++it) {
-        // find device candidate
-        if (it->second->get_device().get_device_path() == index) {
-            // promote device candidate to scaleup type
-            size_t index = it->first;
-            device_t_ptr<device_t> device = it->second;
-            container.erase(it);
-
-            impl_device_t& core_dev = device->get_impl_device();
-            ret = device_factory.create_gpu_device<ccl_gpu_scaleup_proxy<impl_device_t>>(
-                device->get_device(), container.size(), core_dev);
-            ret->template assign<group_id, class_id>(context_to_register,
-                                                     context_to_register.get_scaleup_ctx());
-            auto inserted =
-                std::get<ccl_gpu_scaleup_proxy<impl_device_t>::type_idx()>(storage).emplace(index,
-                                                                                            ret);
-            if (!inserted.second) {
-                throw std::runtime_error(
-                    std::string(__PRETTY_FUNCTION__) + " - cannot promoted device wrapper: " +
-                    device->to_string() + " by index: " + std::to_string(index) +
-                    " - to scaleup, because it exist already: " +
-                    inserted.first->second->to_string());
-            }
-            break;
-        }
-    }
-    return ret;
-}
-
-//-S- correct version
-template <class device_t,
-          ccl::group_split_type group_id,
-          ccl::device_topology_type class_id,
-          class context,
-          class = typename std::enable_if<
-              not std::is_same<device_t, ccl_thread_comm<typename device_t::impl_t>>::value>::type>
-device_t_ptr<ccl_ipc_source_gpu_comm<device_t>> add_ipc_src_device(
-    specific_indexed_device_storage& storage,
-    const ccl::device_index_type& index,
-    context& context_to_register,
-    device_storage& device_factory) {
-    device_t_ptr<ccl_ipc_source_gpu_comm<device_t>> ret;
-    indexed_device_container<device_t>& container = std::get<device_t::type_idx()>(storage);
-    for (auto it = container.begin(); it != container.end(); ++it) {
-        // find device candidate
-        if (it->second->get_device().get_device_path() == index) {
-            // promote device candidate to ipc_src type
-            size_t index = it->first;
-            device_t_ptr<device_t> device = it->second;
-            container.erase(it);
-
-            ret = device_factory.create_gpu_device<ccl_ipc_source_gpu_comm<device_t>>(
-                device->get_device(), container.size(), *device, group_id, class_id);
-            ret->template assign<group_id, class_id>(context_to_register,
-                                                     context_to_register.get_ipc_ctx());
-            auto inserted =
-                std::get<ccl_ipc_source_gpu_comm<device_t>::type_idx()>(storage).emplace(index,
-                                                                                         ret);
-
-            if (!inserted.second) {
-                throw std::runtime_error(
-                    std::string(__PRETTY_FUNCTION__) + " - cannot promoted device wrapper: " +
-                    device->to_string() + " by index: " + std::to_string(index) +
-                    " - to ipc_src, because it exist already: " +
-                    inserted.first->second->to_string());
-            }
-            break;
-        }
-    }
-    return ret;
-}
-
-template <class device_t,
-          ccl::group_split_type group_id,
-          ccl::device_topology_type class_id,
-          class context,
-          class = typename std::enable_if<
-              not std::is_same<device_t, ccl_thread_comm<typename device_t::impl_t>>::value>::type>
-device_t_ptr<ccl_scaleout_proxy<device_t>> add_scaleout_device(
-    specific_indexed_device_storage& storage,
-    const ccl::device_index_type& index,
-    context& context_to_register,
-    device_storage& device_factory) {
-    device_t_ptr<ccl_scaleout_proxy<device_t>> ret;
-    indexed_device_container<device_t>& container = std::get<device_t::type_idx()>(storage);
-    for (auto it = container.begin(); it != container.end(); ++it) {
-        // find device candidate
-        if (it->second->get_device().get_device_path() == index) {
-            // promote device candidate to scaleup type
-            size_t index = it->first;
-            device_t_ptr<device_t> device = it->second;
-            container.erase(it);
-
-            ret = device_factory.create_gpu_device<ccl_scaleout_proxy<device_t>>(
-                device->get_device(), container.size(), *device);
-            ret->template assign<group_id, class_id>(context_to_register,
-                                                     context_to_register.get_scaleout_ctx());
-            auto inserted =
-                std::get<ccl_scaleout_proxy<device_t>::type_idx()>(storage).emplace(index, ret);
-            if (!inserted.second) {
-                throw std::runtime_error(
-                    std::string(__PRETTY_FUNCTION__) + " - cannot promoted device wrapper: " +
-                    device->to_string() + " by index: " + std::to_string(index) +
-                    " - to scaleout, because it exist already: " +
-                    inserted.first->second->to_string());
-            }
-            break;
-        }
-    }
-    return ret;
-}
-
-template <class device_t,
-          ccl::group_split_type group_id,
-          ccl::device_topology_type class_id,
-          class context,
-          typename std::enable_if<
-              std::is_same<device_t, ccl_thread_comm<typename device_t::impl_t>>::value,
-              int>::type = 0>
-device_t_ptr<ccl_scaleout_proxy<device_t>> add_scaleout_device(
-    specific_indexed_device_storage& storage,
-    const ccl::device_index_type& index,
-    context& context_to_register,
-    device_storage& device_factory) {
-    using impl_device_t = typename device_t::impl_t;
-
-    device_t_ptr<ccl_scaleout_proxy<impl_device_t>> ret;
-    indexed_device_container<device_t>& container = std::get<device_t::type_idx()>(storage);
-    for (auto it = container.begin(); it != container.end(); ++it) {
-        // find device candidate
-        if (it->second->get_device().get_device_path() == index) {
-            // promote device candidate to scaleup type
-            size_t index = it->first;
-            device_t_ptr<device_t> device = it->second;
-            container.erase(it);
-
-            impl_device_t& core_dev = device->get_impl_device();
-
-            ret = device_factory.create_gpu_device<ccl_scaleout_proxy<impl_device_t>>(
-                device->get_device(), container.size(), *device);
-            ret->template assign<group_id, class_id>(context_to_register,
-                                                     context_to_register.get_scaleout_ctx());
-            auto inserted =
-                std::get<ccl_scaleout_proxy<impl_device_t>::type_idx()>(storage).emplace(index,
-                                                                                         ret);
-            if (!inserted.second) {
-                throw std::runtime_error(
-                    std::string(__PRETTY_FUNCTION__) + " - cannot promoted device wrapper: " +
-                    device->to_string() + " by index: " + std::to_string(index) +
-                    " - to scaleout, because it exist already: " +
-                    inserted.first->second->to_string());
-            }
-            break;
-        }
-    }
-    return ret;
-}
-
-} // namespace helper
-namespace role_mod {
-template <ccl::group_split_type group_id,
-          ccl::device_topology_type class_id,
-          class context,
-          class... device_candidate_t>
-size_t inject_scaleup_device(specific_indexed_device_storage& storage,
-                             const ccl::device_index_type& index,
-                             context& context_to_register,
-                             device_storage& device_factory) {
-    bool created = false;
-    std::array<bool, sizeof...(device_candidate_t)> expander{ (
-        created =
-            (created == false
-                 ? (bool)
-                       helper::add_scaleup_device<device_candidate_t, group_id, class_id, context>(
-                           storage, index, context_to_register, device_factory)
-                 : created))... };
-    auto inserted_it = std::find(expander.begin(), expander.end(), true);
-
-    return inserted_it != expander.end() ? std::distance(expander.begin(), inserted_it)
-                                         : std::numeric_limits<size_t>::max();
-}
-
-template <ccl::group_split_type group_id,
-          ccl::device_topology_type class_id,
-          class context,
-          class... device_candidate_t>
-size_t inject_numa_device(specific_plain_device_storage& plain_storage,
-                          const ccl::device_index_type& index,
-                          context& context_to_register,
-                          device_storage& device_factory) {
-    bool created = false;
-    std::array<bool, sizeof...(device_candidate_t)> expander{ (
-        created = (created == false
-                       ? (bool)helper::
-                             add_numa_proxy_device<device_candidate_t, group_id, class_id, context>(
-                                 plain_storage, index, context_to_register, device_factory)
-                       : created))... };
-    auto inserted_it = std::find(expander.begin(), expander.end(), true);
-
-    return inserted_it != expander.end() ? std::distance(expander.begin(), inserted_it)
-                                         : std::numeric_limits<size_t>::max();
-}
-
-template <ccl::group_split_type group_id,
-          ccl::device_topology_type class_id,
-          class context,
-          class... device_candidate_t>
-size_t inject_ipc_src_device(specific_indexed_device_storage& storage,
-                             const ccl::device_index_type& index,
-                             context& context_to_register,
-                             device_storage& device_factory) {
-    bool created = false;
-    std::array<bool, sizeof...(device_candidate_t)> expander{ (
-        created =
-            (created == false
-                 ? (bool)
-                       helper::add_ipc_src_device<device_candidate_t, group_id, class_id, context>(
-                           storage, index, context_to_register, device_factory)
-                 : created))... };
-    auto inserted_it = std::find(expander.begin(), expander.end(), true);
-
-    return inserted_it != expander.end() ? std::distance(expander.begin(), inserted_it)
-                                         : std::numeric_limits<size_t>::max();
-}
-
-template <ccl::group_split_type group_id,
-          ccl::device_topology_type class_id,
-          class context,
-          class... device_candidate_t>
-size_t inject_scaleout_device(specific_indexed_device_storage& storage,
-                              const ccl::device_index_type& index,
-                              context& context_to_register,
-                              device_storage& device_factory) {
-    bool created = false;
-    std::array<bool, sizeof...(device_candidate_t)> expander{ (
-        created =
-            (created == false
-                 ? (bool)
-                       helper::add_scaleout_device<device_candidate_t, group_id, class_id, context>(
-                           storage, index, context_to_register, device_factory)
-                 : created))... };
-    auto inserted_it = std::find(expander.begin(), expander.end(), true);
-
-    return inserted_it != expander.end() ? std::distance(expander.begin(), inserted_it)
-                                         : std::numeric_limits<size_t>::max();
-}
-} // namespace role_mod
-
-inline std::vector<marked_idx> create_marked(const plain_graph& id_vector) {
-    std::vector<marked_idx> ret;
-    ret.reserve(id_vector.size());
-
-    std::transform(id_vector.begin(),
-                   id_vector.end(),
-                   std::back_inserter(ret),
-                   [](const ccl::device_index_type& idx) {
-                       return marked_idx(false, idx);
-                   });
-    return ret;
-}
-
-inline colored_plain_graph create_colored(const plain_graph& id_vector, color_t color) {
-    colored_plain_graph ret;
-    ret.reserve(id_vector.size());
-
-    std::transform(id_vector.begin(),
-                   id_vector.end(),
-                   std::back_inserter(ret),
-                   [color](const ccl::device_index_type& idx) {
-                       return colored_idx(color, idx);
-                   });
-    return ret;
-}
-
-inline colored_plain_graph_list create_colored(const plain_graph_list& list, color_t color) {
-    colored_plain_graph_list ret;
-    for (const plain_graph& graph : list) {
-        ret.emplace_back(create_colored(graph, color));
-    }
-    return ret;
-}
-
-using id_thread_table = std::multimap<ccl::device_index_type, size_t /*thread id*/>;
-
-//TODO use inheritance or policy for indexers!!!
-template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-struct graph_ring_indexer {
-    graph_ring_indexer(std::vector<marked_idx>& id_ring_vector,
-                       id_thread_table& thread_id_storage,
-                       size_t thread_id,
-                       specific_indexed_device_storage& device_topology)
-            : id_array(id_ring_vector),
-              topology(device_topology),
-              assigned_ids(thread_id_storage),
-              thread_idx(thread_id) {}
-
-    template <class device_t>
-    void operator()(plain_device_container<device_t>& container) {
-        // fill device group by rank sequencially
-        indexed_device_container<device_t>& indexed_container =
-            std::get<device_t::type_idx()>(topology);
-        for (auto& gpu_device : container) {
-            //get device id from group
-            const ccl::device_index_type& id = gpu_device->get_device().get_device_path();
-
-            //find rank for device id in our ring
-            auto it = std::find_if(id_array.begin(), id_array.end(), [id](marked_idx& val) {
-                if (!val.first) //non marked
-                {
-                    return val.second == id;
-                }
-                return false;
-            });
-            if (it == id_array.end()) {
-                throw std::logic_error(std::string("Unknown device in id ring vector: ") +
-                                       ccl::to_string(id));
-            }
-            int rank = std::distance(id_array.begin(), it);
-
-            size_t already_assigned_ids_count = assigned_ids.count(id);
-            //rank += already_assigned_ids_count; TODO
-            (void)already_assigned_ids_count;
-
-            size_t size = id_array.size();
-            gpu_device->template reset_rank<group_id, class_id>(rank, size);
-            indexed_container.insert({ rank, gpu_device });
-
-            assigned_ids.insert({ id, thread_idx });
-
-            it->first = true; //marked
-        }
-    }
-
-protected:
-    std::vector<marked_idx>& id_array;
-    specific_indexed_device_storage& topology;
-
-    id_thread_table& assigned_ids;
-    size_t thread_idx;
-    size_t ring_id_offset;
-};
-
-template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-struct colored_graph_ring_indexer {
-    static constexpr color_t marked_color = std::numeric_limits<color_t>::max();
-
-    colored_graph_ring_indexer(colored_plain_graph& id_ring_vector,
-                               size_t thread_id,
-                               color_t process_id,
-                               specific_indexed_device_storage& device_topology,
-                               size_t r_offset = 0,
-                               size_t s_offset = 0,
-                               size_t i_offset = 0)
-            : id_array(id_ring_vector),
-              topology(device_topology),
-              thread_idx(thread_id),
-              process_idx(process_id),
-              rank_offset(r_offset),
-              size_offset(s_offset),
-              index_offset(i_offset),
-              marked_indices_count() {}
-
-    template <class device_t>
-    void operator()(plain_device_container<device_t>& in_container) {
-        indexed_device_container<device_t>& out_container =
-            std::get<device_t::type_idx()>(topology);
-
-        // fill device group by rank sequencially
-        for (auto& gpu_device : in_container) {
-            //get device id from group
-            const ccl::device_index_type& id = gpu_device->get_device().get_device_path();
-
-            //find rank for device id in our ring
-            auto it = std::find_if(id_array.begin(), id_array.end(), [id, this](colored_idx& val) {
-                if (val.color == process_idx) // find in my process
-                {
-                    return val.index == id;
-                }
-                return false;
-            });
-            if (it == id_array.end()) {
-                continue;
-                /*
-                throw std::logic_error(std::string("Unknown device in id ring vector: ") +
-                                       ccl::to_string(id) + ". ring vector:\n" +
-                                       to_string(id_array));
-                                       */
-            }
-
-            //rank in local graph_ring
-            int rank = std::distance(id_array.begin(), it);
-            size_t size = id_array.size();
-
-            //apply offsets
-            gpu_device->template reset_rank<group_id, class_id>(rank + rank_offset,
-                                                                size + size_offset);
-            out_container.insert({ rank + index_offset, gpu_device });
-
-            it->color = marked_color; //marked
-            marked_indices_count++;
-        }
-    }
-
-    size_t get_marked_indices_count() const {
-        return marked_indices_count;
-        ;
-    }
-
-protected:
-    colored_plain_graph& id_array;
-    specific_indexed_device_storage& topology;
-    size_t thread_idx;
-    size_t ring_id_offset;
-    color_t process_idx;
-    size_t rank_offset;
-    size_t size_offset;
-    size_t index_offset;
-    size_t marked_indices_count;
-};
-
-static constexpr color_t marked_color = std::numeric_limits<color_t>::max();
-inline void separate_ipc_devices(const ccl::process_device_indices_type& ipc_indices,
-                                 size_t process_idx,
-                                 size_t process_num,
-                                 const colored_plain_graph& id_array,
-                                 ccl::process_device_indices_type& ipc_src_indices,
-                                 ccl::process_device_indices_type& ipc_dst_indices,
-                                 color_t exclude_color = marked_color) {
-    // find right ipcs
-    do {
-        auto graph_it =
-            std::find_if(id_array.begin(), id_array.end(), [process_idx](const colored_idx& val) {
-                return val.color == process_idx;
-            });
-        if (graph_it == id_array.end()) {
-            /* throw std::runtime_error(
-                std::string(__FUNCTION__) + " - unexpected graph for process: " +
-                std::to_string(process_idx) + ". Graph: \n" + to_string(id_array));
-            */
-            // nothing to do
-            return;
-        }
-
-        // calc IPC process Index
-        size_t ipc_process_index_to_find = process_idx + 1;
-
-        size_t actual_ipc_process_index = ipc_process_index_to_find;
-        /*
-        if (process_idx == process_num - 1) {
-            //replace terminator as index for right
-            actual_ipc_process_index = 0;
-        }
-        */
-
-        // find  first IPC device
-        graph_it = std::find_if(
-            graph_it, id_array.end(), [ipc_process_index_to_find](const colored_idx& val) {
-                return val.color == ipc_process_index_to_find;
-            });
-
-        if (graph_it == id_array.end()) {
-            break;
-        }
-
-        //test on ipc filter
-        auto candidate_it = ipc_indices.find(actual_ipc_process_index);
-        if (candidate_it == ipc_indices.end() or
-            (candidate_it->second.find(graph_it->index) == candidate_it->second.end())) {
-            break;
-        }
-
-        //remember
-        ipc_dst_indices.insert({ graph_it->color, { graph_it->index } });
-    } while (false);
-
-    //find left ipc
-    do {
-        auto graph_rit =
-            std::find_if(id_array.rbegin(), id_array.rend(), [process_idx](const colored_idx& val) {
-                return val.color == process_idx;
-            });
-        if (graph_rit == id_array.rend()) {
-            assert(false && "Invalide configuration: not my graph from left");
-            throw std::runtime_error(
-                std::string(__FUNCTION__) +
-                " - unexpected graph (left) for process: " + std::to_string(process_idx));
-        }
-
-        // calc IPC process Index
-        size_t ipc_process_index_to_find = process_idx - 1;
-        if (process_idx == 0) {
-            //replace terminator as index for left
-            ipc_process_index_to_find = process_num;
-        }
-
-        // find  first IPC device
-        graph_rit = std::find_if(
-            graph_rit, id_array.rend(), [ipc_process_index_to_find](const colored_idx& val) {
-                return val.color == ipc_process_index_to_find;
-            });
-
-        if (graph_rit == id_array.rend()) {
-            break;
-        }
-
-        //test on ipc filter
-        auto candidate_it = ipc_indices.find(ipc_process_index_to_find);
-        if (candidate_it == ipc_indices.end() or
-            (candidate_it->second.find(graph_rit->index) == candidate_it->second.end())) {
-            break;
-        }
-
-        graph_rit = std::prev(graph_rit); // use my device to upgrade as IPC source
-
-        //remember
-        ipc_src_indices.insert({ graph_rit->color, { graph_rit->index } });
-    } while (false);
-}
-
-template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class context_t>
-struct smart_ring_indexer {
-    static constexpr color_t marked_color = std::numeric_limits<color_t>::max();
-
-    smart_ring_indexer(colored_plain_graph& id_ring_vector,
-                       color_t process_id,
-                       size_t process_count,
-                       size_t process_device_rank_offset,
-                       size_t process_device_size_offset,
-                       device_storage& device_factory,
-                       specific_indexed_device_storage& device_topology,
-                       const ccl::process_device_indices_type& ipc_device,
-                       const ccl::process_device_indices_type& scaleout_device_indices,
-                       typename colored_plain_graph::iterator local_proc_ring_it,
-                       context_t& parent_ctx)
-            : id_array(id_ring_vector),
-              process_idx(process_id),
-              process_num(process_count),
-              device_index_offset(process_device_rank_offset),
-              device_size_offset(process_device_size_offset),
-              factory(device_factory),
-              topology(device_topology),
-              ipc_src_indices(),
-              ipc_dst_indices(),
-              scaleout_indices(scaleout_device_indices),
-              marked_indices_count(),
-              context(parent_ctx) {
-        separate_ipc_devices(
-            ipc_device, process_idx, process_num, id_array, ipc_src_indices, ipc_dst_indices);
-
-        //offset used for calculation opertional rank & size
-        local_proc_color_it = local_proc_ring_it;
-    }
-
-    template <class device_t>
-    void operator()(plain_device_container<device_t>& in_container) {
-        // fill device group by rank sequencially
-        for (auto& gpu_device : in_container) {
-            //get device id from group
-            const ccl::device_index_type& id = gpu_device->get_device().get_device_path();
-
-            //find rank for device id in our ring
-            auto it = std::find_if(id_array.begin(), id_array.end(), [id, this](colored_idx& val) {
-                if (val.color == process_idx) // find for current process
-                {
-                    return val.index == id;
-                }
-                return false;
-            });
-            if (it == id_array.end()) {
-                /*throw std::logic_error(std::string("Unknown device in id ring vector: ") +
-                                       ccl::to_string(id) + ". ring vector:\n" +
-                                       to_string(id_array));
-                */
-                continue;
-            }
-
-            // calculate operation rank using from current process color position 'local_proc_color_it'
-            int rank = std::distance(local_proc_color_it, it);
-            size_t size = id_array.size();
-
-            size -= device_size_offset;
-
-            //Check on IPC source candidate at first
-            //First device in ring will be upgraded to IPC source, if reflected IPC devices for another process is existing
-            if (!try_as_ipc_source(gpu_device, rank, device_index_offset, size)) {
-                // regular device
-                gpu_device->template reset_rank<group_id, class_id>(rank, size);
-
-                indexed_device_container<device_t>& out_container =
-                    std::get<device_t::type_idx()>(topology);
-                out_container.insert({ rank + device_index_offset, gpu_device });
-            }
-
-            // mark as processed in id_ring
-            it->color = marked_color;
-            marked_indices_count++;
-        }
-    }
-
-    void operator()(plain_device_container<ccl_ipc_gpu_comm>& in_container) {
-        //Insert IPC destination device
-        indexed_device_container<ccl_ipc_gpu_comm>& out_container =
-            std::get<ccl_ipc_gpu_comm::type_idx()>(topology);
-        for (auto& gpu_device : in_container) {
-            //get device id from group
-            const ccl::device_index_type& id = gpu_device->get_device().get_device_path();
-
-            //find rank for device id in our ring
-            size_t foreign_process_idx = (process_idx + 1) % process_num;
-            if (process_idx == process_num - 1) {
-                foreign_process_idx = process_num; //use terminator
-            }
-            auto it = std::find_if(
-                id_array.begin(), id_array.end(), [id, foreign_process_idx](colored_idx& val) {
-                    if (val.color == foreign_process_idx) // find in my process
-                    {
-                        return val.index == id;
-                    }
-                    return false;
-                });
-            if (it == id_array.end()) {
-                /*
-                throw std::logic_error(std::string("Unknown device in id ring vector: ") +
-                                       ccl::to_string(id) + ". ring vector:\n" +
-                                       to_string(id_array));
-                         */
-                continue;
-            }
-
-            //rank in local graph_ring
-            int rank = std::distance(local_proc_color_it, it);
-            //size_t rank = std::distance(id_array.begin(), it);
-            size_t size = id_array.size();
-            size -= device_size_offset; //ipc_src_indices.size();
-
-            // limit rank on edge of ring
-            if (foreign_process_idx == process_num) {
-                rank = (rank + device_index_offset) % size;
-                //apply offsets
-                gpu_device->template reset_rank<group_id, class_id>(rank, size);
-                gpu_device->template reassign_with_addr<group_id, class_id>(rank);
-
-                out_container.insert({ rank /* + device_index_offset*/, gpu_device });
-            }
-            else {
-                //no offsets
-                gpu_device->template reset_rank<group_id, class_id>(rank, size);
-                gpu_device->template reassign_with_addr<group_id, class_id>(rank);
-
-                out_container.insert({ rank + device_index_offset, gpu_device });
-            }
-
-            // mark as processed in id_ring
-            it->color = marked_color;
-            marked_indices_count++;
-        }
-    }
-
-    size_t get_marked_indices_count() const {
-        return marked_indices_count;
-    }
-
-protected:
-    colored_plain_graph& id_array;
-    color_t process_idx;
-    size_t process_num;
-    size_t device_index_offset;
-    size_t device_size_offset;
-    device_storage& factory;
-    specific_indexed_device_storage& topology;
-    ccl::process_device_indices_type ipc_src_indices;
-    ccl::process_device_indices_type ipc_dst_indices;
-    const ccl::process_device_indices_type& scaleout_indices;
-    size_t marked_indices_count;
-
-    context_t& context;
-    typename colored_plain_graph::iterator local_proc_color_it;
-
-private:
-    template <class device_t>
-    bool try_as_ipc_source(std::shared_ptr<device_t> gpu_device,
-                           int rank,
-                           int process_offset,
-                           size_t size) {
-        //concurrent device is not IPC source
-        return false;
-    }
-
-    bool try_as_ipc_source(std::shared_ptr<ccl_gpu_comm> gpu_device,
-                           int rank,
-                           int process_offset,
-                           size_t size) {
-        return try_as_ipc_source_impl(gpu_device, rank, process_offset, size);
-    }
-
-    bool try_as_ipc_source(std::shared_ptr<ccl_virtual_gpu_comm> gpu_device,
-                           int rank,
-                           int process_offset,
-                           size_t size) {
-        return try_as_ipc_source_impl(gpu_device, rank, process_offset, size);
-    }
-
-    template <class device_t>
-    bool try_as_ipc_source_impl(std::shared_ptr<device_t> gpu_device,
-                                int rank,
-                                int process_offset,
-                                size_t size) {
-        //Check on IPC source candidate at first
-        const ccl::device_index_type& id = gpu_device->get_device().get_device_path();
-        auto process_set = ipc_src_indices.find(process_idx);
-        if (process_set == ipc_src_indices.end() or
-            process_set->second.find(id) == process_set->second.end()) {
-            return false;
-        }
-
-        //set rank before upgrade
-        gpu_device->template reset_rank<group_id, class_id>(rank + process_offset, size);
-
-        // ipc device
-        using ipc_device_t = ccl_ipc_source_gpu_comm<device_t>;
-        device_t_ptr<ipc_device_t> new_ipc_source_comm = factory.create_gpu_device<ipc_device_t>(
-            gpu_device->get_device(), rank + process_offset, *gpu_device, group_id, class_id);
-
-        new_ipc_source_comm->template assign<group_id, class_id>(context, context.get_ipc_ctx());
-        new_ipc_source_comm->template reassign_with_addr<group_id, class_id>(rank + process_offset);
-
-        //new_ipc_source_comm->template reset_rank<group_id, class_id>(rank, size);
-        indexed_device_container<ipc_device_t>& out_ipc_container =
-            std::get<ipc_device_t::type_idx()>(topology);
-        out_ipc_container.insert({ rank + process_offset, new_ipc_source_comm });
-
-        //remove ipc_index
-        auto rem_it = process_set->second.find(id);
-        process_set->second.erase(rem_it);
-        return true;
-    }
-};
-
-template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-struct graph_ring_indexer_ext : public graph_ring_indexer<group_id, class_id> {
-    using base = graph_ring_indexer<group_id, class_id>;
-    using base::topology;
-    using base::thread_idx;
-    using base::id_array;
-    using base::assigned_ids;
-    graph_ring_indexer_ext(std::vector<marked_idx>& id_ring_vector,
-                           id_thread_table& thread_id_storage,
-                           size_t thread_id,
-                           specific_indexed_device_storage& device_topology,
-                           size_t index_offset_val,
-                           size_t rank_offset_val,
-                           size_t size_offset_val)
-            : graph_ring_indexer<group_id, class_id>(id_ring_vector,
-                                                     thread_id_storage,
-                                                     thread_id,
-                                                     device_topology),
-              index_offset(index_offset_val),
-              rank_offset(rank_offset_val),
-              size_offset(size_offset_val) {}
-
-    template <class device_t>
-    void operator()(plain_device_container<device_t>& container) {
-        // fill device group by rank sequencially
-        indexed_device_container<device_t>& indexed_container =
-            std::get<device_t::type_idx()>(topology);
-        for (auto& gpu_device : container) {
-            //get device id from group
-            const ccl::device_index_type& id = gpu_device->get_device().get_device_path();
-
-            //find rank for device id in our ring
-            auto it = std::find_if(id_array.begin(), id_array.end(), [id](marked_idx& val) {
-                if (!val.first) //non marked
-                {
-                    return val.second == id;
-                }
-                return false;
-            });
-            if (it == id_array.end()) {
-                throw std::logic_error(std::string("Unknown device in id ring vector: ") +
-                                       ccl::to_string(id));
-            }
-            int rank = std::distance(id_array.begin(), it);
-            rank += rank_offset;
-
-            size_t already_assigned_ids_count = assigned_ids.count(id);
-            //rank += already_assigned_ids_count; TODO
-            (void)already_assigned_ids_count;
-
-            size_t size = id_array.size();
-            size = size_offset;
-
-            gpu_device->template reset_rank<group_id, class_id>(rank, size);
-            indexed_container.insert({ rank + index_offset, gpu_device });
-
-            assigned_ids.insert({ id, thread_idx });
-
-            it->first = true; //marked
-            marked_indices_count++;
-        }
-    }
-
-    size_t get_marked_indices_count() const {
-        return marked_indices_count;
-        ;
-    }
-
-private:
-    size_t index_offset;
-    size_t rank_offset;
-    size_t size_offset;
-
-    size_t marked_indices_count;
-};
-
-template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-struct graph_ring_indexer_unique_index : public graph_ring_indexer<group_id, class_id> {
-    using base = graph_ring_indexer<group_id, class_id>;
-    using base::topology;
-    using base::thread_idx;
-    using base::id_array;
-    using base::assigned_ids;
-    graph_ring_indexer_unique_index(std::vector<marked_idx>& id_ring_vector,
-                                    id_thread_table& thread_id_storage,
-                                    size_t thread_id,
-                                    specific_indexed_device_storage& device_topology,
-                                    size_t index_offset_val,
-                                    size_t rank_offset_val,
-                                    size_t size_offset_val)
-            : graph_ring_indexer<group_id, class_id>(id_ring_vector,
-                                                     thread_id_storage,
-                                                     thread_id,
-                                                     device_topology),
-              index_offset(index_offset_val),
-              rank_offset(rank_offset_val),
-              size_offset(size_offset_val),
-              marked_indices_count() {}
-
-    template <class device_t>
-    void operator()(plain_device_container<device_t>& container) {
-        // fill device group by rank sequencially
-        indexed_device_container<device_t>& indexed_container =
-            std::get<device_t::type_idx()>(topology);
-        for (auto& gpu_device : container) {
-            //get device id from group
-            const ccl::device_index_type& id = gpu_device->get_device().get_device_path();
-
-            //find rank for device id in our ring
-            auto it = std::find_if(id_array.begin(), id_array.end(), [id](marked_idx& val) {
-                if (!val.first) //non marked
-                {
-                    return val.second == id;
-                }
-                return false;
-            });
-            if (it == id_array.end()) {
-                continue;
-            }
-
-            int rank = std::distance(id_array.begin(), it);
-            size_t already_assigned_ids_count = assigned_ids.count(id);
-            //rank += already_assigned_ids_count; TODO
-            (void)already_assigned_ids_count;
-
-            size_t size = id_array.size();
-            size += size_offset;
-
-            gpu_device->template reset_rank<group_id, class_id>(rank, size);
-            indexed_container.insert({ rank + index_offset, gpu_device });
-
-            assigned_ids.insert({ id, thread_idx });
-
-            it->first = true; //marked
-            marked_indices_count++;
-        }
-    }
-
-    size_t get_marked_indices_count() const {
-        return marked_indices_count;
-        ;
-    }
-
-private:
-    size_t index_offset;
-    size_t rank_offset;
-    size_t size_offset;
-
-    size_t marked_indices_count;
-};
-
-template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-struct graph_ring_indexer_unique_index_ext : public graph_ring_indexer<group_id, class_id> {
-    using base = graph_ring_indexer<group_id, class_id>;
-    using base::topology;
-    using base::thread_idx;
-    using base::id_array;
-    using base::assigned_ids;
-    graph_ring_indexer_unique_index_ext(std::vector<marked_idx>& id_ring_vector,
-                                        id_thread_table& thread_id_storage,
-                                        size_t thread_id,
-                                        specific_indexed_device_storage& device_topology,
-                                        size_t index_offset_val,
-                                        size_t rank_offset_val,
-                                        size_t size_offset_val)
-            : graph_ring_indexer<group_id, class_id>(id_ring_vector,
-                                                     thread_id_storage,
-                                                     thread_id,
-                                                     device_topology),
-              index_offset(index_offset_val),
-              rank_offset(rank_offset_val),
-              size_offset(size_offset_val),
-              marked_indices_count() {}
-
-    template <class device_t>
-    void operator()(plain_device_container<device_t>& container) {
-        // fill device group by rank sequencially
-        indexed_device_container<device_t>& indexed_container =
-            std::get<device_t::type_idx()>(topology);
-        for (auto& gpu_device : container) {
-            //get device id from group
-            const ccl::device_index_type& id = gpu_device->get_device().get_device_path();
-
-            //find rank for device id in our ring
-            auto it = std::find_if(id_array.begin(), id_array.end(), [id](marked_idx& val) {
-                if (!val.first) //non marked
-                {
-                    return val.second == id;
-                }
-                return false;
-            });
-            if (it == id_array.end()) {
-                continue;
-            }
-
-            int rank = std::distance(id_array.begin(), it);
-            rank += rank_offset;
-
-            size_t already_assigned_ids_count = assigned_ids.count(id);
-            //rank += already_assigned_ids_count; TODO
-            (void)already_assigned_ids_count;
-
-            size_t size = id_array.size();
-            size += size_offset;
-
-            gpu_device->template reset_rank<group_id, class_id>(rank, size);
-            indexed_container.insert({ rank + index_offset, gpu_device });
-
-            assigned_ids.insert({ id, thread_idx });
-
-            it->first = true; //marked
-            marked_indices_count++;
-        }
-    }
-
-    size_t get_marked_indices_count() const {
-        return marked_indices_count;
-        ;
-    }
-
-private:
-    size_t index_offset;
-    size_t rank_offset;
-    size_t size_offset;
-
-    size_t marked_indices_count;
-};
-
-template <class device_t, ccl::group_split_type group_id, ccl::device_topology_type class_id>
-std::tuple<size_t, device_t_ptr<device_t>> get_device_with_min_rank(
-    const specific_indexed_device_storage& indexed_devices,
-    const plain_graph& id_ring) {
-    const indexed_device_container<device_t>& container =
-        std::get<device_t::type_idx()>(indexed_devices);
-
-    //search in map from end (max element)
-    size_t idx = std::numeric_limits<size_t>::min();
-    device_t_ptr<device_t> dev;
-    for (auto it = container.rbegin(); it != container.rend(); ++it) {
-        device_t_ptr<device_t> tmp_dev = it->second;
-        const auto& path = tmp_dev->get_device().get_device_path();
-        if (std::find(id_ring.begin(), id_ring.end(), path) != id_ring.end()) {
-            idx = tmp_dev->template get_comm_data<group_id, class_id>().rank;
-            dev = tmp_dev;
-            break;
-        }
-    }
-    return std::tuple<size_t, device_t_ptr<device_t>>{ idx, dev };
-}
-
-template <class device_t, ccl::group_split_type group_id, ccl::device_topology_type class_id>
-std::tuple<size_t, device_t_ptr<device_t>> get_device_with_min_rank(
-    const specific_indexed_device_storage& indexed_devices,
-    const colored_plain_graph& id_ring) {
-    const indexed_device_container<device_t>& container =
-        std::get<device_t::type_idx()>(indexed_devices);
-
-    //search in map from end (max element)
-    size_t idx = std::numeric_limits<size_t>::min();
-    device_t_ptr<device_t> dev;
-    for (auto it = container.rbegin(); it != container.rend(); ++it) {
-        device_t_ptr<device_t> tmp_dev = it->second;
-        const auto& path = tmp_dev->get_device().get_device_path();
-        if (std::find_if(id_ring.begin(),
-                         id_ring.end(),
-                         [path](const typename colored_plain_graph::value_type& val) {
-                             return val.index == path;
-                         }) != id_ring.end()) {
-            idx = tmp_dev->template get_comm_data<group_id, class_id>().rank;
-            dev = tmp_dev;
-            break;
-        }
-    }
-    return std::tuple<size_t, device_t_ptr<device_t>>{ idx, dev };
-}
-
-template <class device_t, ccl::group_split_type group_id, ccl::device_topology_type class_id>
-std::tuple<size_t, device_t_ptr<device_t>> get_device_with_max_rank(
-    const specific_indexed_device_storage& indexed_devices,
-    const plain_graph& id_ring) {
-    const indexed_device_container<device_t>& container =
-        std::get<device_t::type_idx()>(indexed_devices);
-
-    //search in map from begin (min element)
-    size_t idx = std::numeric_limits<size_t>::max();
-    device_t_ptr<device_t> dev;
-    for (auto it = container.begin(); it != container.end(); ++it) {
-        device_t_ptr<device_t> tmp_dev = it->second;
-        const auto& path = tmp_dev->get_device().get_device_path();
-        if (std::find(id_ring.begin(), id_ring.end(), path) != id_ring.end()) {
-            idx = tmp_dev->template get_comm_data<group_id, class_id>().rank;
-            dev = tmp_dev;
-            break;
-        }
-    }
-    return std::tuple<size_t, device_t_ptr<device_t>>{ idx, dev };
-}
-
-template <class device_t, ccl::group_split_type group_id, ccl::device_topology_type class_id>
-std::tuple<size_t, device_t_ptr<device_t>> get_device_with_max_rank(
-    const specific_indexed_device_storage& indexed_devices,
-    const colored_plain_graph& id_ring) {
-    const indexed_device_container<device_t>& container =
-        std::get<device_t::type_idx()>(indexed_devices);
-
-    //search in map from begin (min element)
-    size_t idx = std::numeric_limits<size_t>::max();
-    device_t_ptr<device_t> dev;
-    for (auto it = container.begin(); it != container.end(); ++it) {
-        device_t_ptr<device_t> tmp_dev = it->second;
-        const auto& path = tmp_dev->get_device().get_device_path();
-        if (std::find_if(id_ring.begin(),
-                         id_ring.end(),
-                         [path](const typename colored_plain_graph::value_type& val) {
-                             return val.index == path;
-                         }) != id_ring.end()) {
-            idx = tmp_dev->template get_comm_data<group_id, class_id>().rank;
-            dev = tmp_dev;
-            break;
-        }
-    }
-    return std::tuple<size_t, device_t_ptr<device_t>>{ idx, dev };
-}
-
-template <class device_t, ccl::group_split_type group_id, ccl::device_topology_type class_id>
-device_t_ptr<ccl_thread_comm<device_t>> add_concurrent_locker_device(
-    size_t next_rank,
-    size_t index_offset,
-    const std::tuple<size_t, device_t_ptr<device_t>>& dev_to_lock,
-    device_storage& device_factory,
-    specific_indexed_device_storage& storage_to_lock) {
-    device_t_ptr<device_t> dev = std::get<1>(dev_to_lock);
-    device_t_ptr<ccl_thread_comm<device_t>> new_concurrent_comm =
-        device_factory.create_gpu_device<ccl_thread_comm<device_t>>(
-            dev->get_device(), next_rank, *dev);
-
-    const auto& comm_addr = new_concurrent_comm->template get_comm_data<group_id, class_id>();
-
-    indexed_device_container<ccl_thread_comm<device_t>>& current_locker_map =
-        std::get<ccl_thread_comm<device_t>::type_idx()>(storage_to_lock);
-    current_locker_map.insert({ comm_addr.rank + index_offset, new_concurrent_comm });
-    return new_concurrent_comm;
-}
-
-//-S- IPC
-template <class device_t, ccl::group_split_type split_id, ccl::device_topology_type class_id>
-device_t_ptr<ccl_ipc_source_gpu_comm<device_t>> add_ipc_source_locker_device(
-    size_t next_rank,
-    size_t index_offset,
-    const std::tuple<size_t, device_t_ptr<device_t>>& dev_to_lock,
-    device_storage& device_factory,
-    specific_indexed_device_storage& storage_to_lock) {
-    using ipc_device_t = ccl_ipc_source_gpu_comm<device_t>;
-
-    device_t_ptr<device_t> dev = std::get<1>(dev_to_lock);
-    device_t_ptr<ipc_device_t> new_ipc_source_comm = device_factory.create_gpu_device<ipc_device_t>(
-        dev->get_device(), next_rank, *dev, split_id, class_id);
-
-    const auto& comm_addr = new_ipc_source_comm->template get_comm_data<split_id, class_id>();
-
-    indexed_device_container<ipc_device_t>& current_locker_map =
-        std::get<ipc_device_t::type_idx()>(storage_to_lock);
-
-    //Exchange old device_t with new ipc_device_t
-    indexed_device_container<device_t>& original_dev_container =
-        std::get<device_t::type_idx()>(storage_to_lock);
-    auto original_dev_it = original_dev_container.find(std::get<0>(dev_to_lock) + index_offset);
-    if (original_dev_it == original_dev_container.end() or
-        original_dev_it->second.get() != dev.get()) {
-        assert(false && "unexpected device");
-    }
-
-    current_locker_map.insert({ comm_addr.rank + index_offset, new_ipc_source_comm });
-    original_dev_container.erase(original_dev_it);
-
-    return new_ipc_source_comm;
-}
-
-using ipc_devices_pool = std::map<size_t /*rank*/, device_t_ptr<ccl_ipc_gpu_comm>>;
-
-//#if 0 OLD topology construction
-template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-inline ipc_devices_pool create_ipc_gpu_comms(id_thread_table assigned_ids_copy,
-                                             const plain_graph& id_ring,
-                                             device_storage& device_factory,
-                                             size_t size_override_value,
-                                             size_t rank_offset_value) {
-    // allocate IPC devices pool with rank from unassigned IDs
-    // need to find symmetric_difference between graph ids and assigned ids
-    // unassigned ids is a ipc device candidate
-
-    ipc_devices_pool ret;
-    for (auto graph_it = id_ring.begin(); graph_it != id_ring.end();) {
-        auto assigned_id_it = assigned_ids_copy.find(*graph_it);
-        if (assigned_id_it != assigned_ids_copy.end()) {
-            assigned_ids_copy.erase(assigned_id_it);
-            ++graph_it;
-            continue;
-        }
-
-        //find unassigned_device
-        int rank = std::distance(id_ring.begin(), graph_it);
-        size_t size = size_override_value;
-
-        //recalculate rank to apply offset for other processes count
-        rank = (rank + rank_offset_value) % size;
-
-        ccl_device_driver::device_ptr ipc_device = get_runtime_device(*graph_it);
-        device_t_ptr<ccl_ipc_gpu_comm> locker = device_factory.create_gpu_device<ccl_ipc_gpu_comm>(
-            *ipc_device, rank, size, group_id, class_id);
-        ret.insert({ rank, std::move(locker) });
-        ++graph_it;
-    }
-    return ret;
-}
-
-using cluster_ipc_devices_pool = std::map<size_t /*process_id*/, ipc_devices_pool>;
-
-template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-inline cluster_ipc_devices_pool create_filtered_ipc_gpu_comms(
-    const colored_plain_graph& id_ring,
-    const ccl::process_device_indices_type& ipc_indices,
-    size_t process_idx,
-    size_t process_size,
-    device_storage& device_factory) {
-    cluster_ipc_devices_pool ret;
-    for (auto graph_it = id_ring.begin(); graph_it != id_ring.end(); ++graph_it) {
-        if (graph_it->color != colored_graph_ring_indexer<group_id, class_id>::marked_color and
-            graph_it->color != process_idx) {
-            size_t ipc_process_index = graph_it->color;
-            if (process_idx == 0 and ipc_process_index > process_size) {
-                //replace terminator as index
-                ipc_process_index = process_size;
-            }
-
-            if (process_idx == process_size - 1 and ipc_process_index > process_size) {
-                //replace terminator as index
-                ipc_process_index = 0;
-            }
-            //find ipc_device in candidates list
-            auto candidate_it = ipc_indices.find(ipc_process_index);
-            if (candidate_it == ipc_indices.end() or
-                (candidate_it->second.find(graph_it->index) == candidate_it->second.end())) {
-                continue;
-            }
-
-            //device is IPC
-            int rank = std::distance(id_ring.begin(), graph_it);
-            size_t size = id_ring.size();
-
-            ccl_device_driver::device_ptr ipc_device = get_runtime_device(graph_it->index);
-            device_t_ptr<ccl_ipc_gpu_comm> locker =
-                device_factory.create_gpu_device<ccl_ipc_gpu_comm>(
-                    *ipc_device, rank, size, group_id, class_id);
-            ret[graph_it->color].insert({ rank, std::move(locker) });
-        }
-    }
-    return ret;
-}
-
-template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class context>
-inline cluster_ipc_devices_pool create_filtered_ipc_destination_gpu_comms(
-    const colored_plain_graph& id_ring,
-    const ccl::process_device_indices_type& ipc_indices,
-    size_t process_idx,
-    size_t process_size,
-    context& context_to_register,
-    device_storage& device_factory,
-    specific_plain_device_storage& out_container) {
-    //destination is right device with max color
-    cluster_ipc_devices_pool ret;
-
-    //find beginning point
-    auto graph_it =
-        std::find_if(id_ring.begin(), id_ring.end(), [process_idx](const colored_idx& val) {
-            return val.color == process_idx;
-        });
-
-    if (graph_it == id_ring.end()) {
-        abort();
-    }
-
-    //find IPC device in increasing process order
-    for (; graph_it != id_ring.end(); ++graph_it) {
-        if (graph_it->color != colored_graph_ring_indexer<group_id, class_id>::marked_color and
-            graph_it->color > process_idx) {
-            size_t ipc_process_index = graph_it->color;
-
-            if (!ret.empty()) {
-                // 1 device in enough for ring
-                continue;
-            }
-            /*
-            if ((process_idx == process_size - 1) and ipc_process_index >= process_size) {
-                //replace terminator as index
-                ipc_process_index = 0;
-            }
-            */
-
-            //find ipc_device in candidates list
-            auto candidate_it = ipc_indices.find(ipc_process_index);
-            if (candidate_it == ipc_indices.end() or
-                (candidate_it->second.find(graph_it->index) == candidate_it->second.end())) {
-                continue;
-            }
-
-            //device is IPC
-            int rank = std::distance(id_ring.begin(), graph_it);
-            size_t size = id_ring.size();
-
-            ccl_device_driver::device_ptr ipc_device = get_runtime_device(graph_it->index);
-            device_t_ptr<ccl_ipc_gpu_comm> locker =
-                device_factory.create_gpu_device<ccl_ipc_gpu_comm>(
-                    *ipc_device, rank, size, group_id, class_id);
-
-            locker->template assign<group_id, class_id>(context_to_register,
-                                                        context_to_register.get_ipc_ctx());
-
-            std::get<ccl_ipc_gpu_comm::type_idx()>(out_container).push_back(locker);
-            ret[graph_it->color].insert({ rank, std::move(locker) });
-        }
-    }
-    return ret;
-}
-
-template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-inline cluster_ipc_devices_pool create_ipc_gpu_comms(const colored_plain_graph& id_ring,
-                                                     size_t process_idx,
-                                                     device_storage& device_factory,
-                                                     size_t size_override_value,
-                                                     size_t rank_offset_value) {
-    cluster_ipc_devices_pool ret;
-    for (auto graph_it = id_ring.begin(); graph_it != id_ring.end(); ++graph_it) {
-        if (graph_it->color != colored_graph_ring_indexer<group_id, class_id>::marked_color and
-            graph_it->color != process_idx) {
-            int rank = std::distance(id_ring.begin(), graph_it);
-            size_t size = size_override_value;
-
-            //recalculate rank to apply offset for other processes count
-            rank = (rank + rank_offset_value) % size;
-
-            ccl_device_driver::device_ptr ipc_device = get_runtime_device(graph_it->index);
-            device_t_ptr<ccl_ipc_gpu_comm> locker =
-                device_factory.create_gpu_device<ccl_ipc_gpu_comm>(
-                    *ipc_device, rank, size, group_id, class_id);
-            ret[graph_it->color].insert({ rank, std::move(locker) });
-        }
-    }
-    return ret;
-}
-
-template <ccl::group_split_type topology>
-inline cluster_ipc_devices_pool create_ipc_gpu_comms(const colored_plain_graph_list& list,
-                                                     size_t process_idx,
-                                                     device_storage& device_factory,
-                                                     size_t size_override_value,
-                                                     size_t rank_offset_value) {
-    cluster_ipc_devices_pool ret;
-    for (const auto& graph : list) {
-        auto graph_ret = create_ipc_gpu_comms<topology>(
-            graph, process_idx, device_factory, size_override_value, rank_offset_value);
-        ret.insert(graph_ret.begin(), graph_ret.end());
-    }
-    return ret;
-}
-
-inline std::vector<size_t> get_ipc_proceses(const cluster_ipc_devices_pool& ipc_comms,
-                                            size_t process_index,
-                                            size_t process_count) {
-    std::vector<size_t> ipc_processes_id;
-    ipc_processes_id.reserve(ipc_comms.size());
-    for (auto it = ipc_comms.begin(); it != ipc_comms.end(); ++it) {
-        if (it->first != process_index) {
-            ipc_processes_id.push_back(it->first);
-        }
-    }
-    return ipc_processes_id;
-}
-//#endif //OLD topology construction
-} // namespace detail
-} // namespace native
diff --git a/src/common/comm/l0/topology/ring/thread_group_ring_creator.cpp b/src/common/comm/l0/topology/ring/thread_group_ring_creator.cpp
deleted file mode 100644
index cae86d757..000000000
--- a/src/common/comm/l0/topology/ring/thread_group_ring_creator.cpp
+++ /dev/null
@@ -1,520 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "common/comm/l0/topology/ring/ring_construction_utils.hpp"
-#include "common/comm/l0/topology/ring/thread_group_ring_creator.hpp"
-
-namespace native {
-
-thread_group_ring_topology::thread_group_ring_topology(thread_group_context& ctx,
-                                                       device_storage& devs)
-        : context(ctx),
-          devices_factory(devs) {}
-
-size_t thread_group_ring_topology::default_property_p2p_rating_calculator(const ccl_device& lhs,
-                                                                          const ccl_device& rhs) {
-    return detail::property_p2p_rating_calculator(lhs, rhs, THREAD_GROUP_WEIGHT);
-}
-
-detail::adjacency_matrix thread_group_ring_topology::build_p2p_capability_matrix(
-    std::ostream& out,
-    const ccl::process_aggregated_device_mask_t& per_thread_device_masks,
-    detail::p2p_rating_function ping) {
-    ccl::process_device_indices_type per_thread_device_indices;
-    for (const auto& mask : per_thread_device_masks) {
-        per_thread_device_indices.insert(
-            { mask.first, ccl_device_driver::get_device_indices(mask.second) });
-    }
-
-    return build_p2p_capability_matrix(out, per_thread_device_indices, ping);
-}
-
-detail::adjacency_matrix thread_group_ring_topology::build_p2p_capability_matrix(
-    std::ostream& out,
-    const ccl::process_device_indices_type& per_thread_device_indices,
-    detail::p2p_rating_function ping) {
-    // Build adjacency matrix with P2P capability:
-    // Rows & columnn is a device IDs ( froms 0 to CCL_GPU_DEVICES_AFFINITY_MASK_SIZE)
-    // element values - is a weight of P2P activity: 0 means - devices are not connected
-    // If values is not 0 - than two devies can be combined together
-
-    detail::adjacency_matrix ring_p2p_matrix;
-    if (per_thread_device_indices.empty()) {
-        out << "No indices - nothing to build" << std::endl;
-        return ring_p2p_matrix;
-    }
-
-    out << "Build adjacency matrix by: " << thread_group_ring_topology::name()
-        << " - threads count: " << per_thread_device_indices.size() << std::endl;
-
-    ccl::device_indices_type aggregated_thread_indices = std::accumulate(
-        per_thread_device_indices.begin(),
-        per_thread_device_indices.end(),
-        ccl::device_indices_type(),
-        [](ccl::device_indices_type& partial_mask,
-           const std::pair<size_t, ccl::device_indices_type>& thread_mask) {
-            partial_mask.insert(thread_mask.second.begin(), thread_mask.second.end());
-            return partial_mask;
-        });
-    out << "Create devices for aggregared thread indices count: "
-        << aggregated_thread_indices.size() << std::endl;
-    for (const auto& ind : aggregated_thread_indices) {
-        out << ind << ", ";
-    }
-    out << std::endl;
-
-    return get_platform().calculate_device_access_metric(aggregated_thread_indices, ping);
-}
-
-bool thread_group_ring_topology::build(
-    std::ostream& out,
-    const ccl::context_comm_addr& comm_addr,
-    const ccl::process_device_indices_type& per_thread_device_indices,
-    const detail::adjacency_matrix& matrix,
-    detail::p2p_rating_function ping) {
-    out << "\n/*************\"" << thread_group_ring_topology::name()
-        << "\" for threads: " << context.thread_device_topology.size() << "*************/\n"
-        << std::endl;
-
-    out << "Resolve device graph: " << std::endl;
-    detail::plain_graph_list id_rings =
-        graph_list_resolver(matrix, per_thread_device_indices, ping);
-
-    size_t size = id_rings.size();
-    out << "Resolved graphs count: " << size << "\n";
-    if (!size) {
-        out << "Cannot build any ring" << std::endl;
-        return false;
-    }
-    else if (id_rings.size() == 1) // whole ring
-    {
-        return build_specific(out, comm_addr, per_thread_device_indices, *id_rings.begin());
-    }
-
-    //torn-apart ring
-    return build_scale_up_specific(out, comm_addr, per_thread_device_indices, id_rings);
-}
-
-bool thread_group_ring_topology::build(
-    std::ostream& out,
-    const ccl::context_comm_addr& comm_addr,
-    const ccl::process_aggregated_device_mask_t& per_thread_device_masks,
-    const detail::adjacency_matrix& matrix,
-    detail::p2p_rating_function ping) {
-    ccl::process_device_indices_type per_thread_device_indices;
-    for (const auto& mask : per_thread_device_masks) {
-        per_thread_device_indices.insert(
-            { mask.first, ccl_device_driver::get_device_indices(mask.second) });
-    }
-
-    return build(out, comm_addr, per_thread_device_indices, matrix);
-}
-
-bool thread_group_ring_topology::build_specific(
-    std::ostream& out,
-    const ccl::context_comm_addr& comm_addr,
-    const ccl::process_device_indices_type& per_thread_device_indices,
-    const detail::plain_graph& id_ring) {
-    size_t ring_index = 0;
-    constexpr ccl::device_topology_type class_id = ccl::device_topology_type::ring;
-
-    out << "Start building topology: " << ::to_string(class_id) << ", for graph:\n";
-    out << detail::to_string(id_ring);
-
-    // id_ring - inter-thread ring
-    out << "\nStart indexer:" << std::endl;
-    detail::id_thread_table assigned_ids; //device_id -> thread_id
-    auto& ctx_per_thread_data = context.thread_device_topology;
-    std::vector<detail::marked_idx> marked_id_ring = detail::create_marked(id_ring); // marked graph
-
-    auto topology_comm_addr = comm_addr;
-    topology_comm_addr.comm_size = marked_id_ring.size();
-
-    for (auto per_thread_it = ctx_per_thread_data.begin();
-         per_thread_it != ctx_per_thread_data.end();
-         ++per_thread_it) {
-        size_t thread_id = per_thread_it->first; // first
-
-        //prepared empty topology
-        auto out_indexed_devices = std::make_shared<device_community<class_id>>(topology_comm_addr);
-        std::shared_ptr<specific_plain_device_storage> non_indexed_plain_devices =
-            devices_factory.thread_gpu_comms.find(thread_id)->second;
-
-        // use graph ids to enumerate thread plain list `thread_gpu_comms` into `out_indexed_devices`
-        auto rank_builder = create_device_functor<detail::graph_ring_indexer<group_id(), class_id>>(
-            marked_id_ring, assigned_ids, thread_id, out_indexed_devices->get_device_storage());
-        ccl_tuple_for_each(*non_indexed_plain_devices, rank_builder);
-
-        detail::printer<group_id(), class_id> p;
-        ccl_tuple_for_each(out_indexed_devices->get_device_storage(), p);
-        out << "Indexer result for devices in thread idx (" << thread_id << "/"
-            << ctx_per_thread_data.size() << "):\n"
-            << p.to_string() << std::endl;
-
-        //remember topology
-        context.get_thread_topology<class_id>(thread_id).set_topology(out_indexed_devices);
-    }
-
-    out << "\nStart ring builder" << std::endl;
-    for (size_t current_thread_idx = 0; current_thread_idx < ctx_per_thread_data.size();
-         current_thread_idx++) {
-        // find max rank in current thread device list
-        auto& indexed_devices_for_current_thread =
-            context.get_thread_topology<class_id>(current_thread_idx)
-                .get_topology(ring_index)
-                ->get_device_storage();
-        const auto& curr_real =
-            detail::get_device_with_min_rank<ccl_gpu_comm, group_id(), class_id>(
-                indexed_devices_for_current_thread, id_ring);
-        const auto& curr_virt =
-            detail::get_device_with_min_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
-                indexed_devices_for_current_thread, id_ring);
-
-        size_t tg_max_rank = std::max({ std::get<0>(curr_real), std::get<0>(curr_virt) });
-
-        // find thread, which will connect to current thread max rank with next_rank
-        size_t next_rank = (tg_max_rank + 1) % id_ring.size();
-
-        out << "Current thread: " << current_thread_idx
-            << ", max rank candidates: " << std::get<0>(curr_real) << ", " << std::get<0>(curr_virt)
-            << ", selected max rank: " << tg_max_rank << ", expected next_rank: " << next_rank
-            << std::endl;
-
-        for (size_t next_thread_id = 0; next_thread_id < ctx_per_thread_data.size();
-             next_thread_id++) {
-            if (next_thread_id == current_thread_idx) {
-                // wrong thread, get next
-                continue;
-            }
-
-            // search next_rank in that thread
-            auto& next_thread_ring_topology = context.get_thread_topology<class_id>(next_thread_id)
-                                                  .get_topology(ring_index)
-                                                  ->get_device_storage();
-            const auto& real = detail::get_device_with_max_rank<ccl_gpu_comm, group_id(), class_id>(
-                next_thread_ring_topology, id_ring);
-            const auto& virt =
-                detail::get_device_with_max_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
-                    next_thread_ring_topology, id_ring);
-
-            if (next_rank != std::min({ std::get<0>(real), std::get<0>(virt) })) {
-                // wrong thread, get next
-                continue;
-            }
-
-            out << "next thread: " << next_thread_id
-                << ", min rank candidates: " << std::get<0>(real) << ", " << std::get<0>(virt)
-                << std::endl;
-
-            out << "Lock ring for threads (" << current_thread_idx << " <-> " << next_thread_id
-                << ")" << std::endl;
-            if (next_rank == std::get<0>(real)) {
-                auto locker =
-                    detail::add_concurrent_locker_device<ccl_gpu_comm, group_id(), class_id>(
-                        next_rank, 0, real, devices_factory, indexed_devices_for_current_thread);
-                out << "Added real locker by index: " << next_rank
-                    << ", for thread idx: " << current_thread_idx << ":\n"
-                    << locker->to_string() << std::endl;
-            }
-            else if (next_rank == std::get<0>(virt)) {
-                auto locker = detail::
-                    add_concurrent_locker_device<ccl_virtual_gpu_comm, group_id(), class_id>(
-                        next_rank, 0, virt, devices_factory, indexed_devices_for_current_thread);
-                out << "Added virtual locker by index: " << next_rank
-                    << ", for thread idx: " << current_thread_idx << ":\n"
-                    << locker->to_string() << std::endl;
-            }
-            else {
-                assert(false && "unknown device type");
-                std::ostringstream ss;
-                ss << out.rdbuf();
-                throw std::runtime_error(std::string(__FUNCTION__) +
-                                         " - unknown device type. Log:\n" + ss.str());
-            }
-        }
-    }
-    return true;
-}
-
-bool thread_group_ring_topology::build_scale_up_specific(
-    std::ostream& out,
-    const ccl::context_comm_addr& comm_addr,
-    const ccl::process_device_indices_type& per_thread_device_indicess,
-    const detail::plain_graph_list& graph_list) {
-    size_t ring_index = 0;
-    constexpr ccl::device_topology_type class_id = ccl::device_topology_type::ring;
-
-    out << "Start building topology: " << ::to_string(class_id)
-        << ", for graphs: " << graph_list.size() << "\n";
-    out << detail::to_string(graph_list);
-
-    auto& ctx_per_thread_data = context.thread_device_topology;
-    (void)ctx_per_thread_data;
-
-    out << "\nStart gpu comm transformation for graph list count: " << graph_list.size()
-        << std::endl;
-
-    std::set<ccl::device_index_type> created_scaleup_indices;
-
-    // let's start scale-up devices search & creation
-    for (const auto& id_ring : graph_list) {
-        for (const auto& per_thread : per_thread_device_indicess) {
-            size_t thread_id = per_thread.first;
-            std::shared_ptr<specific_plain_device_storage> non_indexed_plain_devices =
-                devices_factory.thread_gpu_comms.find(thread_id)->second;
-
-            // promote real-virtual device (right corner devices) in graphs up to scale_up_proxy type
-            // all loca group devices in different graph would be linked by scale_up_proxy
-            // each local group ( in graph) must have at least one scale_up_proxy device
-            const ccl::device_index_type& last_in_graph_index = *id_ring.rbegin();
-            if (per_thread.second.find(last_in_graph_index) != per_thread.second.end()) {
-                out << "thread: " << thread_id
-                    << " wants to create scale_up device by idx: " << last_in_graph_index
-                    << std::endl;
-                if (created_scaleup_indices.find(last_in_graph_index) !=
-                    created_scaleup_indices.end()) {
-                    out << "skip existing scale_up device candidate by: " << last_in_graph_index
-                        << std::endl;
-                    continue;
-                }
-
-                size_t inserted_device_type_index = detail::role_mod::inject_numa_device<
-                    group_id(),
-                    class_id,
-                    thread_group_context,
-                    ccl_virtual_gpu_comm, /* `virtual` is better candiate*/
-                    ccl_gpu_comm>(
-                    *non_indexed_plain_devices, last_in_graph_index, context, devices_factory);
-                if (inserted_device_type_index == std::numeric_limits<size_t>::max()) {
-                    assert(false && "Unsupported device type in topology creation");
-                    std::ostringstream ss;
-                    ss << out.rdbuf();
-
-                    //TODO no need to throw?
-                    throw std::runtime_error(
-                        std::string("Unsupported device type in topology creation. Log:\n") +
-                        ss.str());
-                }
-                out << "Inject numa device by order: " << inserted_device_type_index
-                    << "\nby idx: " << last_in_graph_index << std::endl;
-                created_scaleup_indices.insert(last_in_graph_index);
-            }
-        }
-    }
-
-    out << "\nStart indexer for graph list count: " << graph_list.size() << std::endl;
-    size_t accumulated_index_offset_for_graph = 0;
-    size_t graph_num = 0;
-
-    std::map<size_t /*graph_num*/, size_t /*offset*/>
-        index_offset_for_graphs; // calculate indexed devices count in each graph
-
-    ccl::device_indices_type total_device_indices;
-    for (const auto& graph : graph_list) {
-        total_device_indices.insert(graph.begin(), graph.end());
-    }
-
-    auto topology_comm_addr = comm_addr;
-    topology_comm_addr.comm_size = total_device_indices.size();
-
-    for (const auto& id_ring : graph_list) {
-        detail::id_thread_table assigned_ids; //device_id -> thread_id
-        auto& ctx_per_thread_data = context.thread_device_topology;
-        std::vector<detail::marked_idx> marked_id_ring =
-            detail::create_marked(id_ring); // marked graph
-        size_t index_offset = accumulated_index_offset_for_graph;
-
-        for (auto per_thread_it = ctx_per_thread_data.begin();
-             per_thread_it != ctx_per_thread_data.end();
-             ++per_thread_it) {
-            size_t thread_id = per_thread_it->first; //first
-
-            // prepare ropology
-            if (context.get_thread_topology<class_id>(thread_id).torn_apart_rings.empty()) {
-                context.get_thread_topology<class_id>(thread_id).set_additiona_topology(
-                    std::make_shared<device_community<class_id>>(topology_comm_addr));
-            }
-            auto& out_indexed_devices = context.get_thread_topology<class_id>(thread_id)
-                                            .get_additiona_topology(ring_index)
-                                            ->get_device_storage();
-
-            out << "\nStart indexer for graph num: " << graph_num << ", thread: " << thread_id
-                << std::endl;
-            std::shared_ptr<specific_plain_device_storage> non_indexed_plain_devices =
-                devices_factory.thread_gpu_comms.find(thread_id)->second;
-
-            // use graph ids to enumerate thread plain list `thread_gpu_comms` into `out_indexed_devices`
-            auto rank_builder = create_device_functor<
-                detail::graph_ring_indexer_unique_index<group_id(), class_id>>(
-                marked_id_ring, assigned_ids, thread_id, out_indexed_devices, index_offset, 0, 0);
-
-            ccl_tuple_for_each(*non_indexed_plain_devices, rank_builder);
-
-            // print partial topology enumeration for 'graph' from 'graph_list'
-            detail::printer<group_id(), class_id> p;
-            ccl_tuple_for_each(out_indexed_devices, p);
-            out << "Indexer result for devices in thread idx (" << thread_id << "/"
-                << ctx_per_thread_data.size() << "):\n"
-                << p.to_string() << std::endl;
-
-            // remember enumerated (marked) devices fro current thread & current graph
-            // to continue right enumeration order for other graphs & threas
-            accumulated_index_offset_for_graph +=
-                rank_builder.get_functor().get_marked_indices_count();
-            out << "\nIndexer for graph num: " << graph_num
-                << ", finished. imarked_indices: " << accumulated_index_offset_for_graph << "\n";
-        }
-        index_offset_for_graphs[graph_num] = index_offset;
-        graph_num++;
-    }
-
-    out << "\nStart ring builder for graphs count: " << graph_list.size() << std::endl;
-    graph_num = 0;
-    for (const auto& id_ring : graph_list) {
-        out << "\nStart ring builder for graph num: " << graph_num << std::endl;
-        for (size_t current_thread_idx = 0; current_thread_idx < ctx_per_thread_data.size();
-             current_thread_idx++) {
-            auto& indexed_devices_for_current_thread =
-                context.get_thread_topology<class_id>(current_thread_idx)
-                    .get_additiona_topology(ring_index)
-                    ->get_device_storage();
-
-            //find max device rank in current thread devices
-            const auto& curr_real =
-                detail::get_device_with_min_rank<ccl_gpu_comm, group_id(), class_id>(
-                    indexed_devices_for_current_thread, id_ring);
-            const auto& curr_virt =
-                detail::get_device_with_min_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
-                    indexed_devices_for_current_thread, id_ring);
-            const auto& curr_scale_real = detail::
-                get_device_with_min_rank<ccl_numa_proxy<ccl_gpu_comm>, group_id(), class_id>(
-                    indexed_devices_for_current_thread, id_ring);
-            const auto& curr_scale_virt =
-                detail::get_device_with_min_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>,
-                                                 group_id(),
-                                                 class_id>(indexed_devices_for_current_thread,
-                                                           id_ring);
-
-            size_t tg_max_rank = std::max({ std::get<0>(curr_real),
-                                            std::get<0>(curr_virt),
-                                            std::get<0>(curr_scale_real),
-                                            std::get<0>(curr_scale_virt) });
-            // find thread, which will connect to current thread max rank with next_rank
-            size_t next_rank = (tg_max_rank + 1) % id_ring.size();
-
-            out << "Current thread: " << current_thread_idx
-                << ", max rank candidates: " << std::get<0>(curr_real) << ", "
-                << std::get<0>(curr_virt) << ", " << std::get<0>(curr_scale_real) << ", "
-                << std::get<0>(curr_scale_virt) << ", selected max rank: " << tg_max_rank
-                << ", expected next_rank: " << next_rank << std::endl;
-
-            for (size_t next_thread_id = 0; next_thread_id < ctx_per_thread_data.size();
-                 next_thread_id++) {
-                if (next_thread_id == current_thread_idx) {
-                    // wrong thread, get next
-                    continue;
-                }
-
-                // search next_rank in that thread
-                auto& next_thread_ring_topology =
-                    context.get_thread_topology<class_id>(next_thread_id)
-                        .get_additiona_topology(ring_index)
-                        ->get_device_storage();
-                const auto& real =
-                    detail::get_device_with_max_rank<ccl_gpu_comm, group_id(), class_id>(
-                        next_thread_ring_topology, id_ring);
-                const auto& virt =
-                    detail::get_device_with_max_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
-                        next_thread_ring_topology, id_ring);
-                const auto& scale_real =
-                    detail::get_device_with_max_rank<ccl_numa_proxy<ccl_gpu_comm>,
-                                                     group_id(),
-                                                     class_id>(next_thread_ring_topology, id_ring);
-                const auto& scale_virt =
-                    detail::get_device_with_max_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>,
-                                                     group_id(),
-                                                     class_id>(next_thread_ring_topology, id_ring);
-                if (next_rank != std::min({ std::get<0>(real),
-                                            std::get<0>(virt),
-                                            std::get<0>(scale_real),
-                                            std::get<0>(scale_virt) })) {
-                    // wrong thread, get next
-                    continue;
-                }
-
-                out << "next thread: " << next_thread_id
-                    << ", min rank candidates: " << std::get<0>(real) << ", " << std::get<0>(virt)
-                    << ", " << std::get<0>(scale_real) << ", " << std::get<0>(scale_virt)
-                    << std::endl;
-
-                out << "Lock ring for threads (" << current_thread_idx << " <-> " << next_thread_id
-                    << ")" << std::endl;
-                if (next_rank == std::get<0>(real)) {
-                    auto locker =
-                        detail::add_concurrent_locker_device<ccl_gpu_comm, group_id(), class_id>(
-                            next_rank,
-                            index_offset_for_graphs[graph_num],
-                            real,
-                            devices_factory,
-                            indexed_devices_for_current_thread);
-                    out << "Added real locker by index: "
-                        << index_offset_for_graphs[graph_num] + next_rank
-                        << ", for thread idx: " << current_thread_idx << ":\n"
-                        << locker->to_string() << std::endl;
-                }
-                else if (next_rank == std::get<0>(virt)) {
-                    auto locker = detail::
-                        add_concurrent_locker_device<ccl_virtual_gpu_comm, group_id(), class_id>(
-                            next_rank,
-                            index_offset_for_graphs[graph_num],
-                            virt,
-                            devices_factory,
-                            indexed_devices_for_current_thread);
-                    out << "Added virtual locker by index: "
-                        << index_offset_for_graphs[graph_num] + next_rank
-                        << ", for thread idx: " << current_thread_idx << ":\n"
-                        << locker->to_string() << std::endl;
-                }
-                else if (next_rank == std::get<0>(scale_real)) {
-                    out << "No need to add concurrent proxy for next thread: " << next_thread_id
-                        << " for numa real proxy in current thread: " << current_thread_idx
-                        << std::endl;
-                }
-                else if (next_rank == std::get<0>(scale_virt)) {
-                    out << "No need to add concurrent proxy for next thread: " << next_thread_id
-                        << " for numa virtual proxy in current thread: " << current_thread_idx
-                        << std::endl;
-                }
-            }
-        }
-        graph_num++;
-    }
-
-    out << "\nFinished building topology: " << ::to_string(class_id) << std::endl;
-    for (auto per_thread_it = ctx_per_thread_data.begin();
-         per_thread_it != ctx_per_thread_data.end();
-         ++per_thread_it) {
-        size_t thread_id = per_thread_it->first;
-
-        detail::printer<group_id(), class_id> p;
-        ccl_tuple_for_each(context.get_thread_topology<class_id>(thread_id)
-                               .get_additiona_topology(ring_index)
-                               ->get_device_storage(),
-                           p);
-        out << "\nFinal topology thread: " << thread_id << "\n" << p.to_string();
-    }
-    return true;
-}
-} // namespace native
diff --git a/src/common/comm/l0/topology/ring/thread_group_ring_creator.hpp b/src/common/comm/l0/topology/ring/thread_group_ring_creator.hpp
deleted file mode 100644
index 33de0c94d..000000000
--- a/src/common/comm/l0/topology/ring/thread_group_ring_creator.hpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/topology/topology_construction_utils.hpp"
-
-namespace ccl {
-struct context_comm_addr;
-}
-
-namespace native {
-
-class thread_group_ring_topology {
-    thread_group_context& context;
-    device_storage& devices_factory;
-
-public:
-    static constexpr ccl::group_split_type group_id() {
-        return ccl::group_split_type::process;
-    }
-
-    static constexpr const char* name() {
-        return "thread_group_ring_creator";
-    }
-
-    thread_group_ring_topology(thread_group_context& ctx, device_storage& devs);
-
-    static size_t default_property_p2p_rating_calculator(const ccl_device& lhs,
-                                                         const ccl_device& rhs);
-    static detail::adjacency_matrix build_p2p_capability_matrix(
-        std::ostream& out,
-        const ccl::process_aggregated_device_mask_t& per_thread_device_masks,
-        detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
-    static detail::adjacency_matrix build_p2p_capability_matrix(
-        std::ostream& out,
-        const ccl::process_device_indices_type& per_thread_device_indices,
-        detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
-
-    bool build(std::ostream& out,
-               const ccl::context_comm_addr& context_addr,
-               const ccl::process_aggregated_device_mask_t& per_thread_device_masks,
-               const detail::adjacency_matrix& matrix,
-               detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
-    bool build(std::ostream& out,
-               const ccl::context_comm_addr& context_addr,
-               const ccl::process_device_indices_type& per_thread_device_indices,
-               const detail::adjacency_matrix& matrix,
-               detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
-
-private:
-    bool build_specific(std::ostream& out,
-                        const ccl::context_comm_addr& context_addr,
-                        const ccl::process_device_indices_type& per_thread_device_indices,
-                        const detail::plain_graph& graph);
-    bool build_scale_up_specific(std::ostream& out,
-                                 const ccl::context_comm_addr& context_addr,
-                                 const ccl::process_device_indices_type& per_thread_device_indicess,
-                                 const detail::plain_graph_list& graph_list);
-};
-} // namespace native
diff --git a/src/common/comm/l0/topology/ring_topology.hpp b/src/common/comm/l0/topology/ring_topology.hpp
deleted file mode 100644
index 19bcf4257..000000000
--- a/src/common/comm/l0/topology/ring_topology.hpp
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include "common/comm/l0/topology/ring/device_group_ring_creator.hpp"
-#include "common/comm/l0/topology/ring/thread_group_ring_creator.hpp"
-#include "common/comm/l0/topology/ring/process_group_ring_creator.hpp"
diff --git a/src/common/comm/l0/topology/topology_construction_utils.cpp b/src/common/comm/l0/topology/topology_construction_utils.cpp
deleted file mode 100644
index 2f50b6901..000000000
--- a/src/common/comm/l0/topology/topology_construction_utils.cpp
+++ /dev/null
@@ -1,798 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "common/comm/l0/topology/topology_construction_utils.hpp"
-
-namespace native {
-std::ostream& operator<<(std::ostream& out, const detail::adjacency_matrix& matrix) {
-    if (matrix.empty()) {
-        return out;
-    }
-
-    for (auto device_it : matrix) {
-        const ccl::device_index_type& left_index = device_it.first;
-        const auto& device_adjacencies = device_it.second;
-
-        out << left_index << "\t:\t{";
-        for (const auto& device_cross_rating_value : device_adjacencies) {
-            const ccl::device_index_type& right_index = device_cross_rating_value.first;
-            detail::cross_device_rating rating = device_cross_rating_value.second;
-            out << right_index << "/ " << rating << ", ";
-        }
-        out << "},\n";
-    }
-    out << std::endl;
-    return out;
-}
-
-namespace detail {
-std::ostream& operator<<(std::ostream& out, const adjacency_matrix& matrix) {
-    if (matrix.empty()) {
-        return out;
-    }
-
-    for (auto device_it : matrix) {
-        const ccl::device_index_type& left_index = device_it.first;
-        const auto& device_adjacencies = device_it.second;
-
-        out << ccl::to_string(left_index) << "\t:\t{";
-        for (const auto& device_cross_rating_value : device_adjacencies) {
-            const ccl::device_index_type& right_index = device_cross_rating_value.first;
-            detail::cross_device_rating rating = device_cross_rating_value.second;
-            out << ccl::to_string(right_index) << "/ " << rating << ", ";
-        }
-        out << "},\n";
-    }
-    out << std::endl;
-    return out;
-}
-
-std::ostream& operator<<(std::ostream& out, const colored_idx& idx) {
-    out << ccl::to_string(idx.index) << "/" << idx.color;
-    return out;
-}
-
-size_t property_p2p_rating_calculator(const native::ccl_device& lhs,
-                                      const native::ccl_device& rhs,
-                                      size_t weight) {
-    ze_device_p2p_properties_t p2p = lhs.get_p2p_properties(rhs);
-    if (p2p.flags & ZE_DEVICE_P2P_PROPERTY_FLAG_ACCESS)
-        return weight;
-    else {
-        ze_bool_t access;
-        ze_result_t ret = zeDeviceCanAccessPeer(lhs.handle, rhs.handle, &access);
-        if (ret != ZE_RESULT_SUCCESS) {
-            throw std::runtime_error(std::string("Cannot execute zeDeviceCanAccessPeer, error: ") +
-                                     native::to_string(ret));
-        }
-        return access ? weight : 0;
-    }
-}
-
-std::string to_string(const plain_graph& cont) {
-    std::stringstream ss;
-    for (const auto& id : cont) {
-        ss << ccl::to_string(id) << ", ";
-    }
-    return ss.str();
-}
-
-std::string to_string(const plain_graph_list& lists, const std::string& prefix) {
-    std::stringstream ss;
-    ss << "Graphs counts: " << lists.size();
-    size_t graph_num = 0;
-    for (const plain_graph& graph : lists) {
-        ss << "\n\t" << prefix << graph_num++ << "\t" << to_string(graph);
-    }
-    return ss.str();
-}
-
-std::string to_string(const colored_plain_graph& cont) {
-    std::stringstream ss;
-    for (const auto& id : cont) {
-        ss << id << ", ";
-    }
-    return ss.str();
-}
-
-std::string to_string(const colored_plain_graph_list& lists, const std::string& prefix) {
-    std::stringstream ss;
-    ss << "Graphs counts: " << lists.size();
-    size_t graph_num = 0;
-    for (const colored_plain_graph& graph : lists) {
-        ss << "\n\t" << prefix << graph_num++ << "\t" << to_string(graph);
-    }
-    return ss.str();
-}
-
-template <class composite_container>
-std::string to_string_impl(const composite_container& cont) {
-    std::stringstream ss;
-    ss << "Cluster size: " << cont.size();
-    for (const auto& process_graphs : cont) {
-        ss << "\nprx: " << process_graphs.first << "\n{\n"
-           << to_string(process_graphs.second, "\t") << "\n},";
-    }
-    return ss.str();
-}
-
-std::string to_string(const global_sorted_plain_graphs& cluster) {
-    return std::string("Sorted - ") + to_string_impl(cluster);
-}
-
-std::string to_string(const global_plain_graphs& cluster) {
-    return std::string("Plain - ") + to_string_impl(cluster);
-}
-
-std::string to_string(const global_sorted_colored_plain_graphs& cluster) {
-    return std::string("Sorted Colored - ") + to_string_impl(cluster);
-}
-
-std::string to_string(const global_plain_colored_graphs& cluster) {
-    return std::string("Plain Colored - ") + to_string_impl(cluster);
-}
-
-void fill_adjacency_matrix_for_single_device_in_devices_by_cond(
-    const native::ccl_device& left_device,
-    const ccl::device_index_type& lhs_index,
-    const ccl_device_driver::devices_storage_type& devices,
-    adjacency_matrix& matrix,
-    p2p_rating_function ping,
-    std::function<bool(const ccl::device_index_type&)> rhs_filter) {
-    //TODO - more elegant way is needed
-    //TODO measure latency as additional weight argument???
-    const auto& l_subdevices = left_device.get_subdevices();
-    if (!l_subdevices.empty()) {
-        for (const auto& lhs_sub_pair : l_subdevices) {
-            const auto& left_subdevice = *lhs_sub_pair.second;
-            const auto& lhs_sub_index = left_subdevice.get_device_path();
-
-            for (const auto& rhs_pair : devices) {
-                const auto& right_device = *rhs_pair.second;
-                const auto& rhs_index = right_device.get_device_path();
-
-                if (!rhs_filter or rhs_filter(rhs_index)) //check cond on right
-                {
-                    const auto& right_subdevices = right_device.get_subdevices();
-                    for (const auto& rhs_sub_pair : right_subdevices) {
-                        const auto& right_subdevice = *rhs_sub_pair.second;
-                        const auto& rhs_sub_index = right_subdevice.get_device_path();
-
-                        if (!rhs_filter or rhs_filter(rhs_sub_index)) //check cond on right
-                        {
-                            // across subdevices only
-                            matrix[lhs_sub_index][rhs_sub_index] =
-                                ping(left_subdevice, right_subdevice);
-                            matrix[rhs_sub_index][lhs_sub_index] =
-                                ping(right_subdevice, left_subdevice);
-                            // across left device & right subdevices only
-                            matrix[lhs_index][rhs_sub_index] = ping(left_device, right_subdevice);
-                            matrix[rhs_sub_index][lhs_index] = ping(right_subdevice, left_device);
-                        }
-                    }
-
-                    // across left sub devices & right device only
-                    matrix[lhs_sub_index][rhs_index] = ping(left_subdevice, right_device);
-                    matrix[rhs_index][lhs_sub_index] = ping(right_device, left_subdevice);
-
-                    // across left device & right device only
-                    matrix[lhs_index][rhs_index] = ping(left_device, right_device);
-                    matrix[rhs_index][lhs_index] = ping(right_device, left_device);
-                }
-            }
-        }
-    }
-    else {
-        for (const auto& rhs_pair : devices) {
-            const auto& right_device = *rhs_pair.second;
-            const auto& rhs_index = right_device.get_device_path();
-
-            if (!rhs_filter or rhs_filter(rhs_index)) //check cond on right
-            {
-                const auto& right_subdevices = right_device.get_subdevices();
-                for (const auto& rhs_sub_pair : right_subdevices) {
-                    const auto& right_subdevice = *rhs_sub_pair.second;
-                    const auto& rhs_sub_index = right_subdevice.get_device_path();
-
-                    if (!rhs_filter or rhs_filter(rhs_index)) //check cond on right
-                    {
-                        // across left device & right subdevices only
-                        matrix[lhs_index][rhs_sub_index] = ping(left_device, right_subdevice);
-                        matrix[rhs_sub_index][lhs_index] = ping(right_subdevice, left_device);
-
-                        // across left device & right subdevices only
-                        matrix[rhs_sub_index][lhs_index] = ping(right_subdevice, left_device);
-                        matrix[lhs_index][rhs_sub_index] = ping(left_device, right_subdevice);
-                    }
-                }
-
-                // across left device & right device only
-                matrix[lhs_index][rhs_index] = ping(left_device, right_device);
-                matrix[rhs_index][lhs_index] = ping(right_device, left_device);
-                // across left device & right device only
-                matrix[rhs_index][lhs_index] = ping(right_device, left_device);
-                matrix[lhs_index][rhs_index] = ping(left_device, right_device);
-            }
-        }
-    }
-}
-
-void fill_adjacency_matrix_for_single_device_in_devices(
-    const native::ccl_device& left_device,
-    const ccl::device_index_type& lhs_index,
-    const ccl_device_driver::devices_storage_type& devices,
-    adjacency_matrix& matrix,
-    p2p_rating_function ping) {
-    //TODO - more elegant way is needed
-    //TODO measure latency as additional weight argument???
-    const auto& l_subdevices = left_device.get_subdevices();
-    if (!l_subdevices.empty()) {
-        for (const auto& lhs_sub_pair : l_subdevices) {
-            const auto& left_subdevice = *lhs_sub_pair.second;
-            const auto& lhs_sub_index = left_subdevice.get_device_path();
-
-            for (const auto& rhs_pair : devices) {
-                const auto& right_device = *rhs_pair.second;
-                const auto& rhs_index = right_device.get_device_path();
-
-                const auto& right_subdevices = right_device.get_subdevices();
-                for (const auto& rhs_sub_pair : right_subdevices) {
-                    const auto& right_subdevice = *rhs_sub_pair.second;
-                    const auto& rhs_sub_index = right_subdevice.get_device_path();
-
-                    // across subdevices only
-                    matrix[lhs_sub_index][rhs_sub_index] = ping(left_subdevice, right_subdevice);
-
-                    // across left device & right subdevices only
-                    matrix[lhs_index][rhs_sub_index] = ping(left_device, right_subdevice);
-                }
-
-                // across left sub devices & right device only
-                matrix[lhs_sub_index][rhs_index] = ping(left_subdevice, right_device);
-
-                // across left device & right device only
-                matrix[lhs_index][rhs_index] = ping(left_device, right_device);
-            }
-        }
-    }
-    else {
-        for (const auto& rhs_pair : devices) {
-            const auto& right_device = *rhs_pair.second;
-            const auto& rhs_index = right_device.get_device_path();
-
-            const auto& right_subdevices = right_device.get_subdevices();
-            for (const auto& rhs_sub_pair : right_subdevices) {
-                const auto& right_subdevice = *rhs_sub_pair.second;
-                const auto& rhs_sub_index = right_subdevice.get_device_path();
-
-                // across left device & right subdevices only
-                matrix[lhs_index][rhs_sub_index] = ping(left_device, right_subdevice);
-            }
-
-            // across left device & right device only
-            matrix[lhs_index][rhs_index] = ping(left_device, right_device);
-        }
-    }
-}
-
-adjacency_matrix create_adjacency_matrix_for_devices(
-    const ccl_device_driver::devices_storage_type& devices,
-    p2p_rating_function ping) {
-    adjacency_matrix matrix;
-    for (const auto& lhs_pair : devices) {
-        const auto& left_device = *lhs_pair.second;
-        const auto& lhs_index = left_device.get_device_path();
-
-        fill_adjacency_matrix_for_single_device_in_devices_by_cond(
-            left_device, lhs_index, devices, matrix, ping);
-    }
-    return matrix;
-}
-
-plain_graph graph_resolver(const adjacency_matrix& matrix,
-                           const ccl::device_indices_type& device_indexes) {
-    plain_graph ids_ring;
-
-    std::multimap<ccl::device_index_type, bool> marked_indices;
-    std::transform(device_indexes.begin(),
-                   device_indexes.end(),
-                   std::inserter(marked_indices, marked_indices.end()),
-                   [](const ccl::device_index_type& idx) {
-                       return std::pair<ccl::device_index_type, bool>{ idx, false };
-                   });
-
-    ids_ring.push_back(marked_indices.begin()->first);
-    marked_indices.erase(marked_indices.begin());
-    try {
-        while (!marked_indices.empty()) {
-            auto it = marked_indices.begin();
-
-            //find next idx from elapsed
-            bool find = false;
-            for (; it != marked_indices.end(); ++it) {
-                if (it->second == true)
-                    continue; //skip dirty index
-
-                auto adjacencies_list_it = matrix.find(ids_ring.back());
-
-                //sanity check
-                if (adjacencies_list_it == matrix.end()) {
-                    throw std::runtime_error(std::string("Requested invalid device index: ") +
-                                             ccl::to_string(ids_ring.back()) +
-                                             ". Check adjacency_matrix construction");
-                }
-
-                const adjacency_list& device_adjacencies = adjacencies_list_it->second;
-
-                auto rating_it = device_adjacencies.find(it->first);
-                if (rating_it == device_adjacencies.end()) {
-                    throw std::runtime_error(std::string("Requested invalid adjacency index: ") +
-                                             ccl::to_string(it->first) + ", for parent device: " +
-                                             ccl::to_string(ids_ring.back()) +
-                                             ". Check adjacency_matrix construction");
-                }
-
-                detail::cross_device_rating rating = rating_it->second;
-                if (rating != 0) {
-                    //find next
-                    ids_ring.push_back(it->first);
-                    marked_indices.erase(it);
-                    find = true;
-                    break;
-                }
-            }
-
-            if (!find) //cannot find next node
-            {
-                /*if(ids_ring.empty())
-                {
-                    throw std::logic_error("qqq");
-                }*/
-                //the current device cannot communicate with any other
-                ccl::device_index_type idx = ids_ring.back();
-                ids_ring.pop_back();
-                if (ids_ring.empty()) {
-                    throw std::logic_error("No one device has no access to others");
-                }
-
-                //mark it as dirty
-                auto inserted_it = marked_indices.emplace(idx, true);
-                //get next device
-                std::for_each(
-                    inserted_it,
-                    marked_indices.end(),
-                    [](typename std::multimap<ccl::device_index_type, bool>::value_type& idx) {
-                        idx.second = false;
-                    });
-            }
-        }
-    }
-    catch (const std::exception& ex) {
-        std::cerr << __PRETTY_FUNCTION__ << " - exception: " << ex.what() << std::endl;
-        std::cerr << __PRETTY_FUNCTION__ << "Adjacencies matrix:\n" << matrix << std::endl;
-
-        abort();
-        return {};
-    }
-    return ids_ring;
-}
-
-plain_graph graph_resolver(const adjacency_matrix& matrix,
-                           const ccl::process_device_indices_type& per_process_device_indexes) {
-    plain_graph ids_ring;
-
-    for (const auto& thread_group_val : per_process_device_indexes) {
-        const auto& indices = thread_group_val;
-        auto group_devices = graph_resolver(matrix, indices.second);
-        ids_ring.insert(ids_ring.end(), group_devices.begin(), group_devices.end());
-    }
-    return ids_ring;
-}
-
-plain_graph graph_resolver(const adjacency_matrix& matrix,
-                           const ccl::process_aggregated_device_mask_t& per_process_device_masks) {
-    plain_graph ids_ring;
-
-    for (const auto& thread_group_val : per_process_device_masks) {
-        const auto& indices = ccl_device_driver::get_device_indices(thread_group_val.second);
-        auto group_devices = graph_resolver(matrix, indices);
-        ids_ring.insert(ids_ring.end(), group_devices.begin(), group_devices.end());
-    }
-    return ids_ring;
-}
-
-/* graph list creation utils */
-plain_graph_list graph_list_resolver(const adjacency_matrix& matrix,
-                                     const ccl::device_indices_type& device_indexes) {
-    plain_graph_list isles;
-
-    using marked_storage = std::multimap<ccl::device_index_type, bool>;
-    marked_storage marked_indices;
-    std::transform(device_indexes.begin(),
-                   device_indexes.end(),
-                   std::inserter(marked_indices, marked_indices.end()),
-                   [](const ccl::device_index_type& idx) {
-                       return std::pair<ccl::device_index_type, bool>{ idx, false };
-                   });
-
-    plain_graph cur_graph;
-    cur_graph.push_back(marked_indices.begin()->first);
-    marked_indices.erase(marked_indices.begin());
-
-    // maximization problem
-    using maximization_solution_data_slice = std::tuple<plain_graph, marked_storage>;
-    maximization_solution_data_slice max_slice(cur_graph, marked_indices);
-    enum { MAX_GRAPH, MAX_MARKED };
-
-    try {
-        while (!marked_indices.empty()) {
-            auto it = marked_indices.begin();
-
-            //find next idx from elapsed
-            bool find = false;
-            for (; it != marked_indices.end(); ++it) {
-                ccl::device_index_type index{};
-                bool marked{};
-
-                std::tie(index, marked) = *it;
-                if (marked) {
-                    continue; //skip dirty index
-                }
-
-                auto adjacencies_list_it = matrix.find(cur_graph.back());
-
-                //sanity check
-                if (adjacencies_list_it == matrix.end()) {
-                    throw std::runtime_error(std::string("Requested invalid device index: ") +
-                                             ccl::to_string(cur_graph.back()) +
-                                             ". Check adjacency_matrix construction");
-                }
-
-                const adjacency_list& device_adjacencies = adjacencies_list_it->second;
-
-                auto rating_it = device_adjacencies.find(index);
-                if (rating_it == device_adjacencies.end()) {
-                    throw std::runtime_error(
-                        std::string("Requested invalid adjacency index: ") + ccl::to_string(index) +
-                        ", for parent device: " + ccl::to_string(cur_graph.back()) +
-                        ". Check adjacency_matrix construction");
-                }
-
-                detail::cross_device_rating rating = rating_it->second;
-                if (rating != 0) {
-                    //find next
-                    cur_graph.push_back(index);
-                    marked_indices.erase(it);
-                    find = true;
-
-                    //update maximization data
-                    if (cur_graph.size() > std::get<MAX_GRAPH>(max_slice).size()) {
-                        std::get<MAX_GRAPH>(max_slice) = cur_graph;
-                        std::get<MAX_MARKED>(max_slice) = marked_indices;
-                    }
-                    break;
-                }
-            }
-
-            if (!find) //cannot find next node
-            {
-                //the current device cannot communicate with any other
-                ccl::device_index_type idx = cur_graph.back();
-                cur_graph.pop_back();
-                if (cur_graph.empty()) {
-                    //push the longest graph path into isles
-                    isles.push_back(std::get<MAX_GRAPH>(max_slice));
-
-                    // get current marked slice
-                    marked_indices = std::get<MAX_MARKED>(max_slice);
-
-                    // check end
-                    if (marked_indices.empty()) {
-                        return isles;
-                    }
-
-                    //reboot searching parameters
-                    cur_graph.push_back(marked_indices.begin()->first);
-                    marked_indices.erase(marked_indices.begin());
-                    std::get<MAX_GRAPH>(max_slice) = cur_graph;
-                    std::get<MAX_MARKED>(max_slice) = marked_indices;
-                }
-                else {
-                    //mark it as dirty
-                    auto inserted_it = marked_indices.emplace(idx, true);
-                    //get next device
-                    if (inserted_it != marked_indices.end()) {
-                        ++inserted_it;
-                        std::for_each(inserted_it,
-                                      marked_indices.end(),
-                                      [](typename std::multimap<ccl::device_index_type,
-                                                                bool>::value_type& idx) {
-                                          idx.second = false;
-                                      });
-                    }
-                }
-            }
-        }
-
-        //process last
-        if (!std::get<MAX_GRAPH>(max_slice).empty()) {
-            isles.push_back(std::get<MAX_GRAPH>(max_slice));
-        }
-    }
-    catch (const std::exception& ex) {
-        std::cerr << __PRETTY_FUNCTION__ << " - exception: " << ex.what() << std::endl;
-        std::cerr << __PRETTY_FUNCTION__ << "Adjacencies matrix:\n" << matrix << std::endl;
-
-        abort();
-        return {};
-    }
-    return isles;
-}
-
-template <class device_idx_container>
-struct index_extractor {
-    using T = device_idx_container;
-};
-
-template <>
-struct index_extractor<ccl::device_index_type> {
-    static const ccl::device_index_type& index(const ccl::device_index_type& in) {
-        return in;
-    }
-};
-
-template <>
-struct index_extractor<typename colored_plain_graph::value_type> {
-    static const ccl::device_index_type& index(const typename colored_plain_graph::value_type& in) {
-        return in.index;
-    }
-};
-
-template <template <class...> class container, class graph_list, class index_getter>
-graph_list merge_graphs_stable(const container<graph_list>& lists,
-                               detail::p2p_rating_function ping,
-                               index_getter get,
-                               bool brake_on_incompatible,
-                               bool to_right,
-                               size_t& merged_process_count) {
-    merged_process_count = 0;
-    graph_list isles;
-    for (const auto& group_graph_list : lists) {
-        // merge into single list
-        // first graph list becomes first
-        if (isles.empty()) {
-            isles = group_graph_list;
-            merged_process_count++;
-            continue;
-        }
-
-        graph_list list_to_merge;
-        for (auto graph_it = group_graph_list.begin(); graph_it != group_graph_list.end();
-             ++graph_it) {
-            const auto& graph = *graph_it;
-            if (graph.empty()) {
-                continue;
-            }
-
-            // find accessible pairs
-            bool merged = false;
-            const auto& graph_first_device =
-                get_platform().get_device(index_getter::index(*graph.begin()));
-            const auto& graph_last_device =
-                get_platform().get_device(index_getter::index(*graph.rbegin()));
-            for (auto total_graph_it = isles.begin(); total_graph_it != isles.end();
-                 ++total_graph_it) {
-                auto& total_graph = *total_graph_it;
-                if (total_graph.empty()) {
-                    total_graph.insert(total_graph.end(), graph.begin(), graph.end());
-                    merged = true;
-                    break;
-                }
-
-                const auto& total_graph_first_device =
-                    get_platform().get_device(index_getter::index(*total_graph.begin()));
-                const auto& total_graph_last_device =
-                    get_platform().get_device(index_getter::index(*total_graph.rbegin()));
-                if (to_right) {
-                    if (ping(*total_graph_last_device, *graph_first_device)) {
-                        total_graph.insert(total_graph.end(), graph.begin(), graph.end());
-                        merged = true;
-                        break;
-                    }
-                    else if (ping(*graph_last_device, *total_graph_first_device)) {
-                        auto tmp_graph = graph;
-                        tmp_graph.insert(tmp_graph.end(), total_graph.begin(), total_graph.end());
-                        total_graph.swap(tmp_graph);
-                        merged = true;
-                        break;
-                    }
-                }
-                else {
-                    if (ping(*graph_last_device, *total_graph_first_device)) {
-                        auto tmp_graph = graph;
-                        tmp_graph.insert(tmp_graph.end(), total_graph.begin(), total_graph.end());
-                        total_graph.swap(tmp_graph);
-                        merged = true;
-                        break;
-                    }
-                }
-            }
-
-            if (!merged) {
-                if (brake_on_incompatible) {
-                    return isles;
-                }
-                list_to_merge.push_back(graph);
-            }
-
-            merged_process_count++;
-        }
-        std::copy(list_to_merge.begin(), list_to_merge.end(), std::back_inserter(isles));
-    }
-    return isles;
-}
-
-bool check_graph_a2a_capable(const plain_graph& graph,
-                             const adjacency_matrix& matrix,
-                             std::ostream& out) {
-    bool a2a_capable = true;
-    size_t graph_power = graph.size();
-    out << __FUNCTION__ << " - graph power: " << graph_power << std::endl;
-    for (const ccl::device_index_type& lhs_index : graph) {
-        size_t device_power = 0;
-        auto m_it = matrix.find(lhs_index);
-        if (m_it == matrix.end()) {
-            std::stringstream ss;
-            ss << __FUNCTION__ << " - invalid control matrix: no device by "
-               << ccl::to_string(lhs_index);
-            out << ss.str();
-            throw std::runtime_error(ss.str());
-        }
-
-        const detail::adjacency_list& control_list = m_it->second;
-        for (const ccl::device_index_type& rhs_index : graph) {
-            auto c_it = control_list.find(rhs_index);
-            if (c_it != control_list.end() and c_it->second != 0) {
-                device_power++;
-            }
-        }
-
-        out << "device " << ccl::to_string(lhs_index)
-            << ", has connection point count: " << device_power << std::endl;
-        if (device_power != graph_power) {
-            a2a_capable = false;
-            break;
-        }
-    }
-    return a2a_capable;
-}
-
-plain_graph_list merge_graph_lists_stable(const std::list<plain_graph_list>& lists,
-                                          detail::p2p_rating_function ping,
-                                          bool brake_on_incompatible) {
-    size_t merged_process_count = 0;
-    return merge_graphs_stable(lists,
-                               ping,
-                               index_extractor<ccl::device_index_type>{},
-                               brake_on_incompatible,
-                               true,
-                               merged_process_count);
-}
-
-colored_plain_graph_list merge_graph_lists_stable(const std::list<colored_plain_graph_list>& lists,
-                                                  detail::p2p_rating_function ping,
-                                                  bool brake_on_incompatible) {
-    size_t merged_process_count = 0;
-    return merge_graphs_stable(lists,
-                               ping,
-                               index_extractor<typename colored_plain_graph::value_type>{},
-                               brake_on_incompatible,
-                               true,
-                               merged_process_count);
-}
-
-colored_plain_graph_list merge_graph_lists_stable_for_process(
-    const std::list<colored_plain_graph_list>& lists,
-    detail::p2p_rating_function ping,
-    bool to_right,
-    size_t& merged_process_count) {
-    return merge_graphs_stable(lists,
-                               ping,
-                               index_extractor<typename colored_plain_graph::value_type>{},
-                               true,
-                               to_right,
-                               merged_process_count);
-}
-
-plain_graph_list graph_list_resolver(
-    const adjacency_matrix& matrix,
-    const ccl::process_device_indices_type& per_process_device_indexes,
-    detail::p2p_rating_function ping) {
-    std::list<plain_graph_list> lists;
-    for (const auto& thread_group_val : per_process_device_indexes) {
-        lists.emplace_back(graph_list_resolver(matrix, thread_group_val.second));
-    }
-    return merge_graph_lists_stable(lists, ping);
-}
-/*
-        // merge into single list
-        if (isles.empty())
-        {
-            isles.swap(group_graph_list);
-            continue;
-        }
-        plain_graph_list list_to_merge;
-        for(auto graph_it = group_graph_list.begin(); graph_it != group_graph_list.end(); ++graph_it)
-        {
-            plain_graph& graph = *graph_it;
-            if (graph.empty())
-            {
-                continue;
-            }
-            // find accessible pairs
-            bool merged = false;
-            const auto& graph_first_device = get_platform().get_device(*graph.begin());
-            const auto& graph_last_device = get_platform().get_device(*graph.rbegin());
-            for(auto total_graph_it = isles.begin(); total_graph_it != isles.end(); ++total_graph_it)
-            {
-                plain_graph& total_graph = *total_graph_it;
-                if (total_graph.empty())
-                {
-                     total_graph.insert(total_graph.end(), graph.begin(), graph.end());
-                     merged = true;
-                     break;
-                }
-                const auto& total_graph_first_device = get_platform().get_device(*total_graph.begin());
-                const auto& total_graph_last_device = get_platform().get_device(*total_graph.rbegin());
-                if (ping(*graph_first_device, *total_graph_last_device))
-                {
-                    total_graph.insert(total_graph.end(), graph.begin(), graph.end());
-                    merged = true;
-                    break;
-                }
-                else if(ping(*graph_last_device, *total_graph_first_device))
-                {
-                    graph.insert(graph.end(), total_graph.begin(), total_graph.end());
-                    total_graph.swap(graph);
-                    merged = true;
-                    break;
-                }
-            }
-            if (!merged)
-            {
-                list_to_merge.push_back(graph);
-            }
-        }
-        std::copy(list_to_merge.begin(), list_to_merge.end(), std::back_inserter(isles));
-    }
-    return isles;
-}
-*/
-plain_graph_list graph_list_resolver(
-    const adjacency_matrix& matrix,
-    const ccl::process_aggregated_device_mask_t& per_process_device_masks) {
-    plain_graph_list isles;
-    return isles;
-}
-
-void reset_color(colored_plain_graph_list& list, color_t new_color) {
-    for (auto& graph : list) {
-        for (colored_idx& idx : graph) {
-            idx.color = new_color;
-        }
-    }
-}
-} // namespace detail
-} // namespace native
diff --git a/src/common/comm/l0/topology/topology_construction_utils.hpp b/src/common/comm/l0/topology/topology_construction_utils.hpp
deleted file mode 100644
index e8fb4d3ec..000000000
--- a/src/common/comm/l0/topology/topology_construction_utils.hpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <list>
-
-#include "common/comm/l0/device_community.hpp"
-#include "common/comm/l0/gpu_device_types.hpp"
-#include "common/comm/l0/topology/topology_creator.hpp"
-#include "common/comm/l0/topology/topology_declarations.hpp"
-#include "oneapi/ccl/native_device_api/l0/utils.hpp"
-
-class device_group_router;
-#define DEVICE_GROUP_WEIGHT  9
-#define THREAD_GROUP_WEIGHT  5
-#define PROCESS_GROUP_WEIGHT 2
-
-namespace native {
-struct process_group_context;
-struct thread_group_context;
-struct device_group_context;
-struct device_storage;
-struct ccl_device;
-
-namespace detail {
-
-adjacency_matrix create_adjacency_matrix_for_devices(
-    const ccl_device_driver::devices_storage_type& devices,
-    p2p_rating_function ping);
-
-void fill_adjacency_matrix_for_single_device_in_devices(
-    const native::ccl_device& lhs_device,
-    const ccl::device_index_type& lhs_index,
-    const ccl_device_driver::devices_storage_type& devices,
-    adjacency_matrix& matrix,
-    p2p_rating_function ping);
-
-void fill_adjacency_matrix_for_single_device_in_devices_by_cond(
-    const native::ccl_device& lhs_device,
-    const ccl::device_index_type& lhs_index,
-    const ccl_device_driver::devices_storage_type& devices,
-    adjacency_matrix& matrix,
-    p2p_rating_function ping,
-    std::function<bool(const ccl::device_index_type&)> rhs_filter =
-        std::function<bool(const ccl::device_index_type&)>());
-
-plain_graph graph_resolver(const adjacency_matrix& matrix,
-                           const ccl::device_indices_type& device_indexes);
-plain_graph graph_resolver(const adjacency_matrix& matrix,
-                           const ccl::process_device_indices_type& per_process_device_indexes);
-plain_graph graph_resolver(const adjacency_matrix& matrix,
-                           const ccl::process_aggregated_device_mask_t& per_process_device_masks);
-
-plain_graph_list graph_list_resolver(const adjacency_matrix& matrix,
-                                     const ccl::device_indices_type& device_indexes);
-plain_graph_list graph_list_resolver(
-    const adjacency_matrix& matrix,
-    const ccl::process_device_indices_type& per_process_device_indexes,
-    detail::p2p_rating_function ping);
-
-plain_graph_list graph_list_resolver(
-    const adjacency_matrix& matrix,
-    const ccl::process_aggregated_device_mask_t& per_process_device_masks);
-
-bool check_graph_a2a_capable(const plain_graph& graph,
-                             const adjacency_matrix& matrix,
-                             std::ostream& out);
-
-plain_graph_list merge_graph_lists_stable(const std::list<plain_graph_list>& lists,
-                                          detail::p2p_rating_function ping,
-                                          bool brake_on_incompatible = false);
-
-colored_plain_graph_list merge_graph_lists_stable(const std::list<colored_plain_graph_list>& lists,
-                                                  detail::p2p_rating_function ping,
-                                                  bool brake_on_incompatible = false);
-colored_plain_graph_list merge_graph_lists_stable_for_process(
-    const std::list<colored_plain_graph_list>& lists,
-    detail::p2p_rating_function ping,
-    bool to_right,
-    size_t& merged_process_count);
-
-size_t property_p2p_rating_calculator(const native::ccl_device& lhs,
-                                      const native::ccl_device& rhs,
-                                      size_t weight);
-
-void reset_color(colored_plain_graph_list& list, color_t new_color);
-} // namespace detail
-std::ostream& operator<<(std::ostream& out, const detail::adjacency_matrix& matrix);
-
-} // namespace native
diff --git a/src/common/comm/l0/topology/topology_creator.hpp b/src/common/comm/l0/topology/topology_creator.hpp
deleted file mode 100644
index 5a3dd1ca6..000000000
--- a/src/common/comm/l0/topology/topology_creator.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-namespace native {
-namespace detail {
-
-template <class F>
-struct device_group_container_functor {
-    template <class... Args>
-    device_group_container_functor(Args&&... args) : operation(std::forward<Args>(args)...) {}
-
-    template <class device_container_t>
-    void operator()(device_container_t& container) {
-        operation(container);
-    }
-    F& get_functor() {
-        return operation;
-    }
-
-private:
-    F operation;
-};
-} // namespace detail
-
-template <class F, class... Args>
-detail::device_group_container_functor<F> create_device_functor(Args&&... args) {
-    return detail::device_group_container_functor<F>(std::forward<Args>(args)...);
-}
-} // namespace native
diff --git a/src/common/comm/l0/topology/topology_declarations.hpp b/src/common/comm/l0/topology/topology_declarations.hpp
deleted file mode 100644
index f434fa293..000000000
--- a/src/common/comm/l0/topology/topology_declarations.hpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include <list>
-#include <map>
-#include <vector>
-
-#include "oneapi/ccl/native_device_api/l0/utils.hpp"
-
-namespace native {
-namespace detail {
-struct marked_idx : std::pair<bool, ccl::device_index_type> {
-    marked_idx(bool m, ccl::device_index_type i) : std::pair<bool, ccl::device_index_type>(m, i) {}
-};
-
-using color_t = size_t; //consider std::optional
-
-struct colored_index {
-    colored_index(color_t c, const ccl::device_index_type& i) : color(c), index(i) {}
-    color_t color;
-    ccl::device_index_type index;
-
-    bool operator==(const colored_index& rhs) const noexcept {
-        return (color == rhs.color) and (index == rhs.index);
-    }
-};
-
-template <class data_t>
-struct colored_indexed_data : public colored_index {
-    using payload_t = data_t;
-
-    colored_indexed_data(color_t c,
-                         const ccl::device_index_type& i,
-                         const payload_t& t = payload_t{})
-            : colored_index(c, i),
-              payload(t) {}
-
-    const payload_t& get_payload() const {
-        return payload;
-    }
-
-    std::string to_string() const {
-        std::stringstream ss;
-        ss << "color: " << color << ", index:" << index << ", data: "
-           << "STUB";
-        ;
-        return ss.str();
-    }
-
-private:
-    payload_t payload;
-};
-
-template <>
-struct colored_indexed_data<void> : public colored_index {
-    colored_indexed_data(color_t c, const ccl::device_index_type& i) : colored_index(c, i) {}
-
-    std::string to_string() const {
-        std::stringstream ss;
-        ss << "color: " << color << ", index:" << index;
-        return ss.str();
-    }
-};
-
-using colored_idx = colored_indexed_data<void>;
-
-using plain_graph = std::vector<ccl::device_index_type>;
-using plain_graph_list = std::list<plain_graph>;
-using colored_plain_graph = std::vector<colored_idx>;
-using colored_plain_graph_list = std::list<colored_plain_graph>;
-
-using process_index_t = size_t;
-using global_sorted_plain_graphs = std::map<process_index_t, plain_graph_list>;
-using global_plain_graphs = std::vector<std::pair<process_index_t, plain_graph_list>>;
-using global_sorted_colored_plain_graphs = std::map<process_index_t, colored_plain_graph_list>;
-using global_plain_colored_graphs =
-    std::vector<std::pair<process_index_t, colored_plain_graph_list>>;
-using global_colored_plain_graphs = global_plain_colored_graphs;
-
-std::string to_string(const plain_graph& cont);
-std::string to_string(const plain_graph_list& lists, const std::string& prefix = std::string());
-std::string to_string(const global_sorted_plain_graphs& cluster);
-std::string to_string(const global_plain_graphs& cluster);
-std::string to_string(const colored_plain_graph& cont);
-std::string to_string(const colored_plain_graph_list& lists,
-                      const std::string& prefix = std::string());
-std::string to_string(const global_sorted_colored_plain_graphs& cluster);
-std::string to_string(const global_plain_colored_graphs& cluster);
-
-std::ostream& operator<<(std::ostream& out, const colored_idx& idx);
-} // namespace detail
-
-template <class payload_type>
-std::ostream& operator<<(std::ostream& out,
-                         const detail::colored_indexed_data<payload_type>& data) {
-    out << data.to_string();
-    return out;
-}
-} // namespace native
diff --git a/src/common/comm/l0/topology/topology_serializer.cpp b/src/common/comm/l0/topology/topology_serializer.cpp
deleted file mode 100644
index dc588075a..000000000
--- a/src/common/comm/l0/topology/topology_serializer.cpp
+++ /dev/null
@@ -1,574 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "common/comm/l0/topology/topology_serializer.hpp"
-
-namespace native {
-namespace detail {
-namespace serialize {
-device_path_serializable::raw_data_t device_path_serializable::result() {
-    return data;
-}
-
-device_path_serializer::device_path_serializer(size_t expected_devices,
-                                               size_t data_offset,
-                                               size_t stride)
-        : base(),
-          expected_capacity(expected_devices * (device_index_size() + stride) + data_offset),
-          stride_bytes(stride)
-
-{
-    data.reserve(expected_capacity);
-
-    //fill preambule by zeros
-    for (size_t i = 0; i < data_offset; i++) {
-        data.push_back(0);
-    }
-}
-
-template <class T>
-device_path_serializer::raw_data_t device_path_serializer::serialize_indices_impl(
-    const std::list<T>& list,
-    size_t data_offset) {
-    std::list<raw_data_t> serialized_list;
-    size_t list_size = list.size();
-
-    size_t total_size = sizeof(list_size) + data_offset;
-    for (const auto& graph : list) {
-        size_t graph_count = graph.size();
-        raw_data_t serialized_graph =
-            device_path_serializer::serialize_indices(graph, sizeof(graph_count));
-        //copy graph count into preambule to recover multiple graphs
-        memcpy(serialized_graph.data(), &graph_count, sizeof(graph_count));
-
-        total_size += serialized_graph.size();
-        serialized_list.push_back(std::move(serialized_graph));
-    }
-
-    raw_data_t total_data;
-    total_data.reserve(total_size); //graphs with preambules + list size;
-
-    //fill global preambule: list size
-    for (size_t i = 0; i < data_offset + sizeof(list_size); i++) {
-        total_data.push_back(0);
-    }
-    memcpy(total_data.data() + data_offset, &list_size, sizeof(list_size));
-
-    //use std::accumulate in c++20
-    for (const raw_data_t& data : serialized_list) {
-        std::copy(data.begin(), data.end(), std::back_inserter(total_data));
-    }
-
-    /* [data_offset] [graphs_count] [graph_size_0] [graph_data_0] [graph_size_1] [graph_data_1] ... */
-    return total_data;
-}
-
-template <class T>
-device_path_serializer::raw_data_t device_path_serializer::serialize_indices_impl(
-    const std::map<size_t, T>& list) {
-    std::list<raw_data_t> serialized_list;
-    size_t cluster_size = list.size();
-
-    size_t total_size = sizeof(cluster_size); //preambule size
-    for (const auto& process_graph_list : list) {
-        raw_data_t serialized_graph = device_path_serializer::serialize_indices(
-            process_graph_list.second, sizeof(process_graph_list.first));
-        /* [process_id] [graphs_count] [graph_size_0] [graph_data_0] [graph_size_1] [graph_data_1] ... */
-        memcpy(
-            serialized_graph.data(), &process_graph_list.first, sizeof(process_graph_list.first));
-
-        total_size += serialized_graph.size();
-        serialized_list.push_back(std::move(serialized_graph));
-    }
-
-    raw_data_t total_data;
-    total_data.reserve(total_size); //process graphs with preambules + cluster size;
-
-    //fill global preambule: list size
-    for (size_t i = 0; i < sizeof(cluster_size); i++) {
-        total_data.push_back(0);
-    }
-    memcpy(total_data.data(), &cluster_size, sizeof(cluster_size));
-
-    //use std::accumulate in c++20
-    for (const raw_data_t& data : serialized_list) {
-        std::copy(data.begin(), data.end(), std::back_inserter(total_data));
-    }
-
-    /* [cluster_size] [process_id] [graphs_count] [graph_size_0] [graph_data_0] [graph_size_1] [graph_data_1] ... */
-    return total_data;
-}
-
-device_path_serializable::raw_data_t device_path_serializer::serialize_indices(
-    const detail::plain_graph_list& list,
-    size_t data_offset) {
-    /*
-    std::list<raw_data_t> serialized_list;
-    size_t list_size = list.size();
-    size_t total_size = sizeof(list_size) + data_offset;
-    for (const detail::plain_graph& graph : list)
-    {
-        size_t graph_count = graph.size();
-        raw_data_t serialized_graph = device_path_serializer::serialize_indices(graph,
-                                                                                sizeof(graph_count));
-        //copy graph count into preambule to recover multiple graphs
-        memcpy(serialized_graph.data(), &graph_count, sizeof(graph_count));
-        total_size += serialized_graph.size();
-        serialized_list.push_back(std::move(serialized_graph));
-    }
-    raw_data_t total_data;
-    total_data.reserve(total_size); //graphs with preambules + list size;
-    //fill global preambule: list size
-    for(size_t i = 0; i < data_offset + sizeof(list_size); i++)
-    {
-        total_data.push_back(0);
-    }
-    memcpy(total_data.data() + data_offset, &list_size, sizeof(list_size));
-    //use std::accumulate in c++20
-    for (const raw_data_t& data: serialized_list)
-    {
-        std::copy(data.begin(), data.end(), std::back_inserter(total_data));
-    }
-    / * [data_offset] [graphs_count] [graph_size_0] [graph_data_0] [graph_size_1] [graph_data_1] ... * /
-    return total_data;*/
-    return device_path_serializer::serialize_indices_impl(list, data_offset);
-}
-
-device_path_serializable::raw_data_t device_path_serializer::serialize_indices(
-    const detail::global_sorted_plain_graphs& list) {
-    /*std::list<raw_data_t> serialized_list;
-    size_t cluster_size = list.size();
-    size_t total_size = sizeof(cluster_size); //preambule size
-    for (const auto& process_graph_list : list)
-    {
-        raw_data_t serialized_graph =
-                device_path_serializer::serialize_indices(process_graph_list.second,
-                                                          sizeof(process_graph_list.first));
-        / * [process_id] [graphs_count] [graph_size_0] [graph_data_0] [graph_size_1] [graph_data_1] ... * /
-        memcpy(serialized_graph.data(), &process_graph_list.first, sizeof(process_graph_list.first));
-        total_size += serialized_graph.size();
-        serialized_list.push_back(std::move(serialized_graph));
-    }
-    raw_data_t total_data;
-    total_data.reserve(total_size); //process graphs with preambules + cluster size;
-    //fill global preambule: list size
-    for(size_t i = 0; i < sizeof(cluster_size); i++)
-    {
-        total_data.push_back(0);
-    }
-    memcpy(total_data.data(), &cluster_size, sizeof(cluster_size));
-    //use std::accumulate in c++20
-    for (const raw_data_t& data: serialized_list)
-    {
-        std::copy(data.begin(), data.end(), std::back_inserter(total_data));
-    }
-    / * [cluster_size] [process_id] [graphs_count] [graph_size_0] [graph_data_0] [graph_size_1] [graph_data_1] ... * /
-    return total_data;
-    */
-    return device_path_serializer::serialize_indices_impl(list);
-}
-
-device_path_serializable::raw_data_t device_path_serializer::serialize_indices(
-    const detail::colored_plain_graph_list& list,
-    size_t offset) {
-    return device_path_serializer::serialize_indices_impl(list, offset);
-}
-
-device_path_serializable::raw_data_t device_path_serializer::serialize_indices(
-    const detail::global_sorted_colored_plain_graphs& list) {
-    return device_path_serializer::serialize_indices_impl(list);
-}
-
-/* Deserializer */
-template <class T>
-std::list<T> device_path_deserializer::deserialize_generic_indices_list_impl(
-    const raw_data_t& data,
-    size_t& deserialized_bytes_count,
-    size_t offset,
-    size_t stride) {
-    std::list<T> list;
-    size_t list_size = 0;
-
-    // preconditions
-    if (data.size() < sizeof(list_size) + offset) {
-        throw std::runtime_error(std::string(__FUNCTION__) +
-                                 " - too short data size: " + std::to_string(data.size()) +
-                                 ", expected: " + std::to_string(sizeof(list_size)) +
-                                 ", with offset: " + std::to_string(offset));
-    }
-    memcpy(&list_size, data.data() + offset, sizeof(list_size));
-
-    auto data_it = data.begin();
-    std::advance(data_it, offset + sizeof(list_size));
-    deserialized_bytes_count += sizeof(list_size);
-
-    size_t deserialized_graphs_count = 0;
-    for (; data_it != data.end() and deserialized_graphs_count < list_size;) {
-        //get graph_size
-        size_t graph_size = 0;
-        size_t elapsed_byte_count = std::distance(data_it, data.end());
-        size_t expected_count = sizeof(graph_size);
-        if (elapsed_byte_count < expected_count) {
-            throw std::runtime_error(std::string(__FUNCTION__) +
-                                     " - Cannot extract graph_size, too short data elapsed: " +
-                                     std::to_string(elapsed_byte_count) +
-                                     ", expected: " + std::to_string(expected_count) +
-                                     ". initial data size: " + std::to_string(data.size()) +
-                                     ", with offset: " + std::to_string(offset));
-        }
-        memcpy(&graph_size, &(*data_it), expected_count);
-        std::advance(data_it, expected_count);
-        deserialized_bytes_count += expected_count;
-
-        //get graph_data
-        elapsed_byte_count = std::distance(data_it, data.end());
-        expected_count = (device_path_serializable::device_index_size() + stride) * graph_size;
-        if (elapsed_byte_count < expected_count) {
-            throw std::runtime_error(std::string(__FUNCTION__) +
-                                     " - Cannot extract graph_data, too short data elapsed: " +
-                                     std::to_string(elapsed_byte_count) +
-                                     ", expected: " + std::to_string(expected_count) +
-                                     ". initial data size: " + std::to_string(data.size()) +
-                                     ", with offset: " + std::to_string(offset));
-        }
-
-        //deserialize graph portion
-        auto data_end_it = data_it;
-        std::advance(data_end_it, expected_count);
-
-        T graph =
-            device_path_deserializer::deserialize_indices<std::vector, typename T::value_type>(
-                data_it, data_end_it, stride);
-
-        data_it = data_end_it;
-        deserialized_bytes_count += expected_count;
-
-        list.push_back(std::move(graph));
-        deserialized_graphs_count++;
-    }
-
-    // postconditions
-    if (list.size() != list_size) {
-        throw std::runtime_error(
-            std::string(__FUNCTION__) + " - unexpected deserilized graphs count: " +
-            std::to_string(list.size()) + ", expected: " + std::to_string(list_size));
-    }
-    return list;
-}
-
-template <class T>
-std::map<size_t, T> device_path_deserializer::deserialize_generic_indices_map_impl(
-    const raw_data_t& data,
-    size_t stride) {
-    std::map<size_t, T> global;
-    size_t global_size = 0;
-
-    // preconditions
-    if (data.size() < sizeof(global_size)) {
-        throw std::runtime_error(std::string(__FUNCTION__) +
-                                 " - too short data size: " + std::to_string(data.size()) +
-                                 ", expected: " + std::to_string(sizeof(global_size)));
-    }
-    memcpy(&global_size, data.data(), sizeof(global_size));
-
-    auto data_it = data.begin();
-    std::advance(data_it, sizeof(global_size));
-
-    size_t deserialized_processes_count = 0;
-    for (; data_it != data.end();) {
-        //get process
-        size_t process_id = 0;
-        size_t elapsed_byte_count = std::distance(data_it, data.end());
-        size_t expected_count = sizeof(process_id);
-        if (elapsed_byte_count < expected_count) {
-            throw std::runtime_error(std::string(__FUNCTION__) +
-                                     " - Cannot extract process_id, too short data elapsed: " +
-                                     std::to_string(elapsed_byte_count) +
-                                     ", expected: " + std::to_string(expected_count) +
-                                     ". initial data size: " + std::to_string(data.size()));
-        }
-        memcpy(&process_id, &(*data_it), expected_count);
-        std::advance(data_it, expected_count);
-
-        //get graph_data for process
-        size_t process_deserialized_count = 0;
-        T process_list = device_path_deserializer::template deserialize_generic_indices_list_impl<
-            typename T::value_type>(
-            raw_data_t(data_it, data.end()), process_deserialized_count, 0, stride);
-        std::advance(data_it, process_deserialized_count);
-        if (!global.emplace(process_id, std::move(process_list)).second) {
-            throw std::runtime_error(
-                std::string(__FUNCTION__) +
-                " - Cannot insert deserialized graphs list for process indx: " +
-                std::to_string(process_id));
-        }
-
-        deserialized_processes_count++;
-    }
-
-    // postconditions
-    if (global.size() != global_size) {
-        throw std::runtime_error(
-            std::string(__FUNCTION__) + " - unexpected deserialized cluster graphs count: " +
-            std::to_string(global.size()) + ", expected: " + std::to_string(global_size));
-    }
-
-    return global;
-}
-
-detail::plain_graph_list device_path_deserializer::deserialize_graph_list_indices(
-    const raw_data_t& data,
-    size_t& deserialized_bytes_count,
-    size_t offset) {
-    return device_path_deserializer::deserialize_generic_indices_list_impl<
-        typename detail::plain_graph_list::value_type>(data, deserialized_bytes_count, offset, 0);
-    /*detail::plain_graph_list list;
-    size_t list_size = 0;
-    // preconditions
-    if (data.size() < sizeof(list_size) + offset)
-    {
-        throw std::runtime_error(std::string(__FUNCTION__) + " - too short data size: " +
-                                 std::to_string(data.size()) +
-                                 ", expected: " + std::to_string(sizeof(list_size)) +
-                                 ", with offset: " + std::to_string(offset));
-    }
-    memcpy(&list_size, data.data() + offset, sizeof(list_size));
-    auto data_it = data.begin();
-    std::advance(data_it, offset + sizeof(list_size));
-    deserialized_bytes_count += sizeof(list_size);
-    size_t deserialized_graphs_count = 0;
-    for ( ; data_it != data.end() and deserialized_graphs_count < list_size; )
-    {
-        //get graph_size
-        size_t graph_size = 0;
-        size_t elapsed_byte_count = std::distance(data_it, data.end());
-        size_t expected_count = sizeof(graph_size);
-        if (elapsed_byte_count < expected_count)
-        {
-            throw std::runtime_error(std::string(__FUNCTION__) +
-                                     " - Cannot extract graph_size, too short data elapsed: " +
-                                     std::to_string(elapsed_byte_count) +
-                                     ", expected: " + std::to_string(expected_count) +
-                                     ". initial data size: " + std::to_string(data.size()) +
-                                     ", with offset: " + std::to_string(offset));
-        }
-        memcpy(&graph_size, &(*data_it), expected_count);
-        std::advance(data_it, expected_count);
-        deserialized_bytes_count += expected_count;
-        //get graph_data
-        elapsed_byte_count = std::distance(data_it, data.end());
-        expected_count = device_path_serializable::device_index_size() * graph_size;
-        if (elapsed_byte_count < expected_count)
-        {
-            throw std::runtime_error(std::string(__FUNCTION__) +
-                                     " - Cannot extract graph_data, too short data elapsed: " +
-                                     std::to_string(elapsed_byte_count) +
-                                     ", expected: " + std::to_string(expected_count) +
-                                     ". initial data size: " + std::to_string(data.size()) +
-                                     ", with offset: " + std::to_string(offset));
-        }
-        //deserialize graph_data
-        detail::plain_graph graph;
-        graph.reserve(graph_size);
-        //deserialize graph portion
-        auto data_end_it = data_it;
-        std::advance(data_end_it, expected_count);
-        for (size_t elem_index = 0; elem_index < graph_size; elem_index++)
-        {
-            graph.insert(graph.end(), device_path_deserializer::extract_index(data_it,
-                                                                              data_it + device_path_serializable::device_index_size()));
-            std::advance(data_it, device_path_serializable::device_index_size());
-            deserialized_bytes_count += device_path_serializable::device_index_size();
-        }
-        list.push_back(std::move(graph));
-        deserialized_graphs_count ++;
-    }
-    // postconditions
-    if (list.size() != list_size)
-    {
-         throw std::runtime_error(std::string(__FUNCTION__) + " - unexpected deserilized graphs count: " +
-                                  std::to_string(list.size()) + ", expected: " + std::to_string(list_size));
-    }
-    return list;*/
-}
-
-detail::global_sorted_plain_graphs device_path_deserializer::deserialize_global_graph_list_indices(
-    const raw_data_t& data) {
-    return device_path_deserializer::template deserialize_generic_indices_map_impl<
-        typename detail::global_sorted_plain_graphs::mapped_type>(data, 0);
-    /*
-    detail::global_sorted_plain_graphs global;
-    size_t global_size = 0;
-    size_t deserialized_bytes_count = 0;
-    // preconditions
-    if (data.size() < sizeof(global_size))
-    {
-        throw std::runtime_error(std::string(__FUNCTION__) + " - too short data size: " +
-                                 std::to_string(data.size()) +
-                                 ", expected: " + std::to_string(sizeof(global_size)));
-    }
-    memcpy(&global_size, data.data(), sizeof(global_size));
-    auto data_it = data.begin();
-    std::advance(data_it, sizeof(global_size));
-    deserialized_bytes_count += sizeof(global_size);
-    size_t deserialized_processes_count = 0;
-    for ( ; data_it != data.end(); )
-    {
-        //get process
-        size_t process_id = 0;
-        size_t elapsed_byte_count = std::distance(data_it, data.end());
-        size_t expected_count = sizeof(process_id);
-        if (elapsed_byte_count < expected_count)
-        {
-            throw std::runtime_error(std::string(__FUNCTION__) +
-                                     " - Cannot extract process_id, too short data elapsed: " +
-                                     std::to_string(elapsed_byte_count) +
-                                     ", expected: " + std::to_string(expected_count) +
-                                     ". initial data size: " + std::to_string(data.size()));
-        }
-        memcpy(&process_id, &(*data_it), expected_count);
-        std::advance(data_it, expected_count);
-        deserialized_bytes_count += expected_count;
-        //get graph_data for process
-        size_t process_deserialized_count = 0;
-        detail::plain_graph_list process_list =
-                device_path_deserializer::deserialize_graph_list_indices(raw_data_t(data_it,
-                                                                                    data.end()),
-                                                                         process_deserialized_count);
-        std::advance(data_it, process_deserialized_count);
-        deserialized_bytes_count += process_deserialized_count;
-        if (!global.emplace(process_id, std::move(process_list)).second)
-        {
-            throw std::runtime_error(std::string(__FUNCTION__) +
-                                     " - Cannot insert deserialized graphs list for process indx: " +
-                                     std::to_string(process_id));
-        }
-        deserialized_processes_count++;
-    }
-    // postconditions
-    if (global.size() != global_size)
-    {
-         throw std::runtime_error(std::string(__FUNCTION__) +
-                                  " - unexpected deserialized cluster graphs count: " +
-                                  std::to_string(global.size()) +
-                                  ", expected: " + std::to_string(global_size));
-    }
-    return global;
-    */
-}
-
-detail::colored_plain_graph_list device_path_deserializer::deserialize_colored_graph_list_indices(
-    const raw_data_t& list,
-    size_t& deserialized_bytes_count,
-    size_t offset) {
-    return device_path_deserializer::deserialize_generic_indices_list_impl<
-        typename detail::colored_plain_graph_list::value_type>(
-        list,
-        deserialized_bytes_count,
-        offset,
-        sizeof(detail::colored_idx) - sizeof(ccl::device_index_type));
-}
-
-detail::global_sorted_colored_plain_graphs
-device_path_deserializer::deserialize_global_colored_graph_list_indices(const raw_data_t& list) {
-    return device_path_deserializer::deserialize_generic_indices_map_impl<
-        typename detail::global_sorted_colored_plain_graphs::mapped_type>(
-        list, sizeof(detail::colored_idx) - sizeof(ccl::device_index_type));
-}
-
-detail::colored_idx device_path_deserializer::extract_index(raw_data_t::const_iterator it_begin,
-                                                            raw_data_t::const_iterator it_end,
-                                                            std::false_type raw_index) {
-    constexpr size_t color_size = sizeof(detail::color_t);
-    constexpr size_t stride = sizeof(detail::colored_idx) - sizeof(ccl::device_index_type);
-    if (std::distance(it_begin, it_end) %
-            (device_path_serializable::device_index_size() + stride) !=
-        0) {
-        assert(false && "Unexpected data bytes count!");
-        throw std::runtime_error(
-            std::string("Unexpected deserializing data bytes count: ") +
-            std::to_string(std::distance(it_begin, it_end)) + ", is not divided by:" +
-            std::to_string(device_path_serializable::device_index_size() + stride));
-    }
-
-    detail::color_t color = 0;
-    memcpy(&color, &(*it_begin), color_size);
-    std::advance(it_begin, stride);
-
-    return detail::colored_idx(
-        color, device_path_deserializer::extract_index(it_begin, it_end, std::true_type{}));
-}
-
-ccl::device_index_type device_path_deserializer::extract_index(raw_data_t::const_iterator it_begin,
-                                                               raw_data_t::const_iterator it_end,
-                                                               std::true_type raw_index) {
-    if ((std::distance(it_begin, it_end) % device_path_serializable::device_index_size()) != 0) {
-        assert(false && "Unexpected data bytes count!");
-        throw std::runtime_error(
-            std::string("Unexpected deserializing data bytes count: ") +
-            std::to_string(std::distance(it_begin, it_end)) +
-            ", is not divided by:" + std::to_string(device_path_serializable::device_index_size()));
-    }
-
-    ccl::device_index_type path;
-    for (auto raw_data_it = it_begin; raw_data_it != it_end;) {
-        ccl::index_type index;
-        std::copy(raw_data_it,
-                  raw_data_it + device_path_serializable::index_size(),
-                  reinterpret_cast<unsigned char*>(&index));
-        raw_data_it += device_path_serializable::index_size();
-        std::get<ccl::device_index_enum::driver_index_id>(path) = index;
-
-        std::copy(raw_data_it,
-                  raw_data_it + device_path_serializable::index_size(),
-                  reinterpret_cast<unsigned char*>(&index));
-        raw_data_it += device_path_serializable::index_size();
-        std::get<ccl::device_index_enum::device_index_id>(path) = index;
-
-        std::copy(raw_data_it,
-                  raw_data_it + device_path_serializable::index_size(),
-                  reinterpret_cast<unsigned char*>(&index));
-        raw_data_it += device_path_serializable::index_size();
-        std::get<ccl::device_index_enum::subdevice_index_id>(path) = index;
-    }
-    return path;
-}
-/*
-ccl::device_indices_type device_path_deserializer::operator()(const std::vector<unsigned char>& raw_data)
-{
-    size_t elem_count = base::get_indices_count(raw_data.size());
-    ccl::device_indices_type data;
-    constexpr auto offset = sizeof(ccl::index_type) / sizeof(unsigned char);
-    for(auto raw_data_it = raw_data.begin(); raw_data_it != raw_data.end(); )
-    {
-        ccl::device_index_type path;
-        ccl::index_type index;
-        std::copy(raw_data_it, raw_data_it + offset, reinterpret_cast<unsigned char*>(&index));
-        raw_data_it += offset;
-        std::get<ccl::device_index_enum::driver_index_id>(path) = index;
-        std::copy(raw_data_it, raw_data_it + offset, reinterpret_cast<unsigned char*>(&index));
-        raw_data_it += offset;
-        std::get<ccl::device_index_enum::device_index_id>(path) = index;
-        std::copy(raw_data_it, raw_data_it + offset, reinterpret_cast<unsigned char*>(&index));
-        raw_data_it += offset;
-        std::get<ccl::device_index_enum::subdevice_index_id>(path) = index;
-        data.insert(std::move(path));
-    }
-    return data;
-}
-*/
-} // namespace serialize
-} // namespace detail
-} // namespace native
diff --git a/src/common/comm/l0/topology/topology_serializer.hpp b/src/common/comm/l0/topology/topology_serializer.hpp
deleted file mode 100644
index 78f3cdf57..000000000
--- a/src/common/comm/l0/topology/topology_serializer.hpp
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "common/comm/l0/topology/topology_construction_utils.hpp"
-namespace native {
-namespace detail {
-namespace serialize {
-struct device_path_serializable {
-    using raw_data_t = std::vector<unsigned char>;
-
-    static constexpr size_t index_size() {
-        return sizeof(ccl::index_type) / sizeof(unsigned char);
-    }
-
-    static constexpr size_t device_index_size() {
-        return std::tuple_size<ccl::device_index_type>::value * index_size();
-    }
-
-    static size_t get_indices_count(size_t raw_data_size, size_t stride = 0) {
-        if (raw_data_size % (device_index_size() + stride)) {
-            assert(false && "Unexpected deserializing bytes count!");
-            throw std::runtime_error(
-                std::string("Unexpected deserializing bytes count: ") +
-                std::to_string(raw_data_size) + ", extra bytes :" +
-                std::to_string((raw_data_size % (device_index_size() + stride))) +
-                ", stride: " + std::to_string(stride));
-        }
-        return raw_data_size / (device_index_size() + stride);
-    }
-
-    raw_data_t result();
-
-protected:
-    raw_data_t data;
-};
-
-struct device_path_serializer : device_path_serializable {
-    using base = device_path_serializable;
-    device_path_serializer(size_t expected_devices, size_t data_offset, size_t stride = 0);
-
-    template <template <class...> class container>
-    static raw_data_t serialize_indices(const container<ccl::device_index_type>& indices,
-                                        size_t additional_reserved_bytes = 0) {
-        device_path_serializer consumer(indices.size(), additional_reserved_bytes);
-        for (const auto& path : indices) {
-            ccl_tuple_for_each(path, consumer);
-        }
-        return consumer.result();
-    }
-
-    template <template <class...> class container>
-    static raw_data_t serialize_indices(const container<detail::colored_idx>& indices,
-                                        size_t additional_reserved_bytes = 0) {
-        static_assert(sizeof(detail::colored_idx) >= sizeof(ccl::device_index_type),
-                      "'stride' must be positive or zero");
-        constexpr size_t stride = sizeof(detail::colored_idx) - sizeof(ccl::device_index_type);
-        device_path_serializer consumer(indices.size(), additional_reserved_bytes, stride);
-        for (const auto& path : indices) {
-            //serialize color
-            size_t offset = consumer.data.size();
-            for (size_t skip_bytes = 0; skip_bytes < consumer.stride_bytes; skip_bytes++) {
-                consumer.data.push_back(0);
-            }
-            memcpy(consumer.data.data() + offset, &path.color, sizeof(path.color));
-
-            //serialize index
-            ccl_tuple_for_each(path.index, consumer);
-        }
-        return consumer.result();
-    }
-
-    static raw_data_t serialize_indices(const detail::plain_graph_list& list, size_t offset = 0);
-    static raw_data_t serialize_indices(const detail::global_sorted_plain_graphs& list);
-    static raw_data_t serialize_indices(const detail::colored_plain_graph_list& list,
-                                        size_t offset = 0);
-    static raw_data_t serialize_indices(const detail::global_sorted_colored_plain_graphs& list);
-
-    template <class index_type>
-    void operator()(const index_type& value) {
-        static_assert(std::is_same<index_type, ccl::index_type>::value,
-                      "Only ccl::index_type is supported");
-
-        data.insert(data.end(),
-                    reinterpret_cast<const unsigned char*>(&value),
-                    reinterpret_cast<const unsigned char*>(&value) + sizeof(index_type));
-    }
-
-private:
-    template <class T>
-    static raw_data_t serialize_indices_impl(const std::list<T>& list, size_t offset = 0);
-    template <class T>
-    static raw_data_t serialize_indices_impl(const std::map<size_t, T>& list);
-
-    size_t expected_capacity;
-    size_t stride_bytes;
-};
-
-struct device_path_deserializer : device_path_serializable {
-    using base = device_path_serializable;
-    /*
-    template<template<class...> class container>
-    static container<ccl::device_index_type>
-            deserialize_indices(const device_path_serializable::raw_data_t& data)
-    {
-        size_t elem_count = base::get_indices_count(data.size());
-        container<ccl::device_index_type> ret;
-        for (size_t elem_index = 0; elem_index < elem_count; elem_index++)
-        {
-            auto start_it = data.begin();
-            std::advance(start_it, elem_index * device_path_serializable::device_index_size());
-            ret.insert(ret.end(),
-                       device_path_deserializer::extract_index(start_it,
-                                                               start_it + device_index_size()));
-        }
-        return ret;
-    }
-   */
-    template <template <class...> class container, class index_type, class iterator>
-    static container<index_type> deserialize_indices(
-        iterator it_begin,
-        iterator it_end,
-        size_t stride = sizeof(index_type) - sizeof(ccl::device_index_type)) {
-        static_assert(sizeof(index_type) >= sizeof(ccl::device_index_type),
-                      "'stride' must be positive or zero");
-        size_t elem_count = base::get_indices_count(std::distance(it_begin, it_end), stride);
-        container<index_type> ret;
-
-        for (size_t elem_index = 0; elem_index < elem_count; elem_index++) {
-            auto start_it = it_begin;
-            std::advance(start_it,
-                         elem_index * (device_path_serializable::device_index_size() + stride));
-            ret.insert(ret.end(),
-                       device_path_deserializer::extract_index(
-                           start_it,
-                           start_it + device_index_size() + stride,
-                           std::integral_constant<
-                               bool,
-                               std::is_same<index_type, ccl::device_index_type>::value>{}));
-        }
-        return ret;
-    }
-
-    static detail::plain_graph_list deserialize_graph_list_indices(const raw_data_t& list,
-                                                                   size_t& deserialized_bytes_count,
-                                                                   size_t offset = 0);
-    static detail::global_sorted_plain_graphs deserialize_global_graph_list_indices(
-        const raw_data_t& list);
-
-    static detail::colored_plain_graph_list deserialize_colored_graph_list_indices(
-        const raw_data_t& list,
-        size_t& deserialized_bytes_count,
-        size_t offset = 0);
-    static detail::global_sorted_colored_plain_graphs deserialize_global_colored_graph_list_indices(
-        const raw_data_t& list);
-
-    static ccl::device_index_type extract_index(raw_data_t::const_iterator it_begin,
-                                                raw_data_t::const_iterator it_end,
-                                                std::true_type raw_index);
-    static detail::colored_idx extract_index(raw_data_t::const_iterator it_begin,
-                                             raw_data_t::const_iterator it_end,
-                                             std::false_type colored_index);
-
-private:
-    template <class T>
-    static std::list<T> deserialize_generic_indices_list_impl(const raw_data_t& list,
-                                                              size_t& deserialized_bytes_count,
-                                                              size_t offset = 0,
-                                                              size_t stride = 0);
-    template <class T>
-    static std::map<size_t, T> deserialize_generic_indices_map_impl(const raw_data_t& list,
-                                                                    size_t stride = 0);
-};
-} // namespace serialize
-} // namespace detail
-} // namespace native
diff --git a/src/common/datatype/datatype.cpp b/src/common/datatype/datatype.cpp
index 849b363ca..192ca03de 100644
--- a/src/common/datatype/datatype.cpp
+++ b/src/common/datatype/datatype.cpp
@@ -109,10 +109,9 @@ ccl_datatype_storage::ccl_datatype_storage() {
         const ccl_datatype& dtype = get(idx);
         const std::string& dtype_name = name(dtype);
 
+        CCL_THROW_IF_NOT(dtype == idx, "unexpected datatype idx ", dtype.idx(), ", expected ", idx);
         CCL_THROW_IF_NOT(
-            dtype.idx() == idx, "unexpected datatype idx ", dtype.idx(), ", expected ", idx);
-        CCL_THROW_IF_NOT(
-            dtype.idx() == idx, "unexpected datatype size ", dtype.size(), ", expected ", size);
+            dtype.size() == size, "unexpected datatype size ", dtype.size(), ", expected ", size);
         CCL_THROW_IF_NOT(!dtype_name.compare(name_str),
                          "unexpected datatype name ",
                          dtype_name,
diff --git a/src/common/datatype/datatype.hpp b/src/common/datatype/datatype.hpp
index 974cef398..32610e490 100644
--- a/src/common/datatype/datatype.hpp
+++ b/src/common/datatype/datatype.hpp
@@ -48,6 +48,22 @@ class ccl_datatype {
     size_t m_size = sizeof(int8_t);
 };
 
+inline bool operator==(const ccl_datatype& lhs, const ccl::datatype& rhs) {
+    return lhs.idx() == rhs;
+}
+
+inline bool operator!=(const ccl_datatype& lhs, const ccl::datatype& rhs) {
+    return !(lhs == rhs);
+}
+
+inline bool operator==(const ccl_datatype& lhs, const ccl_datatype& rhs) {
+    return lhs.idx() == rhs.idx();
+}
+
+inline bool operator!=(const ccl_datatype& lhs, const ccl_datatype& rhs) {
+    return !(lhs == rhs);
+}
+
 /* frequently used in multiple places */
 extern ccl_datatype ccl_datatype_int8;
 
diff --git a/src/common/env/env.cpp b/src/common/env/env.cpp
index 97bd1cae0..c173a488e 100644
--- a/src/common/env/env.cpp
+++ b/src/common/env/env.cpp
@@ -55,12 +55,6 @@ std::map<ccl_staging_buffer, std::string> env_data::staging_buffer_names = {
     std::make_pair(ccl_staging_usm, "usm")
 };
 
-std::map<atl_mnic_t, std::string> env_data::mnic_type_names = {
-    std::make_pair(ATL_MNIC_NONE, "none"),
-    std::make_pair(ATL_MNIC_LOCAL, "local"),
-    std::make_pair(ATL_MNIC_GLOBAL, "global")
-};
-
 std::map<ccl_ze_copy_engine_mode, std::string> env_data::ze_copy_engine_names = {
     std::make_pair(ccl_ze_copy_engine_none, "none"),
     std::make_pair(ccl_ze_copy_engine_main, "main"),
@@ -95,7 +89,9 @@ env_data::env_data()
 
           mnic_type(ATL_MNIC_NONE),
           mnic_count(CCL_ENV_SIZET_NOT_SPECIFIED),
+          mnic_offset(ATL_MNIC_OFFSET_NONE),
 
+          enable_algo_fallback(1),
           enable_unordered_coll(0),
 
           enable_fusion(0),
@@ -129,22 +125,35 @@ env_data::env_data()
 
           allreduce_2d_base_size(CCL_ENV_SIZET_NOT_SPECIFIED),
           allreduce_2d_switch_dims(0),
+          allreduce_nreduce_buffering(0),
+          allreduce_nreduce_segment_size(CCL_ENV_SIZET_NOT_SPECIFIED),
 
           alltoall_scatter_max_ops(CCL_ENV_SIZET_NOT_SPECIFIED),
           alltoall_scatter_plain(0),
 
+#ifdef CCL_ENABLE_SYCL
           kernel_path(),
           kernel_debug(0),
-          enable_kernel_cache(1),
           kernel_group_size(CCL_ENV_SIZET_NOT_SPECIFIED),
           kernel_group_count(CCL_ENV_SIZET_NOT_SPECIFIED),
           enable_kernel_sync(1),
           kernel_1s_lead(0),
           enable_kernel_1s_copy_ops(0),
           enable_kernel_1s_ipc_wa(0),
-          enable_kernel_output_event(0),
+          enable_kernel_profile(0),
+          enable_close_fd_wa(0),
+
+          enable_sycl_output_event(0),
+
+          enable_ze_barrier(0),
+          enable_ze_cache(1),
+          enable_ze_single_list(1),
+          disable_ze_family_check(0),
           ze_serialize_mode(0),
           ze_copy_engine(ccl_ze_copy_engine_none),
+          ze_queue_index(1),
+          ze_close_ipc_wa(0),
+#endif // CCL_ENABLE_SYCL
 
           bf16_impl_type(ccl_bf16_no_compiler_support),
           fp16_impl_type(ccl_fp16_no_compiler_support) {
@@ -203,7 +212,9 @@ void env_data::parse() {
     if (mnic_count == CCL_ENV_SIZET_NOT_SPECIFIED) {
         mnic_count = worker_count;
     }
+    env_2_enum(CCL_MNIC_OFFSET, mnic_offset_names, mnic_offset);
 
+    env_2_type(CCL_ALGO_FALLBACK, enable_algo_fallback);
     env_2_type(CCL_ALLGATHERV, allgatherv_algo_raw);
     env_2_type(CCL_ALLREDUCE, allreduce_algo_raw);
     env_2_type(CCL_ALLTOALL, alltoall_algo_raw);
@@ -278,28 +289,45 @@ void env_data::parse() {
 
     env_2_type(CCL_ALLREDUCE_2D_BASE_SIZE, (size_t&)allreduce_2d_base_size);
     env_2_type(CCL_ALLREDUCE_2D_SWITCH_DIMS, allreduce_2d_switch_dims);
+    env_2_type(CCL_ALLREDUCE_NREDUCE_BUFFERING, allreduce_nreduce_buffering);
+    env_2_type(CCL_ALLREDUCE_NREDUCE_SEGMENT_SIZE, (size_t&)allreduce_nreduce_segment_size);
 
     env_2_type(CCL_ALLTOALL_SCATTER_MAX_OPS, (size_t&)alltoall_scatter_max_ops);
     env_2_type(CCL_ALLTOALL_SCATTER_PLAIN, alltoall_scatter_plain);
 
+#ifdef CCL_ENABLE_SYCL
     env_2_type(CCL_KERNEL_PATH, kernel_path);
     if (kernel_path.empty()) {
-        std::string ccl_root = getenv("CCL_ROOT");
+        std::string ccl_root;
+        char* ccl_root_env_value = getenv("CCL_ROOT");
+        if (ccl_root_env_value) {
+            ccl_root = ccl_root_env_value;
+        }
         CCL_THROW_IF_NOT(!ccl_root.empty(), "incorrect comm kernels path, CCL_ROOT not found!");
         kernel_path = ccl_root + "/lib/kernels/";
     }
 
     env_2_type(CCL_KERNEL_DEBUG, kernel_debug);
-    env_2_type(CCL_KERNEL_CACHE, enable_kernel_cache);
     env_2_type(CCL_KERNEL_GROUP_SIZE, kernel_group_size);
     env_2_type(CCL_KERNEL_GROUP_COUNT, kernel_group_count);
     env_2_type(CCL_KERNEL_SYNC, enable_kernel_sync);
     env_2_type(CCL_KERNEL_1S_LEAD, kernel_1s_lead);
     env_2_type(CCL_KERNEL_1S_USE_COPY_OPS, enable_kernel_1s_copy_ops);
     env_2_type(CCL_KERNEL_1S_IPC_WA, enable_kernel_1s_ipc_wa);
-    env_2_type(CCL_KERNEL_OUTPUT_EVENT, enable_kernel_output_event);
+    env_2_type(CCL_KERNEL_PROFILE, enable_kernel_profile);
+    env_2_type(CCL_KERNEL_CLOSE_FD_WA, enable_close_fd_wa);
+
+    env_2_type(CCL_SYCL_OUTPUT_EVENT, enable_sycl_output_event);
+
+    env_2_type(CCL_ZE_BARRIER, enable_ze_barrier);
+    env_2_type(CCL_ZE_CACHE, enable_ze_cache);
+    env_2_type(CCL_ZE_SINGLE_LIST, enable_ze_single_list);
+    env_2_type(CCL_ZE_DISABLE_FAMILY_CHECK, disable_ze_family_check);
     env_2_type(CCL_ZE_SERIALIZE, ze_serialize_mode);
     env_2_enum(CCL_ZE_COPY_ENGINE, ze_copy_engine_names, ze_copy_engine);
+    env_2_type(CCL_ZE_QUEUE_INDEX, ze_queue_index);
+    env_2_type(CCL_ZE_CLOSE_IPC_WA, ze_close_ipc_wa);
+#endif // CCL_ENABLE_SYCL
 
     auto bf16_impl_types = ccl_bf16_get_impl_types();
     ccl_bf16_impl_type bf16_env_impl_type;
@@ -400,7 +428,9 @@ void env_data::print(int rank) {
     LOG_INFO(
         CCL_MNIC_NAME, ": ", (mnic_name_raw.length()) ? mnic_name_raw : CCL_ENV_STR_NOT_SPECIFIED);
     LOG_INFO(CCL_MNIC_COUNT, ": ", mnic_count);
+    LOG_INFO(CCL_MNIC_OFFSET, ": ", str_by_enum(mnic_offset_names, mnic_offset));
 
+    LOG_INFO(CCL_ALGO_FALLBACK, ": ", enable_algo_fallback);
     LOG_INFO(CCL_ALLGATHERV,
              ": ",
              (allgatherv_algo_raw.length()) ? allgatherv_algo_raw : CCL_ENV_STR_NOT_SPECIFIED);
@@ -464,6 +494,12 @@ void env_data::print(int rank) {
                  ? std::to_string(allreduce_2d_base_size)
                  : CCL_ENV_STR_NOT_SPECIFIED);
     LOG_INFO(CCL_ALLREDUCE_2D_SWITCH_DIMS, ": ", allreduce_2d_switch_dims);
+    LOG_INFO(CCL_ALLREDUCE_NREDUCE_BUFFERING, ": ", allreduce_nreduce_buffering);
+    LOG_INFO(CCL_ALLREDUCE_NREDUCE_SEGMENT_SIZE,
+             ": ",
+             (allreduce_nreduce_segment_size != CCL_ENV_SIZET_NOT_SPECIFIED)
+                 ? std::to_string(allreduce_nreduce_segment_size)
+                 : CCL_ENV_STR_NOT_SPECIFIED);
 
     LOG_INFO(CCL_ALLTOALL_SCATTER_MAX_OPS,
              ": ",
@@ -476,7 +512,6 @@ void env_data::print(int rank) {
     LOG_INFO(
         CCL_KERNEL_PATH, ": ", (!kernel_path.empty()) ? kernel_path : CCL_ENV_STR_NOT_SPECIFIED);
     LOG_INFO(CCL_KERNEL_DEBUG, ": ", kernel_debug);
-    LOG_INFO(CCL_KERNEL_CACHE, ": ", enable_kernel_cache);
     LOG_INFO(CCL_KERNEL_GROUP_SIZE,
              ": ",
              (kernel_group_size != CCL_ENV_SIZET_NOT_SPECIFIED) ? std::to_string(kernel_group_size)
@@ -490,9 +525,19 @@ void env_data::print(int rank) {
     LOG_INFO(CCL_KERNEL_1S_LEAD, ": ", kernel_1s_lead);
     LOG_INFO(CCL_KERNEL_1S_USE_COPY_OPS, ": ", enable_kernel_1s_copy_ops);
     LOG_INFO(CCL_KERNEL_1S_IPC_WA, ": ", enable_kernel_1s_ipc_wa);
-    LOG_INFO(CCL_KERNEL_OUTPUT_EVENT, ": ", enable_kernel_output_event);
+    LOG_INFO(CCL_KERNEL_PROFILE, ": ", enable_kernel_profile);
+    LOG_INFO(CCL_KERNEL_CLOSE_FD_WA, ": ", enable_close_fd_wa);
+
+    LOG_INFO(CCL_SYCL_OUTPUT_EVENT, ": ", enable_sycl_output_event);
+
+    LOG_INFO(CCL_ZE_BARRIER, ": ", enable_ze_barrier);
+    LOG_INFO(CCL_ZE_CACHE, ": ", enable_ze_cache);
+    LOG_INFO(CCL_ZE_SINGLE_LIST, ": ", enable_ze_single_list);
+    LOG_INFO(CCL_ZE_DISABLE_FAMILY_CHECK, ": ", disable_ze_family_check);
     LOG_INFO(CCL_ZE_SERIALIZE, ": ", ze_serialize_mode);
     LOG_INFO(CCL_ZE_COPY_ENGINE, ": ", str_by_enum(ze_copy_engine_names, ze_copy_engine));
+    LOG_INFO(CCL_ZE_QUEUE_INDEX, ": ", ze_queue_index);
+    LOG_INFO(CCL_ZE_CLOSE_IPC_WA, ": ", ze_close_ipc_wa);
 #endif // CCL_ENABLE_SYCL
 
     LOG_INFO(CCL_BF16, ": ", str_by_enum(bf16_impl_names, bf16_impl_type));
@@ -516,7 +561,7 @@ void env_data::print(int rank) {
 
 void env_data::set_internal_env() {
     auto attr = ccl_executor::generate_atl_attr(*this);
-    atl_wrapper::set_internal_env(attr);
+    atl_comm_manager::set_internal_env(attr);
     if (log_level >= ccl_log_level::info) {
         setenv("I_MPI_DEBUG", "4", 0);
     }
@@ -700,12 +745,14 @@ int env_data::env_2_worker_affinity(int local_proc_idx, int local_proc_count) {
     return 1;
 }
 
-int env_data::env_2_worker_mem_affinity() {
+int env_data::env_2_worker_mem_affinity(int local_proc_count) {
     CCL_THROW_IF_NOT(worker_affinity.size() > 0);
+    CCL_THROW_IF_NOT(local_proc_count > 0);
 
     size_t idx;
     char* env_to_parse = getenv(CCL_WORKER_MEM_AFFINITY);
-    size_t affinity_size = worker_affinity.size();
+    size_t affinity_size = local_proc_count * worker_count;
+    CCL_THROW_IF_NOT(affinity_size <= worker_affinity.size());
 
     if (!env_to_parse || (strlen(env_to_parse) == 0) || (strcmp(env_to_parse, "auto") == 0)) {
         worker_mem_affinity.assign(affinity_size, CCL_UNDEFINED_NUMA_NODE);
diff --git a/src/common/env/env.hpp b/src/common/env/env.hpp
index f69ed88ee..4e93b9561 100644
--- a/src/common/env/env.hpp
+++ b/src/common/env/env.hpp
@@ -62,7 +62,9 @@ constexpr const char* CCL_ATL_CACHE = "CCL_ATL_CACHE";
 constexpr const char* CCL_MNIC = "CCL_MNIC";
 constexpr const char* CCL_MNIC_NAME = "CCL_MNIC_NAME";
 constexpr const char* CCL_MNIC_COUNT = "CCL_MNIC_COUNT";
+constexpr const char* CCL_MNIC_OFFSET = "CCL_MNIC_OFFSET";
 
+constexpr const char* CCL_ALGO_FALLBACK = "CCL_ALGO_FALLBACK";
 constexpr const char* CCL_ALLGATHERV = "CCL_ALLGATHERV";
 constexpr const char* CCL_ALLREDUCE = "CCL_ALLREDUCE";
 constexpr const char* CCL_ALLTOALL = "CCL_ALLTOALL";
@@ -102,22 +104,33 @@ constexpr const char* CCL_AR2D_MIN_CHUNK_SIZE = "CCL_AR2D_MIN_CHUNK_SIZE";
 constexpr const char* CCL_ALLREDUCE_2D_BASE_SIZE = "CCL_ALLREDUCE_2D_BASE_SIZE";
 constexpr const char* CCL_ALLREDUCE_2D_SWITCH_DIMS = "CCL_ALLREDUCE_2D_SWITCH_DIMS";
 
+constexpr const char* CCL_ALLREDUCE_NREDUCE_BUFFERING = "CCL_ALLREDUCE_NREDUCE_BUFFERING";
+constexpr const char* CCL_ALLREDUCE_NREDUCE_SEGMENT_SIZE = "CCL_ALLREDUCE_NREDUCE_SEGMENT_SIZE";
+
 constexpr const char* CCL_ALLTOALL_SCATTER_MAX_OPS = "CCL_ALLTOALL_SCATTER_MAX_OPS";
 constexpr const char* CCL_ALLTOALL_SCATTER_PLAIN = "CCL_ALLTOALL_SCATTER_PLAIN";
 
-constexpr const char* CCL_COMM_KERNELS = "CCL_COMM_KERNELS";
 constexpr const char* CCL_KERNEL_PATH = "CCL_KERNEL_PATH";
 constexpr const char* CCL_KERNEL_DEBUG = "CCL_KERNEL_DEBUG";
-constexpr const char* CCL_KERNEL_CACHE = "CCL_KERNEL_CACHE";
 constexpr const char* CCL_KERNEL_GROUP_SIZE = "CCL_KERNEL_GROUP_SIZE";
 constexpr const char* CCL_KERNEL_GROUP_COUNT = "CCL_KERNEL_GROUP_COUNT";
 constexpr const char* CCL_KERNEL_SYNC = "CCL_KERNEL_SYNC";
 constexpr const char* CCL_KERNEL_1S_LEAD = "CCL_KERNEL_1S_LEAD";
 constexpr const char* CCL_KERNEL_1S_USE_COPY_OPS = "CCL_KERNEL_1S_USE_COPY_OPS";
 constexpr const char* CCL_KERNEL_1S_IPC_WA = "CCL_KERNEL_1S_IPC_WA";
-constexpr const char* CCL_KERNEL_OUTPUT_EVENT = "CCL_KERNEL_OUTPUT_EVENT";
+constexpr const char* CCL_KERNEL_PROFILE = "CCL_KERNEL_PROFILE";
+constexpr const char* CCL_KERNEL_CLOSE_FD_WA = "CCL_KERNEL_CLOSE_FD_WA";
+
+constexpr const char* CCL_SYCL_OUTPUT_EVENT = "CCL_SYCL_OUTPUT_EVENT";
+
+constexpr const char* CCL_ZE_BARRIER = "CCL_ZE_BARRIER";
+constexpr const char* CCL_ZE_CACHE = "CCL_ZE_CACHE";
 constexpr const char* CCL_ZE_SERIALIZE = "CCL_ZE_SERIALIZE";
 constexpr const char* CCL_ZE_COPY_ENGINE = "CCL_ZE_COPY_ENGINE";
+constexpr const char* CCL_ZE_QUEUE_INDEX = "CCL_ZE_QUEUE_INDEX";
+constexpr const char* CCL_ZE_CLOSE_IPC_WA = "CCL_ZE_CLOSE_IPC_WA";
+constexpr const char* CCL_ZE_SINGLE_LIST = "CCL_ZE_SINGLE_LIST";
+constexpr const char* CCL_ZE_DISABLE_FAMILY_CHECK = "CCL_ZE_DISABLE_FAMILY_CHECK";
 
 constexpr const char* CCL_BF16 = "CCL_BF16";
 constexpr const char* CCL_FP16 = "CCL_FP16";
@@ -185,12 +198,14 @@ class env_data {
     atl_mnic_t mnic_type;
     std::string mnic_name_raw;
     ssize_t mnic_count;
+    atl_mnic_offset_t mnic_offset;
 
     /*
        parsing logic can be quite complex
        so hide it inside algorithm_selector module
        and store only raw strings in env_data
     */
+    int enable_algo_fallback;
     std::string allgatherv_algo_raw;
     std::string allreduce_algo_raw;
     std::string alltoall_algo_raw;
@@ -229,22 +244,35 @@ class env_data {
 
     ssize_t allreduce_2d_base_size;
     int allreduce_2d_switch_dims;
+    int allreduce_nreduce_buffering;
+    ssize_t allreduce_nreduce_segment_size;
 
     ssize_t alltoall_scatter_max_ops;
     int alltoall_scatter_plain;
 
+#ifdef CCL_ENABLE_SYCL
     std::string kernel_path;
     int kernel_debug;
-    int enable_kernel_cache;
     ssize_t kernel_group_size;
     ssize_t kernel_group_count;
     int enable_kernel_sync;
     int kernel_1s_lead;
     int enable_kernel_1s_copy_ops;
     int enable_kernel_1s_ipc_wa;
-    int enable_kernel_output_event;
+    int enable_kernel_profile;
+    int enable_close_fd_wa;
+
+    int enable_sycl_output_event;
+
+    int enable_ze_barrier;
+    int enable_ze_cache;
+    int enable_ze_single_list;
+    int disable_ze_family_check;
     int ze_serialize_mode;
     ccl_ze_copy_engine_mode ze_copy_engine;
+    int ze_queue_index;
+    int ze_close_ipc_wa;
+#endif // CCL_ENABLE_SYCL
 
     ccl_bf16_impl_type bf16_impl_type;
     ccl_fp16_impl_type fp16_impl_type;
@@ -321,10 +349,9 @@ class env_data {
     static std::map<ccl_atl_send_proxy, std::string> atl_send_proxy_names;
     static std::map<ccl_staging_buffer, std::string> staging_buffer_names;
     static std::map<ccl_ze_copy_engine_mode, std::string> ze_copy_engine_names;
-    static std::map<atl_mnic_t, std::string> mnic_type_names;
 
     int env_2_worker_affinity(int local_proc_idx, int local_proc_count);
-    int env_2_worker_mem_affinity();
+    int env_2_worker_mem_affinity(int local_proc_count);
     void env_2_atl_transport();
 
 private:
diff --git a/src/common/event/impls/host_event.cpp b/src/common/event/impls/host_event.cpp
index 80aff3603..244fcf516 100644
--- a/src/common/event/impls/host_event.cpp
+++ b/src/common/event/impls/host_event.cpp
@@ -23,20 +23,21 @@ namespace ccl {
 host_event_impl::host_event_impl(ccl_request* r) : req(r) {
     if (!req) {
         // if the user calls collective with coll_attr->synchronous=1 then it will be progressed
-        // in place and API will return null event. In this case mark cpp wrapper as completed,
+        // in place and API will return null event. In this case mark request as completed,
         // all calls to wait() or test() will do nothing
         completed = true;
     }
 }
 
 host_event_impl::~host_event_impl() {
-    // TODO: need to find a way to syncronize these 2 statuses, right now there are
+    // TODO: need to find a way to synchronize these 2 statuses, right now there are
     // some issues, e.g. in case of pure host event get_native() is an empty sycl
     // event which always complete, this way LOG_ERROR is never called
     if (!completed
 #ifdef CCL_ENABLE_SYCL
-        && (!utils::is_sycl_event_completed(get_native()))
-#endif
+        && (ccl::global_data::env().enable_sycl_output_event &&
+            !utils::is_sycl_event_completed(get_native()))
+#endif // CCL_ENABLE_SYCL
     ) {
         LOG_ERROR("not completed event is destroyed");
     }
@@ -62,15 +63,15 @@ bool host_event_impl::cancel() {
 
 event::native_t& host_event_impl::get_native() {
 #ifdef CCL_ENABLE_SYCL
-    if (ccl::global_data::env().enable_kernel_output_event) {
+    if (ccl::global_data::env().enable_sycl_output_event) {
         return req->get_native_event();
     }
     else {
-        CCL_THROW("get_native() is not available without CCL_KERNEL_OUTPUT_EVENT=1 env variable");
+        CCL_THROW("get_native() is not available without CCL_SYCL_OUTPUT_EVENT=1 env variable");
     }
-#else
+#else // CCL_ENABLE_SYCL
     throw ccl::exception(std::string(__FUNCTION__) + " - is not implemented");
-#endif
+#endif // CCL_ENABLE_SYCL
 }
 
 } // namespace ccl
diff --git a/src/common/global/global.cpp b/src/common/global/global.cpp
index 516aac8ee..ce1a40d1b 100644
--- a/src/common/global/global.cpp
+++ b/src/common/global/global.cpp
@@ -23,13 +23,16 @@
 #include "exec/exec.hpp"
 #include "fusion/fusion.hpp"
 #include "parallelizer/parallelizer.hpp"
-#include "sched/buffer_cache.hpp"
+#include "sched/buffer/buffer_cache.hpp"
 #include "sched/cache/cache.hpp"
 
-#ifdef MULTI_GPU_SUPPORT
-#include "sched/entry/gpu/ze_cache.hpp"
-#include "sched/entry/gpu/ze_primitives.hpp"
-#endif // MULTI_GPU_SUPPORT
+#ifdef CCL_ENABLE_ZE
+#include "sched/entry/ze/ze_cache.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
+#ifdef CCL_ENABLE_SYCL
+#include "sched/sched_timer.hpp"
+#endif // CCL_ENABLE_SYCL
+#endif // CCL_ENABLE_ZE
 
 namespace ccl {
 
@@ -65,9 +68,9 @@ ccl::status global_data::reset() {
     reset_resize_dependent_objects();
     reset_resize_independent_objects();
 
-#ifdef MULTI_GPU_SUPPORT
+#ifdef CCL_ENABLE_ZE
     finalize_gpu();
-#endif // MULTI_GPU_SUPPORT
+#endif // CCL_ENABLE_ZE
 
     return ccl::status::success;
 }
@@ -76,9 +79,9 @@ ccl::status global_data::init() {
     env_object.parse();
     env_object.set_internal_env();
 
-#ifdef MULTI_GPU_SUPPORT
+#ifdef CCL_ENABLE_ZE
     init_gpu();
-#endif // MULTI_GPU_SUPPORT
+#endif // CCL_ENABLE_ZE
 
     init_resize_dependent_objects();
     init_resize_independent_objects();
@@ -128,7 +131,7 @@ void global_data::reset_resize_independent_objects() {
     hwloc_wrapper.reset();
 }
 
-#ifdef MULTI_GPU_SUPPORT
+#ifdef CCL_ENABLE_ZE
 void global_data::init_gpu() {
     LOG_INFO("initializing level-zero");
     ze_result_t res = zeInit(ZE_INIT_FLAG_GPU_ONLY);
@@ -137,13 +140,20 @@ void global_data::init_gpu() {
     }
     ze_cache = std::unique_ptr<ccl::ze::cache>(new ccl::ze::cache(env_object.worker_count));
     LOG_INFO("initialized level-zero");
+
+#if defined(CCL_ENABLE_SYCL)
+    timer_printer = std::unique_ptr<ccl::kernel_timer_printer>(new ccl::kernel_timer_printer);
+#endif // CCL_ENABLE_SYCL
 }
 
 void global_data::finalize_gpu() {
     LOG_INFO("finalizing level-zero");
     ze_cache.reset();
+#if defined(CCL_ENABLE_SYCL)
+    timer_printer.reset();
+#endif // CCL_ENABLE_SYCL
     LOG_INFO("finalized level-zero");
 }
-#endif // MULTI_GPU_SUPPORT
+#endif // CCL_ENABLE_ZE
 
 } // namespace ccl
diff --git a/src/common/global/global.hpp b/src/common/global/global.hpp
index a2e29b2e3..9f1b81de4 100644
--- a/src/common/global/global.hpp
+++ b/src/common/global/global.hpp
@@ -15,10 +15,9 @@
 */
 #pragma once
 
-#include "coll/algorithms/algorithms_enum.hpp"
+#include "coll/algorithms/algorithm_utils.hpp"
 #include "common/env/env.hpp"
 #include "common/utils/utils.hpp"
-#include "common/comm/l0/comm_context_storage.hpp"
 #include "hwloc/hwloc_wrapper.hpp"
 #include "internal_types.hpp"
 
@@ -54,6 +53,7 @@ class ccl_algorithm_selector_wrapper;
 namespace ccl {
 
 class buffer_cache;
+class kernel_timer_printer;
 
 namespace ze {
 class cache;
@@ -91,9 +91,12 @@ class global_data {
     std::unique_ptr<ccl_hwloc_wrapper> hwloc_wrapper;
     std::atomic<size_t> kernel_counter;
 
-#ifdef MULTI_GPU_SUPPORT
+#ifdef CCL_ENABLE_ZE
     std::unique_ptr<ze::cache> ze_cache;
-#endif // MULTI_GPU_SUPPORT
+#ifdef CCL_ENABLE_SYCL
+    std::unique_ptr<ccl::kernel_timer_printer> timer_printer;
+#endif // CCL_ENABLE_SYCL
+#endif // CCL_ENABLE_ZE
 
     static thread_local bool is_worker_thread;
     bool is_ft_enabled;
@@ -104,10 +107,10 @@ class global_data {
     void init_resize_independent_objects();
     void reset_resize_independent_objects();
 
-#ifdef MULTI_GPU_SUPPORT
+#ifdef CCL_ENABLE_ZE
     void init_gpu();
     void finalize_gpu();
-#endif // MULTI_GPU_SUPPORT
+#endif // CCL_ENABLE_ZE
 
     env_data env_object;
 };
diff --git a/src/common/log/log.cpp b/src/common/log/log.cpp
index a92adb861..2f0ed44ed 100644
--- a/src/common/log/log.cpp
+++ b/src/common/log/log.cpp
@@ -42,7 +42,9 @@ void ccl_logger::write_prefix(std::ostream& str) {
     constexpr size_t tid_width = 5;
     time_t timer;
     char time_buf[time_buf_size]{};
-    struct tm time_info {};
+    struct tm time_info;
+    memset(&time_info, 0, sizeof(time_info));
+
     time(&timer);
     if (localtime_r(&timer, &time_info)) {
         strftime(time_buf, time_buf_size, "%Y:%m:%d-%H:%M:%S", &time_info);
diff --git a/src/common/log/log.hpp b/src/common/log/log.hpp
index c5d7877bf..24c455c5e 100644
--- a/src/common/log/log.hpp
+++ b/src/common/log/log.hpp
@@ -149,7 +149,6 @@ class ccl_logger {
                              std::forward<T>(first),
                              std::forward<Tpackage>(others)...);
 
-        write_backtrace(out_stream);
         std::cerr << streambuf;
         std::flush(std::cerr);
 
diff --git a/src/common/stream/stream.cpp b/src/common/stream/stream.cpp
index 2776d9aa9..77c4b008e 100644
--- a/src/common/stream/stream.cpp
+++ b/src/common/stream/stream.cpp
@@ -17,27 +17,73 @@
 #include "common/log/log.hpp"
 #include "common/stream/stream.hpp"
 #include "common/stream/stream_provider_dispatcher_impl.hpp"
+#include "common/utils/enums.hpp"
+#include "common/utils/sycl_utils.hpp"
 #include "oneapi/ccl/native_device_api/export_api.hpp"
 
+#ifdef CCL_ENABLE_SYCL
+#include <CL/sycl/backend/level_zero.hpp>
+#endif // CCL_ENABLE_SYCL
+
+namespace ccl {
+std::string to_string(device_family family) {
+    // only the two known families have dedicated names
+    if (family == device_family::family1) {
+        return "family1";
+    }
+    return (family == device_family::family2) ? "family2" : "unknown";
+}
+} // namespace ccl
+
 std::string to_string(const stream_type& type) {
+    using stream_str_enum = utils::enum_to_str<utils::enum_to_underlying(stream_type::last_value)>;
     return stream_str_enum({ "host", "cpu", "gpu" }).choose(type, "unknown");
 }
 
 ccl_stream::ccl_stream(stream_type type,
                        stream_native_t& stream,
                        const ccl::library_version& version)
-        : type(type),
-          version(version) {
+        : version(version),
+          type(type),
+          device_family(ccl::device_family::unknown) {
     native_stream = stream;
 
 #ifdef CCL_ENABLE_SYCL
+    // properties for the per-worker queues created below: keep the ordering of
+    // the user-provided queue and enable profiling when the env requests it
+    cl::sycl::property_list props{};
+    // TODO: can we somehow simplify this?
+    if (stream.is_in_order()) {
+        if (ccl::global_data::env().enable_kernel_profile) {
+            props = { sycl::property::queue::in_order{},
+                      sycl::property::queue::enable_profiling{} };
+        }
+        else {
+            props = { sycl::property::queue::in_order{} };
+        }
+    }
+    else if (ccl::global_data::env().enable_kernel_profile) {
+        props = { sycl::property::queue::enable_profiling{} };
+    }
+    // otherwise props keeps its empty default
+
     native_streams.resize(ccl::global_data::env().worker_count);
-    for (size_t idx = 0; idx < native_streams.size(); idx++) {
-        native_streams[idx] = stream_native_t(stream.get_context(), stream.get_device());
+    for (auto& worker_stream : native_streams) {
+        worker_stream = stream_native_t(stream.get_context(), stream.get_device(), props);
     }
 
     backend = stream.get_device().get_backend();
 #endif // CCL_ENABLE_SYCL
+
+    // level-zero handles come from SYCL interop and `backend` is only set in
+    // SYCL builds, so this section needs both features enabled
+#if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL)
+    if (backend == ccl::utils::get_level_zero_backend()) {
+        device = sycl::get_native<ccl::utils::get_level_zero_backend()>(stream.get_device());
+        context = sycl::get_native<ccl::utils::get_level_zero_backend()>(stream.get_context());
+        device_family = ccl::ze::get_device_family(device);
+    }
+#endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL
 }
 
 // export attributes
@@ -65,9 +111,45 @@ std::string ccl_stream::to_string() const {
     ss << "{ "
        << "type: " << ::to_string(type) << ", in_order: " << native_stream.is_in_order()
        << ", device: " << native_stream.get_device().get_info<cl::sycl::info::device::name>()
-       << " }";
+       << ", device_family: " << ccl::to_string(device_family) << " }";
 #else // CCL_ENABLE_SYCL
     ss << reinterpret_cast<void*>(native_stream.get());
 #endif // CCL_ENABLE_SYCL
     return ss.str();
 }
+
+stream_type ccl_stream::get_type() const {
+    return type;
+}
+
+ccl::device_family ccl_stream::get_device_family() const {
+    return device_family;
+}
+
+bool ccl_stream::is_sycl_device_stream() const {
+    return (type == stream_type::cpu || type == stream_type::gpu);
+}
+
+bool ccl_stream::is_gpu() const {
+    return type == stream_type::gpu;
+}
+
+#ifdef CCL_ENABLE_SYCL
+cl::sycl::backend ccl_stream::get_backend() const {
+    return backend;
+}
+#ifdef CCL_ENABLE_ZE
+
+ze_device_handle_t ccl_stream::get_ze_device() const {
+    CCL_THROW_IF_NOT(backend == ccl::utils::get_level_zero_backend());
+    CCL_THROW_IF_NOT(device, "no device");
+    return device;
+}
+
+ze_context_handle_t ccl_stream::get_ze_context() const {
+    CCL_THROW_IF_NOT(backend == ccl::utils::get_level_zero_backend());
+    CCL_THROW_IF_NOT(context, "no context");
+    return context;
+}
+#endif // CCL_ENABLE_ZE
+#endif // CCL_ENABLE_SYCL
diff --git a/src/common/stream/stream.hpp b/src/common/stream/stream.hpp
index 7f192715b..4ee9729ee 100644
--- a/src/common/stream/stream.hpp
+++ b/src/common/stream/stream.hpp
@@ -17,28 +17,29 @@
 
 #include "coll/coll_common_attributes.hpp"
 #include "common/stream/stream_provider_dispatcher.hpp"
-#include "common/utils/enums.hpp"
 #include "common/utils/utils.hpp"
-#include "internal_types.hpp"
 #include "oneapi/ccl/stream_attr_ids.hpp"
 #include "oneapi/ccl/stream_attr_ids_traits.hpp"
-#include "oneapi/ccl/types_policy.hpp"
 #include "oneapi/ccl/types.hpp"
 #include "oneapi/ccl/type_traits.hpp"
 
+#ifdef CCL_ENABLE_SYCL
+#include <CL/sycl/backend_types.hpp>
+#endif // CCL_ENABLE_SYCL
+
 namespace ccl {
-namespace detail {
-class environment;
-}
+
+enum class device_family { unknown, family1, family2 };
+
+std::string to_string(device_family family);
+
 } // namespace ccl
 
-using stream_str_enum = utils::enum_to_str<utils::enum_to_underlying(stream_type::last_value)>;
 std::string to_string(const stream_type& type);
 
 class alignas(CACHELINE_SIZE) ccl_stream : public stream_provider_dispatcher {
 public:
     friend class stream_provider_dispatcher;
-    friend class ccl::detail::environment;
 
     using stream_native_t = stream_provider_dispatcher::stream_native_t;
 
@@ -48,26 +49,19 @@ class alignas(CACHELINE_SIZE) ccl_stream : public stream_provider_dispatcher {
 
     ~ccl_stream() = default;
 
-    using stream_provider_dispatcher::get_native_stream;
-
     std::string to_string() const;
 
-    stream_type get_type() const {
-        return type;
-    }
-
-    bool is_sycl_device_stream() const {
-        return (type == stream_type::cpu || type == stream_type::gpu);
-    }
-
-    bool is_gpu() const {
-        return type == stream_type::gpu;
-    }
+    stream_type get_type() const;
+    ccl::device_family get_device_family() const;
+    bool is_sycl_device_stream() const;
+    bool is_gpu() const;
 
 #ifdef CCL_ENABLE_SYCL
-    cl::sycl::backend get_backend() const noexcept {
-        return backend;
-    }
+    cl::sycl::backend get_backend() const;
+#ifdef CCL_ENABLE_ZE
+    ze_device_handle_t get_ze_device() const;
+    ze_context_handle_t get_ze_context() const;
+#endif // CCL_ENABLE_ZE
 #endif // CCL_ENBALE_SYCL
 
     static std::unique_ptr<ccl_stream> create(stream_native_t& native_stream,
@@ -93,9 +87,17 @@ class alignas(CACHELINE_SIZE) ccl_stream : public stream_provider_dispatcher {
                stream_native_t& native_stream,
                const ccl::library_version& version);
 
+    const ccl::library_version version;
+
     stream_type type;
+    ccl::device_family device_family;
+
 #ifdef CCL_ENABLE_SYCL
     cl::sycl::backend backend;
+
+#ifdef CCL_ENABLE_ZE
+    ze_device_handle_t device{};
+    ze_context_handle_t context{};
+#endif // CCL_ENABLE_ZE
 #endif // CCL_ENBALE_SYCL
-    const ccl::library_version version;
 };
diff --git a/src/common/stream/stream_provider_dispatcher.hpp b/src/common/stream/stream_provider_dispatcher.hpp
index cd5b7a028..df4618942 100644
--- a/src/common/stream/stream_provider_dispatcher.hpp
+++ b/src/common/stream/stream_provider_dispatcher.hpp
@@ -14,13 +14,14 @@
  limitations under the License.
 */
 #pragma once
-#ifdef MULTI_GPU_SUPPORT
+
+#ifdef CCL_ENABLE_ZE
 #include <ze_api.h>
-#endif
+#endif // CCL_ENABLE_ZE
 
 #ifdef CCL_ENABLE_SYCL
 #include <CL/sycl.hpp>
-#endif
+#endif // CCL_ENABLE_SYCL
 
 #include "oneapi/ccl/type_traits.hpp"
 
@@ -40,27 +41,16 @@ class stream_provider_dispatcher {
 public:
     using stream_native_t = typename ccl::unified_stream_type::ccl_native_t;
 
-    using stream_native_device_t = typename ccl::unified_device_type::ccl_native_t;
-    using stream_native_context_t = typename ccl::unified_context_type::ccl_native_t;
-
     stream_native_t get_native_stream() const;
 
 #ifdef CCL_ENABLE_SYCL
     stream_native_t* get_native_stream(size_t idx);
 #endif // CCL_ENABLE_SYCL
 
-    const stream_native_device_t& get_native_device() const;
-    stream_native_device_t& get_native_device();
-
     static std::unique_ptr<ccl_stream> create(stream_native_t& native_stream,
                                               const ccl::library_version& version);
-    template <class T>
-    using optional = std::pair<bool, T>;
 
 protected:
-    optional<stream_native_device_t> native_device;
-    optional<stream_native_context_t> native_context;
-
     stream_native_t native_stream;
 
 #ifdef CCL_ENABLE_SYCL
diff --git a/src/common/stream/stream_provider_dispatcher_impl.hpp b/src/common/stream/stream_provider_dispatcher_impl.hpp
index bff3c91eb..2c454ec95 100644
--- a/src/common/stream/stream_provider_dispatcher_impl.hpp
+++ b/src/common/stream/stream_provider_dispatcher_impl.hpp
@@ -45,23 +45,10 @@ std::unique_ptr<ccl_stream> stream_provider_dispatcher::create(
                 native_stream.get_device().template get_info<cl::sycl::info::device::name>() +
                 std::string("supported types: host, cpu, gpu"));
     }
-
+#endif // CCL_ENABLE_SYCL
     std::unique_ptr<ccl_stream> ret(new ccl_stream(type, native_stream, version));
-    ret->native_device.second = native_stream.get_device();
-    ret->native_device.first = true;
-    ret->native_context.second = native_stream.get_context();
-    ret->native_context.first = true;
-
-    LOG_INFO("SYCL queue type: ",
-             ::to_string(type),
-             ", in_order: ",
-             native_stream.is_in_order(),
-             ", device: ",
-             native_stream.get_device().template get_info<cl::sycl::info::device::name>());
 
-#else // CCL_ENABLE_SYCL
-    std::unique_ptr<ccl_stream> ret(new ccl_stream(type, native_stream, version));
-#endif // CCL_ENABLE_SYCL
+    LOG_INFO("stream: ", ret->to_string());
 
     return ret;
 }
@@ -73,9 +60,6 @@ stream_provider_dispatcher::stream_native_t stream_provider_dispatcher::get_nati
 #ifdef CCL_ENABLE_SYCL
 stream_provider_dispatcher::stream_native_t* stream_provider_dispatcher::get_native_stream(
     size_t idx) {
-    if (idx >= native_streams.size()) {
-        throw ccl::exception("unexpected stream idx");
-    }
-    return &(native_streams[idx]);
+    return &(native_streams.at(idx));
 }
 #endif // CCL_ENABLE_SYCL
diff --git a/src/common/utils/buffer.hpp b/src/common/utils/buffer.hpp
index d35e9aada..095e20245 100644
--- a/src/common/utils/buffer.hpp
+++ b/src/common/utils/buffer.hpp
@@ -68,7 +68,7 @@ class ccl_buffer {
               size(size),
               offset(offset),
               type(type) {
-        LOG_DEBUG("create: src ",
+        LOG_TRACE("create: src ",
                   src,
                   ", size ",
                   size,
@@ -107,7 +107,7 @@ class ccl_buffer {
     }
 
     void set(void* src, ssize_t size, size_t offset, ccl_buffer_type type) {
-        LOG_DEBUG("set: src ",
+        LOG_TRACE("set: src ",
                   src,
                   ", size ",
                   size,
diff --git a/src/common/utils/memcpy.cpp b/src/common/utils/memcpy.cpp
new file mode 100644
index 000000000..a57402c89
--- /dev/null
+++ b/src/common/utils/memcpy.cpp
@@ -0,0 +1,127 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "common/log/log.hpp"
+#include "common/utils/memcpy.hpp"
+
+#include <algorithm>
+#include <cstdint>
+#include <immintrin.h>
+
+namespace ccl {
+
+__attribute__((__always_inline__)) inline int is_nts_supported() {
+#ifdef CCL_AVX_COMPILER
+    /* 1 if CPUID reports AVX, else 0; C++11 local static makes the one-time probe thread-safe */
+    static const int is_avx_enabled = []() {
+        uint32_t reg[4];
+        /* AVX capabilities for NTS implementation: CPUID.(EAX=01H):ECX.AVX [bit 28] */
+        __asm__ __volatile__("cpuid"
+                             : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
+                             : "a"(1));
+        const int avx_bit = ((reg[2] & (1u << 28)) >> 28);
+        LOG_DEBUG("AVX enabled: ", avx_bit);
+        return avx_bit;
+    }();
+    return is_avx_enabled;
+#else // CCL_AVX_COMPILER
+    LOG_DEBUG("AVX disabled due to compiler");
+    return 0;
+#endif // CCL_AVX_COMPILER
+}
+
+void memcpy(void *dst, const void *src, size_t size) {
+    std::copy_n(static_cast<const char *>(src), size, static_cast<char *>(dst)); // byte-wise copy
+}
+
+void memcpy_nontemporal(void *dst, const void *src, size_t size) {
+    char *d = (char *)dst;
+    const char *s = (const char *)src;
+    size_t n = size;
+    // copy size bytes src -> dst; buffers above 256 bytes use AVX streaming stores if available
+    if (!is_nts_supported()) {
+        LOG_DEBUG("NTS-based memcpy is requested but not supported, use regular memcpy");
+    }
+
+#ifdef CCL_AVX_COMPILER
+    if ((n <= 256) || !is_nts_supported()) {
+        memcpy(d, s, n);
+        return;
+    }
+    // copy the unaligned head with the regular memcpy so that d becomes 64-byte aligned
+    if (((uintptr_t)d) & 63) {
+        const uintptr_t t = 64 - (((uintptr_t)d) & 63);
+        memcpy(d, s, t);
+        d += t;
+        s += t;
+        n -= t;
+    }
+    // bulk: eight 32-byte unaligned loads followed by eight streaming stores per iteration
+    while (n >= 256) {
+        __m256i ymm0 = _mm256_loadu_si256((__m256i const *)(s + (32 * 0)));
+        __m256i ymm1 = _mm256_loadu_si256((__m256i const *)(s + (32 * 1)));
+        __m256i ymm2 = _mm256_loadu_si256((__m256i const *)(s + (32 * 2)));
+        __m256i ymm3 = _mm256_loadu_si256((__m256i const *)(s + (32 * 3)));
+        __m256i ymm4 = _mm256_loadu_si256((__m256i const *)(s + (32 * 4)));
+        __m256i ymm5 = _mm256_loadu_si256((__m256i const *)(s + (32 * 5)));
+        __m256i ymm6 = _mm256_loadu_si256((__m256i const *)(s + (32 * 6)));
+        __m256i ymm7 = _mm256_loadu_si256((__m256i const *)(s + (32 * 7)));
+        _mm256_stream_si256((__m256i *)(d + (32 * 0)), ymm0);
+        _mm256_stream_si256((__m256i *)(d + (32 * 1)), ymm1);
+        _mm256_stream_si256((__m256i *)(d + (32 * 2)), ymm2);
+        _mm256_stream_si256((__m256i *)(d + (32 * 3)), ymm3);
+        _mm256_stream_si256((__m256i *)(d + (32 * 4)), ymm4);
+        _mm256_stream_si256((__m256i *)(d + (32 * 5)), ymm5);
+        _mm256_stream_si256((__m256i *)(d + (32 * 6)), ymm6);
+        _mm256_stream_si256((__m256i *)(d + (32 * 7)), ymm7);
+        d += 256;
+        s += 256;
+        n -= 256;
+    }
+    // the loop leaves n < 256; bits 7 and 6 of n select the 128/64-byte tails (n is not updated)
+    if (n & 128) {
+        __m256i ymm0 = _mm256_loadu_si256((__m256i const *)(s + (32 * 0)));
+        __m256i ymm1 = _mm256_loadu_si256((__m256i const *)(s + (32 * 1)));
+        __m256i ymm2 = _mm256_loadu_si256((__m256i const *)(s + (32 * 2)));
+        __m256i ymm3 = _mm256_loadu_si256((__m256i const *)(s + (32 * 3)));
+        _mm256_stream_si256((__m256i *)(d + (32 * 0)), ymm0);
+        _mm256_stream_si256((__m256i *)(d + (32 * 1)), ymm1);
+        _mm256_stream_si256((__m256i *)(d + (32 * 2)), ymm2);
+        _mm256_stream_si256((__m256i *)(d + (32 * 3)), ymm3);
+        d += 128;
+        s += 128;
+    }
+
+    if (n & 64) {
+        __m256i ymm0 = _mm256_loadu_si256((__m256i const *)(s + (32 * 0)));
+        __m256i ymm1 = _mm256_loadu_si256((__m256i const *)(s + (32 * 1)));
+        _mm256_stream_si256((__m256i *)(d + (32 * 0)), ymm0);
+        _mm256_stream_si256((__m256i *)(d + (32 * 1)), ymm1);
+        d += 64;
+        s += 64;
+    }
+    // remaining 0..63 bytes go through the regular memcpy
+    if (n & 63) {
+        memcpy(d, s, (n & 63));
+    }
+
+    _mm_sfence(); // store fence issued after the streaming stores above
+
+#else // CCL_AVX_COMPILER
+    memcpy(d, s, n);
+#endif // CCL_AVX_COMPILER
+}
+
+} // namespace ccl
diff --git a/src/common/comm/l0/context_comm_addr.hpp b/src/common/utils/memcpy.hpp
similarity index 71%
rename from src/common/comm/l0/context_comm_addr.hpp
rename to src/common/utils/memcpy.hpp
index b84447930..86f0b8203 100644
--- a/src/common/comm/l0/context_comm_addr.hpp
+++ b/src/common/utils/memcpy.hpp
@@ -14,15 +14,16 @@
  limitations under the License.
 */
 #pragma once
-#include <string>
+
+#include <cstddef>
 
 namespace ccl {
-struct context_comm_addr {
-    size_t thread_idx = 0;
-    size_t thread_count = 0;
-    size_t comm_rank = 0;
-    size_t comm_size = 0;
 
-    std::string to_string() const;
-};
+void memcpy(void* dst, const void* src, size_t size);
+void memcpy_nontemporal(void* dst, const void* src, size_t size)
+#if CCL_AVX_TARGET_ATTRIBUTES
+    __attribute__((target("avx2")))
+#endif // CCL_AVX_TARGET_ATTRIBUTES
+    ;
+
 } // namespace ccl
diff --git a/src/common/utils/sycl_utils.hpp b/src/common/utils/sycl_utils.hpp
index fba47ea42..7c69a7270 100644
--- a/src/common/utils/sycl_utils.hpp
+++ b/src/common/utils/sycl_utils.hpp
@@ -15,13 +15,21 @@
 */
 #pragma once
 
-#ifdef CCL_ENABLE_SYCL
+#if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL)
 
+#include <ze_api.h>
 #include <CL/sycl.hpp>
+#include <CL/sycl/backend/level_zero.hpp>
 
 #include "common/stream/stream.hpp"
 #include "common/global/global.hpp"
 
+#ifdef SYCL_LANGUAGE_VERSION
+#define DPCPP_VERSION (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__)
+#else // SYCL_LANGUAGE_VERSION
+#define DPCPP_VERSION 0 // SYCL_LANGUAGE_VERSION is not defined: version is reported as 0
+#endif // SYCL_LANGUAGE_VERSION
+
 namespace ccl {
 namespace utils {
 
@@ -32,7 +40,7 @@ static inline bool is_sycl_event_completed(sycl::event e) {
 
 static inline bool should_use_sycl_output_event(ccl_stream* stream) {
     return (stream && stream->is_sycl_device_stream() && stream->is_gpu() &&
-            ccl::global_data::env().enable_kernel_output_event);
+            ccl::global_data::env().enable_sycl_output_event);
 }
 
 static inline std::string usm_type_to_str(sycl::usm::alloc type) {
@@ -63,7 +71,41 @@ static inline std::string sycl_device_to_str(const sycl::device& dev) {
     }
 }
 
+constexpr sycl::backend get_level_zero_backend() {
+#if DPCPP_VERSION < 140000
+    return sycl::backend::level_zero; // enumerator used below version 140000
+#else // DPCPP_VERSION >= 140000
+    return sycl::backend::ext_oneapi_level_zero;
+#endif // DPCPP_VERSION
+}
+
+static inline sycl::event submit_barrier(cl::sycl::queue queue) {
+#if DPCPP_VERSION < 140000
+    return queue.submit_barrier(); // member name used below version 140000
+#else // DPCPP_VERSION >= 140000
+    return queue.ext_oneapi_submit_barrier();
+#endif // DPCPP_VERSION
+}
+
+static inline sycl::event submit_barrier(cl::sycl::queue queue, sycl::event event) {
+#if DPCPP_VERSION < 140000
+    return queue.submit_barrier({ event }); // barrier on the single given event
+#else // DPCPP_VERSION >= 140000
+    return queue.ext_oneapi_submit_barrier({ event });
+#endif // DPCPP_VERSION
+}
+
+static inline sycl::event make_event(sycl::context& context, ze_event_handle_t& sync_event) {
+#if DPCPP_VERSION < 140000
+    return sycl::level_zero::make<sycl::event>(
+        context, sync_event, sycl::level_zero::ownership::keep);
+#else // DPCPP_VERSION >= 140000
+    return sycl::make_event<sycl::backend::ext_oneapi_level_zero>(
+        { sync_event, sycl::ext::oneapi::level_zero::ownership::keep }, context);
+#endif // DPCPP_VERSION
+}
+
 } // namespace utils
 } // namespace ccl
 
-#endif // CCL_ENABLE_SYCL
+#endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL
diff --git a/src/common/utils/utils.hpp b/src/common/utils/utils.hpp
index 2d37ce136..6ef7c3ebf 100644
--- a/src/common/utils/utils.hpp
+++ b/src/common/utils/utils.hpp
@@ -15,7 +15,7 @@
 */
 #pragma once
 
-#if defined(__INTEL_COMPILER) || defined(__ICC)
+#if defined(__INTEL_COMPILER) || defined(__ICC) || defined(__INTEL_LLVM_COMPILER)
 #include <immintrin.h>
 #endif
 
@@ -65,8 +65,6 @@
 #define CCL_LARGE_MSG_ALIGNMENT (2 * 1024 * 1024)
 #define CCL_LARGE_MSG_THRESHOLD (1 * 1024 * 1024)
 
-#define CCL_MEMCPY(dest, src, n) std::copy((char*)(src), (char*)(src) + (n), (char*)(dest))
-
 /* malloc/realloc/free */
 
 #if 0 // defined(__INTEL_COMPILER) || defined(__ICC)
diff --git a/src/common/utils/version.cpp b/src/common/utils/version.cpp
index 554befb87..87c8ee6d2 100644
--- a/src/common/utils/version.cpp
+++ b/src/common/utils/version.cpp
@@ -21,15 +21,10 @@
 namespace utils {
 
 ccl::library_version get_library_version() {
-    ccl::library_version version{};
-
-    version.major = CCL_MAJOR_VERSION;
-    version.minor = CCL_MINOR_VERSION;
-    version.update = CCL_UPDATE_VERSION;
-    version.product_status = CCL_PRODUCT_STATUS;
-    version.build_date = CCL_PRODUCT_BUILD_DATE;
-    version.full = CCL_PRODUCT_FULL;
-    version.cl_backend_name = ccl::backend_traits::name();
+    ccl::library_version version = { CCL_MAJOR_VERSION,          CCL_MINOR_VERSION,
+                                     CCL_UPDATE_VERSION,         CCL_PRODUCT_STATUS,
+                                     CCL_PRODUCT_BUILD_DATE,     CCL_PRODUCT_FULL,
+                                     ccl::backend_traits::name() };
 
     return version;
 }
diff --git a/src/communicator_impl.hpp b/src/communicator_impl.hpp
index 5fbc79c0c..392079b98 100644
--- a/src/communicator_impl.hpp
+++ b/src/communicator_impl.hpp
@@ -14,13 +14,13 @@
  limitations under the License.
 */
 #pragma once
+
 #include "oneapi/ccl/comm_split_attr_ids.hpp"
 #include "oneapi/ccl/comm_split_attr_ids_traits.hpp"
 #include "oneapi/ccl/comm_split_attr.hpp"
 #include "oneapi/ccl/communicator.hpp"
 
 #include "kvs_impl.hpp"
-#include "common/comm/l0/comm_context_id.hpp"
 #include "communicator_impl_details.hpp"
 
 //TODO
@@ -76,39 +76,21 @@ CCL_API vector_class<communicator> communicator::create_communicators(
     shared_ptr_class<kvs_interface> kvs) {
     shared_ptr_class<ikvs_wrapper> kvs_tmp;
     if (std::dynamic_pointer_cast<ccl::v1::kvs>(kvs) != nullptr) {
-        kvs_tmp = std::dynamic_pointer_cast<ccl::v1::kvs>(kvs)->get_impl().get();
+        kvs_tmp = std::dynamic_pointer_cast<v1::kvs>(kvs)->get_impl().get();
     }
     else {
         kvs_tmp = std::shared_ptr<ikvs_wrapper>(new users_kvs(kvs));
     }
-    return comm_impl_dispatch_selector<CL_BACKEND_TYPE>::create_communicators_selector(
-        size, devices, context, kvs_tmp);
-#if 0
-    vector_class<int> local_thread_ranks;
-    local_thread_ranks.reserve(devices.size());
-    std::transform(
-        devices.begin(),
-        devices.end(),
-        std::back_inserter(local_thread_ranks),
-        [](const typename vector_class<pair_class<int, DeviceType>>::value_type& val) {
-            return val.first;
-        });
-    group_context::comm_group_t thread_group =
-        group_context::instance().group_by_kvs(local_thread_ranks, size, kvs);
 
-    vector_class<DeviceType> local_thread_devices;
-    local_thread_devices.reserve(devices.size());
-    std::transform(
-        devices.begin(),
-        devices.end(),
-        std::back_inserter(local_thread_devices),
-        [](const typename vector_class<pair_class<int, DeviceType>>::value_type& val) {
-            return val.second;
-        });
+    CCL_THROW_IF_NOT(devices.size() == 1, "multiple devices per process are not supported");
 
-    auto ret = thread_group->create_communicators(local_thread_devices);
+    LOG_TRACE("create communicator");
+
+    ccl::communicator_interface_ptr impl = ccl::communicator_interface::create_communicator_impl(
+        size, devices.begin()->first, kvs_tmp);
+    ccl::vector_class<ccl::communicator> ret;
+    ret.push_back(ccl::communicator(std::move(impl)));
     return ret;
-#endif
 }
 
 template <class DeviceType, class ContextType>
@@ -116,49 +98,14 @@ CCL_API vector_class<communicator> communicator::create_communicators(
     const int size,
     const map_class<int, DeviceType>& devices,
     const ContextType& context,
-    shared_ptr_class<kvs_interface> kvs)
-
-{
-    shared_ptr_class<ikvs_wrapper> kvs_tmp;
-    if (std::dynamic_pointer_cast<ccl::v1::kvs>(kvs) != nullptr) {
-        kvs_tmp = std::dynamic_pointer_cast<v1::kvs>(kvs)->get_impl().get();
-    }
-    else {
-        kvs_tmp = std::shared_ptr<ikvs_wrapper>(new users_kvs(kvs));
+    shared_ptr_class<kvs_interface> kvs) {
+    std::vector<pair_class<int, DeviceType>> vec_devices;
+    for (const auto& d : devices) {
+        vec_devices.push_back(std::make_pair(d.first, d.second));
     }
-    return comm_impl_dispatch_selector<CL_BACKEND_TYPE>::create_communicators_selector(
-        size, devices, context, kvs_tmp);
-#if 0
-    vector_class<int> local_thread_ranks;
-    local_thread_ranks.reserve(devices.size());
-    std::transform(devices.begin(),
-                   devices.end(),
-                   std::back_inserter(local_thread_ranks),
-                   [](const typename map_class<int, DeviceType>::value_type& val) {
-                       return val.first;
-                   });
-    group_context::comm_group_t thread_group =
-        group_context::instance().group_by_kvs(local_thread_ranks, size, kvs);
-
-    vector_class<DeviceType> local_thread_devices;
-    local_thread_devices.reserve(devices.size());
-    std::transform(devices.begin(),
-                   devices.end(),
-                   std::back_inserter(local_thread_devices),
-                   [](const typename map_class<int, DeviceType>::value_type& val) {
-                       return val.second;
-                   });
-
-    auto ret = thread_group->create_communicators(local_thread_devices);
-    return ret;
-#endif
+    return create_communicators(size, vec_devices, context, kvs);
 }
 
-/*CCL_API bool communicator::is_ready() const
-{
-    return get_impl()->is_ready();
-}*/
-
 /**
  * Creates a new host communicator with externally provided size, rank and kvs.
  * Implementation is platform specific and non portable.
diff --git a/src/communicator_impl_details.hpp b/src/communicator_impl_details.hpp
index 8953148b3..3bee3e82b 100644
--- a/src/communicator_impl_details.hpp
+++ b/src/communicator_impl_details.hpp
@@ -62,40 +62,12 @@ struct comm_impl_base_dispatch {
         //     throw ccl::unimplemented("API", "create_communicators", "for multiple devices");
         // }
     }
-
-    template <class DeviceType, class ContextType>
-    static vector_class<communicator> create_communicators_selector(
-        const size_t cluster_devices_size, /*global devices count*/
-        const vector_class<pair_class<int, DeviceType>>& local_rank_device_map,
-        const ContextType& context,
-        shared_ptr_class<ikvs_wrapper> kvs) {
-        map_class<int, DeviceType> converted_map;
-        std::transform(local_rank_device_map.begin(),
-                       local_rank_device_map.end(),
-                       std::inserter(converted_map, converted_map.end()),
-                       [](const pair_class<int, DeviceType>& val) {
-                           return std::make_pair(val.first, val.second);
-                       });
-        if (local_rank_device_map.size() != converted_map.size()) {
-            std::stringstream ss;
-            ss << "found duplicated ranks in `local_rank_device_map`:\n";
-            for (const auto& v : local_rank_device_map) {
-                ss << std::to_string(v.first) << ", ";
-            }
-            throw ccl::invalid_argument("API", "create_communicators", ss.str());
-        }
-        return impl::template create_communicators_selector(
-            cluster_devices_size,
-            converted_map,
-            context_extractor<ContextType>::extract(context),
-            kvs);
-    }
 };
 
 template <cl_backend_type type>
 struct comm_impl_dispatch_selector {};
 
-#if !defined(CCL_ENABLE_SYCL) and !defined(MULTI_GPU_SUPPORT)
+#if !defined(CCL_ENABLE_SYCL) and !defined(CCL_ENABLE_ZE)
 template <>
 struct comm_impl_dispatch_selector<cl_backend_type::empty_backend>
         : public comm_impl_base_dispatch<
@@ -103,19 +75,6 @@ struct comm_impl_dispatch_selector<cl_backend_type::empty_backend>
     using base_t =
         comm_impl_base_dispatch<comm_impl_dispatch_selector<cl_backend_type::empty_backend>>;
 
-    template <class DeviceType, class ContextType>
-    static vector_class<communicator> create_communicators_selector(
-        const size_t cluster_devices_size, /*global devices count*/
-        const vector_class<pair_class<int, DeviceType>>& local_rank_device_map,
-        const ContextType& context,
-        shared_ptr_class<ikvs_wrapper> kvs) {
-        return base_t::template create_communicators_selector<DeviceType>(
-            cluster_devices_size,
-            local_rank_device_map,
-            context_extractor<ContextType>::extract(context),
-            kvs);
-    }
-
     template <class DeviceType, class ContextType>
     static vector_class<communicator> create_communicators_selector(
         const size_t cluster_devices_size, /*global devices count*/
@@ -143,67 +102,13 @@ struct comm_impl_dispatch_selector<cl_backend_type::empty_backend>
 };
 #endif
 
-#if defined(CCL_ENABLE_SYCL) and !defined(MULTI_GPU_SUPPORT)
+#if defined(CCL_ENABLE_SYCL) and !defined(CCL_ENABLE_ZE)
 template <>
 struct comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl>
         : public comm_impl_base_dispatch<comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl>> {
     using base_t =
         comm_impl_base_dispatch<comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl>>;
 
-    template <class DeviceType, class ContextType>
-    static vector_class<communicator> create_communicators_selector(
-        const size_t cluster_devices_size, /*global devices count*/
-        const vector_class<pair_class<int, DeviceType>>& local_rank_device_map,
-        const ContextType& context,
-        shared_ptr_class<ikvs_wrapper> kvs) {
-        return base_t::template create_communicators_selector<DeviceType>(
-            cluster_devices_size,
-            local_rank_device_map,
-            context_extractor<ContextType>::extract(context),
-            kvs);
-    }
-
-    template <class ContextType>
-    static vector_class<communicator> create_communicators_selector(
-        const size_t cluster_devices_size, /*global devices count*/
-        const map_class<int, ccl::device_index_type>& local_rank_device_map,
-        const ContextType& context,
-        shared_ptr_class<ikvs_wrapper> kvs) {
-        base_t::validate_contract(cluster_devices_size, local_rank_device_map.size());
-
-        map_class<int, cl::sycl::device> converted_device_map;
-        std::transform(local_rank_device_map.begin(),
-                       local_rank_device_map.end(),
-                       std::inserter(converted_device_map, converted_device_map.end()),
-                       [](const typename map_class<int, ccl::device_index_type>::value_type& val) {
-                           return std::make_pair(val.first,
-                                                 ccl::unified_device_type{ val.second }.get());
-                       });
-        return create_communicators_selector(cluster_devices_size,
-                                             converted_device_map,
-                                             context_extractor<ContextType>::extract(context),
-                                             kvs);
-    }
-
-    template <class ContextType>
-    static vector_class<communicator> create_communicators_selector(
-        const size_t cluster_devices_size, /*global devices count*/
-        const map_class<int, ccl::device>& local_rank_device_map,
-        const ContextType& context,
-        shared_ptr_class<ikvs_wrapper> kvs) {
-        map_class<int, cl::sycl::device> converted_device_map;
-        std::transform(local_rank_device_map.begin(),
-                       local_rank_device_map.end(),
-                       std::inserter(converted_device_map, converted_device_map.end()),
-                       [](const typename map_class<int, ccl::device>::value_type& val) {
-                           return std::make_pair(val.first, val.second.get_native());
-                       });
-        return create_communicators_selector(cluster_devices_size,
-                                             converted_device_map,
-                                             context_extractor<ContextType>::extract(context),
-                                             kvs);
-    }
-
     template <class ContextType>
     static vector_class<communicator> create_communicators_selector(
         const size_t cluster_devices_size, /*global devices count*/
@@ -287,8 +192,8 @@ struct comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl>
             "Create single device communicator from SYCL device (sycl and !mgpu), after find_if rank ",
             rank);
 
-        std::shared_ptr<atl_wrapper> atl =
-            std::shared_ptr<atl_wrapper>(new atl_wrapper(cluster_devices_size, { rank }, kvs));
+        std::shared_ptr<atl_base_comm> atl =
+            atl_comm_manager::create_atl_comm(cluster_devices_size, { rank }, kvs);
 
         ccl::communicator_interface_ptr impl =
             ccl::communicator_interface::create_communicator_impl(device,
@@ -302,241 +207,25 @@ struct comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl>
         ccl::vector_class<ccl::communicator> ret;
         ret.push_back(ccl::communicator(std::move(impl)));
         return ret;
-
-        /*
-    //collect ranks
-    vector_class<int> local_thread_ranks;
-    local_thread_ranks.reserve(local_rank_device_map.size());
-    std::transform(
-        local_rank_device_map.begin(),
-        local_rank_device_map.end(),
-        std::back_inserter(local_thread_ranks),
-        [](const typename vector_class<pair_class<int, DeviceType>>::value_type& val) {
-            return val.first;
-        });
-
-    vector_class<DeviceType> local_thread_devices;
-    local_thread_devices.reserve(local_rank_device_map.size());
-    std::transform(
-        local_rank_device_map.begin(),
-        local_rank_device_map.end(),
-        std::back_inserter(local_thread_devices),
-        [](const typename vector_class<pair_class<int, DeviceType>>::value_type& val) {
-            return val.second;
-        });
-    if ()
-    auto ret = thread_group->create_communicators_group(local_thread_devices);
-    return ret;
-    return {};
-    */
     }
 };
-#endif //defined(CCL_ENABLE_SYCL) and !defined(MULTI_GPU_SUPPORT)
+#endif //defined(CCL_ENABLE_SYCL) and !defined(CCL_ENABLE_ZE)
 
-#if defined(MULTI_GPU_SUPPORT) and !defined(CCL_ENABLE_SYCL)
+#if defined(CCL_ENABLE_ZE) and !defined(CCL_ENABLE_SYCL)
 template <>
 struct comm_impl_dispatch_selector<cl_backend_type::l0>
         : public comm_impl_base_dispatch<comm_impl_dispatch_selector<cl_backend_type::l0>> {
     using base_t = comm_impl_base_dispatch<comm_impl_dispatch_selector<cl_backend_type::l0>>;
-
-    template <class DeviceType, class ContextType>
-    static vector_class<communicator> create_communicators_selector(
-        const size_t cluster_devices_size, /*global devices count*/
-        const vector_class<pair_class<int, DeviceType>>& local_rank_device_map,
-        const ContextType& context,
-        shared_ptr_class<ikvs_wrapper> kvs) {
-        return base_t::template create_communicators_selector<DeviceType>(
-            cluster_devices_size,
-            local_rank_device_map,
-            context_extractor<ContextType>::extract(context),
-            kvs);
-    }
-
-    template <class ContextType>
-    static vector_class<communicator> create_communicators_selector(
-        const size_t cluster_devices_size, /*global devices count*/
-        const map_class<int, ccl::device>& local_rank_device_map,
-        const ContextType& context,
-        shared_ptr_class<ikvs_wrapper> kvs) {
-        map_class<int, ccl::device_index_type> converted_device_map;
-        std::transform(local_rank_device_map.begin(),
-                       local_rank_device_map.end(),
-                       std::inserter(converted_device_map, converted_device_map.end()),
-                       [](const typename map_class<int, ccl::device>::value_type& val) {
-                           return std::make_pair(val.first,
-                                                 val.second.get_native()->get_device_path());
-                       });
-        return create_communicators_selector(cluster_devices_size,
-                                             converted_device_map,
-                                             context_extractor<ContextType>::extract(context),
-                                             kvs);
-    }
-
-    template <class ContextType>
-    static vector_class<communicator> create_communicators_selector(
-        const size_t cluster_devices_size, /*global devices count*/
-        const map_class<int, typename unified_device_type::ccl_native_t>& local_rank_device_map,
-        const ContextType& context,
-        shared_ptr_class<ikvs_wrapper> kvs) {
-        map_class<int, ccl::device_index_type> converted_device_map;
-        std::transform(
-            local_rank_device_map.begin(),
-            local_rank_device_map.end(),
-            std::inserter(converted_device_map, converted_device_map.end()),
-            [](const typename map_class<int, typename unified_device_type::ccl_native_t>::
-                   value_type& val) {
-                return std::make_pair(val.first, val.second->get_device_path());
-            });
-        return create_communicators_selector(cluster_devices_size,
-                                             converted_device_map,
-                                             context_extractor<ContextType>::extract(context),
-                                             kvs);
-    }
-
-    template <class ContextType>
-    static vector_class<communicator> create_communicators_selector(
-        const size_t cluster_devices_size, /*global devices count*/
-        const map_class<int, ccl::device_index_type>& local_rank_device_map,
-        const ContextType& context,
-        shared_ptr_class<ikvs_wrapper> kvs) {
-        base_t::validate_contract(cluster_devices_size, local_rank_device_map.size());
-
-        //collect ranks
-        vector_class<int> local_thread_ranks;
-        local_thread_ranks.reserve(local_rank_device_map.size());
-        std::transform(local_rank_device_map.begin(),
-                       local_rank_device_map.end(),
-                       std::back_inserter(local_thread_ranks),
-                       [](const typename map_class<int, ccl::device_index_type>::value_type& val) {
-                           return val.first;
-                       });
-        group_context::comm_group_t thread_group =
-            group_context::instance().group_by_kvs(local_thread_ranks, cluster_devices_size, kvs);
-
-        vector_class<ccl::device_index_type> local_thread_devices;
-        local_thread_devices.reserve(local_rank_device_map.size());
-        std::transform(local_rank_device_map.begin(),
-                       local_rank_device_map.end(),
-                       std::back_inserter(local_thread_devices),
-                       [](const typename map_class<int, ccl::device_index_type>::value_type& val) {
-                           return val.second;
-                       });
-
-        auto ret = thread_group->create_communicators_group(
-            local_thread_devices, context_extractor<ContextType>::extract(context));
-        return ret;
-    }
 };
-#endif //defined(MULTI_GPU_SUPPORT) and !defined(CCL_ENABLE_SYCL)
+#endif //defined(CCL_ENABLE_ZE) and !defined(CCL_ENABLE_SYCL)
 
-#if defined(MULTI_GPU_SUPPORT) and defined(CCL_ENABLE_SYCL)
+#if defined(CCL_ENABLE_ZE) and defined(CCL_ENABLE_SYCL)
 template <>
 struct comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl_l0>
         : public comm_impl_base_dispatch<
               comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl_l0>> {
     using base_t =
         comm_impl_base_dispatch<comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl_l0>>;
-
-    template <class DeviceType, class ContextType>
-    static vector_class<communicator> create_communicators_selector(
-        const size_t cluster_devices_size, /*global devices count*/
-        const vector_class<pair_class<int, DeviceType>>& local_rank_device_map,
-        const ContextType& context,
-        shared_ptr_class<ikvs_wrapper> kvs) {
-        return base_t::template create_communicators_selector<DeviceType>(
-            cluster_devices_size,
-            local_rank_device_map,
-            context_extractor<ContextType>::extract(context),
-            kvs);
-    }
-
-    template <class ContextType>
-    static vector_class<communicator> create_communicators_selector(
-        const size_t cluster_devices_size, /*global devices count*/
-        const map_class<int, ccl::device>& local_rank_device_map,
-        const ContextType& context,
-        shared_ptr_class<ikvs_wrapper> kvs) {
-        map_class<int, cl::sycl::device> converted_device_map;
-        std::transform(local_rank_device_map.begin(),
-                       local_rank_device_map.end(),
-                       std::inserter(converted_device_map, converted_device_map.end()),
-                       [](const typename map_class<int, ccl::device>::value_type& val) {
-                           return std::make_pair(val.first, val.second.get_native());
-                       });
-        return create_communicators_selector(cluster_devices_size,
-                                             converted_device_map,
-                                             context_extractor<ContextType>::extract(context),
-                                             kvs);
-    }
-
-    // TODO: try to combine these 2 overload below
-    template <class ContextType>
-    static vector_class<communicator> create_communicators_selector(
-        const size_t cluster_devices_size, /*global devices count*/
-        const map_class<int, cl::sycl::device>& local_rank_device_map,
-        const ContextType& context,
-        shared_ptr_class<ikvs_wrapper> kvs) {
-        base_t::validate_contract(cluster_devices_size, local_rank_device_map.size());
-
-        //collect ranks
-        vector_class<int> local_thread_ranks;
-        local_thread_ranks.reserve(local_rank_device_map.size());
-        std::transform(local_rank_device_map.begin(),
-                       local_rank_device_map.end(),
-                       std::back_inserter(local_thread_ranks),
-                       [](const typename map_class<int, cl::sycl::device>::value_type& val) {
-                           return val.first;
-                       });
-        group_context::comm_group_t thread_group =
-            group_context::instance().group_by_kvs(local_thread_ranks, cluster_devices_size, kvs);
-
-        vector_class<cl::sycl::device> local_thread_devices;
-        local_thread_devices.reserve(local_rank_device_map.size());
-        std::transform(local_rank_device_map.begin(),
-                       local_rank_device_map.end(),
-                       std::back_inserter(local_thread_devices),
-                       [](const typename map_class<int, cl::sycl::device>::value_type& val) {
-                           return val.second;
-                       });
-
-        auto ret = thread_group->create_communicators_group(
-            local_thread_devices, context_extractor<ContextType>::extract(context));
-        return ret;
-    }
-
-    template <class ContextType>
-    static vector_class<communicator> create_communicators_selector(
-        const size_t cluster_devices_size, /*global devices count*/
-        const map_class<int, ccl::device_index_type>& local_rank_device_map,
-        const ContextType& context,
-        shared_ptr_class<ikvs_wrapper> kvs) {
-        base_t::validate_contract(cluster_devices_size, local_rank_device_map.size());
-
-        //collect ranks
-        vector_class<int> local_thread_ranks;
-        local_thread_ranks.reserve(local_rank_device_map.size());
-        std::transform(local_rank_device_map.begin(),
-                       local_rank_device_map.end(),
-                       std::back_inserter(local_thread_ranks),
-                       [](const typename map_class<int, ccl::device_index_type>::value_type& val) {
-                           return val.first;
-                       });
-        group_context::comm_group_t thread_group =
-            group_context::instance().group_by_kvs(local_thread_ranks, cluster_devices_size, kvs);
-
-        vector_class<ccl::device_index_type> local_thread_devices;
-        local_thread_devices.reserve(local_rank_device_map.size());
-        std::transform(local_rank_device_map.begin(),
-                       local_rank_device_map.end(),
-                       std::back_inserter(local_thread_devices),
-                       [](const typename map_class<int, ccl::device_index_type>::value_type& val) {
-                           return val.second;
-                       });
-
-        auto ret = thread_group->create_communicators_group(
-            local_thread_devices, context_extractor<ContextType>::extract(context));
-        return ret;
-    }
 };
-#endif //defined(MULTI_GPU_SUPPORT) and defined(CCL_ENABLE_SYCL)
+#endif //defined(CCL_ENABLE_ZE) and defined(CCL_ENABLE_SYCL)
 } // namespace ccl
diff --git a/src/comp/bf16/bf16_utils.hpp b/src/comp/bf16/bf16_utils.hpp
index 212b74491..06d845d68 100644
--- a/src/comp/bf16/bf16_utils.hpp
+++ b/src/comp/bf16/bf16_utils.hpp
@@ -49,8 +49,8 @@ __attribute__((__always_inline__)) inline std::set<ccl_bf16_impl_type> ccl_bf16_
     __asm__ __volatile__("cpuid"
                          : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
                          : "a"(7), "c"(0));
-    is_avx512f_enabled =
-        ((reg[1] & (1 << 16)) >> 16) & ((reg[1] & (1 << 30)) >> 30) & ((reg[1] & (1 << 31)) >> 31);
+    is_avx512f_enabled = ((reg[1] & (1u << 16)) >> 16) & ((reg[1] & (1u << 30)) >> 30) &
+                         ((reg[1] & (1u << 31)) >> 31);
 
 #ifdef CCL_BF16_AVX512BF_COMPILER
     /* capabilities for optimized BF16/FP32 conversions */
diff --git a/src/comp/comp.cpp b/src/comp/comp.cpp
index 1d9f8645e..b8976e970 100644
--- a/src/comp/comp.cpp
+++ b/src/comp/comp.cpp
@@ -19,6 +19,7 @@
 #include "common/log/log.hpp"
 #include "common/global/global.hpp"
 #include "common/utils/enums.hpp"
+#include "common/utils/memcpy.hpp"
 #include "common/utils/sycl_utils.hpp"
 #include "oneapi/ccl/types.hpp"
 #include "sched/queue/queue.hpp"
@@ -56,13 +57,15 @@
         } \
     } while (0)
 
-ccl::status ccl_comp_copy(const void* in_buf,
-                          void* out_buf,
-                          size_t count,
-                          const ccl_datatype& dtype) {
+ccl::status ccl_comp_copy(const void* in_buf, void* out_buf, size_t bytes, bool use_nontemporal) {
     CCL_ASSERT(in_buf, "in_buf is null");
     CCL_ASSERT(out_buf, "out_buf is null");
-    CCL_MEMCPY(out_buf, in_buf, count * dtype.size());
+    if (use_nontemporal) {
+        ccl::memcpy_nontemporal(out_buf, in_buf, bytes);
+    }
+    else {
+        ccl::memcpy(out_buf, in_buf, bytes);
+    }
     return ccl::status::success;
 }
 
@@ -147,13 +150,16 @@ ccl::status ccl_comp_reduce(ccl_sched* sched,
     void* host_inout_buf = inout_buf;
     size_t bytes = in_count * dtype.size();
 
+    ccl::alloc_param alloc_param(bytes, ccl::buffer_type::sycl, ccl::buffer_place::host, false);
+    ccl::dealloc_param dealloc_param(nullptr, bytes, ccl::buffer_type::sycl);
+
     if (in_ptr_type == sycl::usm::alloc::device) {
-        host_in_buf = sched->alloc_buffer_unmanaged(bytes, ccl_sched_buf_runtime);
+        host_in_buf = sched->alloc_buffer(alloc_param).get_ptr();
         q->memcpy(host_in_buf, in_buf, bytes).wait();
     }
 
     if (inout_ptr_type == sycl::usm::alloc::device) {
-        host_inout_buf = sched->alloc_buffer_unmanaged(bytes, ccl_sched_buf_runtime);
+        host_inout_buf = sched->alloc_buffer(alloc_param).get_ptr();
         q->memcpy(host_inout_buf, inout_buf, bytes).wait();
     }
 
@@ -161,12 +167,14 @@ ccl::status ccl_comp_reduce(ccl_sched* sched,
         host_in_buf, in_count, host_inout_buf, out_count, dtype, reduction, reduction_fn, context);
 
     if (host_in_buf != in_buf) {
-        sched->free_buffer_unmanaged(host_in_buf, bytes, ccl_sched_buf_runtime);
+        dealloc_param.ptr = host_in_buf;
+        sched->dealloc_buffer(dealloc_param);
     }
 
     if (host_inout_buf != inout_buf) {
         q->memcpy(inout_buf, host_inout_buf, bytes).wait();
-        sched->free_buffer_unmanaged(host_inout_buf, bytes, ccl_sched_buf_runtime);
+        dealloc_param.ptr = host_inout_buf;
+        sched->dealloc_buffer(dealloc_param);
     }
 
     return ccl::status::success;
diff --git a/src/comp/comp.hpp b/src/comp/comp.hpp
index fd9124d51..7867bfda9 100644
--- a/src/comp/comp.hpp
+++ b/src/comp/comp.hpp
@@ -23,7 +23,8 @@
 ccl::status ccl_comp_copy(const void* in_buf,
                           void* out_buf,
-                          size_t count,
-                          const ccl_datatype& dtype);
+                          size_t bytes, /* size of the copy in bytes, not an element count */
+                          bool use_nontemporal = false);
+
 ccl::status ccl_comp_reduce(ccl_sched* sched,
                             const void* in_buf,
                             size_t in_count,
@@ -33,6 +34,7 @@ ccl::status ccl_comp_reduce(ccl_sched* sched,
                             ccl::reduction reduction,
                             ccl::reduction_fn reduction_fn,
                             const ccl::fn_context* context = nullptr);
+
 ccl::status ccl_comp_batch_reduce(const void* in_buf,
                                   const std::vector<size_t>& offsets,
                                   size_t in_count,
@@ -45,4 +47,5 @@ ccl::status ccl_comp_batch_reduce(const void* in_buf,
                                   int bf16_keep_precision_mode,
                                   float* tmp,
                                   float* acc);
+
 const char* ccl_reduction_to_str(ccl::reduction type);
diff --git a/src/comp/fp16/fp16_utils.hpp b/src/comp/fp16/fp16_utils.hpp
index 315060c80..0aefc8c8a 100644
--- a/src/comp/fp16/fp16_utils.hpp
+++ b/src/comp/fp16/fp16_utils.hpp
@@ -42,7 +42,8 @@ __attribute__((__always_inline__)) inline std::set<ccl_fp16_impl_type> ccl_fp16_
 
     uint32_t reg[4];
 
-    /* AVX2 capabilities for FP16 implementation */
+    /* F16C capabilities for FP16 implementation */
+    /* CPUID.(EAX=01H):ECX.F16C [bit 29] */
     __asm__ __volatile__("cpuid" : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) : "a"(1));
     is_f16c_enabled = (reg[2] & (1 << 29)) >> 29;
 
@@ -53,8 +54,8 @@ __attribute__((__always_inline__)) inline std::set<ccl_fp16_impl_type> ccl_fp16_
     __asm__ __volatile__("cpuid"
                          : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
                          : "a"(7), "c"(0));
-    is_avx512f_enabled =
-        ((reg[1] & (1 << 16)) >> 16) & ((reg[1] & (1 << 30)) >> 30) & ((reg[1] & (1 << 31)) >> 31);
+    is_avx512f_enabled = ((reg[1] & (1u << 16)) >> 16) & ((reg[1] & (1u << 30)) >> 30) &
+                         ((reg[1] & (1u << 31)) >> 31);
 
     if (is_avx512f_enabled)
         result.insert(ccl_fp16_avx512f);
diff --git a/src/environment_impl.hpp b/src/environment_impl.hpp
index 2d117c714..30b03bb95 100644
--- a/src/environment_impl.hpp
+++ b/src/environment_impl.hpp
@@ -26,8 +26,6 @@
 #include "common/comm/comm_split_common_attr.hpp"
 #include "comm_split_attr_impl.hpp"
 
-#include "common/comm/l0/comm_context_storage.hpp"
-
 #include "stream_impl.hpp"
 
 #include "common/global/global.hpp"
diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp
index 2dc64ed0c..070835227 100644
--- a/src/exec/exec.cpp
+++ b/src/exec/exec.cpp
@@ -57,6 +57,7 @@ atl_attr_t ccl_executor::generate_atl_attr(const ccl::env_data& env) {
     attr.in.mnic_type = env.mnic_type;
     attr.in.mnic_name = env.mnic_name_raw;
     attr.in.mnic_count = env.mnic_count;
+    attr.in.mnic_offset = env.mnic_offset;
 
     memset(&attr.out, 0, sizeof(attr.out));
 
@@ -79,15 +80,15 @@ ccl_executor::ccl_executor(const char* main_addr) {
                             : &ccl_executor::get_worker_idx_round_robin;
 
     /* generate ATL attr for all future communicators */
-    atl_wrapper::attr = generate_atl_attr(env);
-    atl_wrapper::set_exec(this);
+    atl_comm_manager::set_internal_env(generate_atl_attr(env));
+    atl_comm_manager::set_exec(this);
 }
 
 void ccl_executor::start_workers(int proc_idx, int proc_count) {
     set_local_coord(proc_idx, proc_count);
     auto& env = ccl::global_data::env();
     CCL_THROW_IF_NOT(env.env_2_worker_affinity(get_local_proc_idx(), get_local_proc_count()));
-    CCL_THROW_IF_NOT(env.env_2_worker_mem_affinity());
+    CCL_THROW_IF_NOT(env.env_2_worker_mem_affinity(get_local_proc_count()));
     start_workers();
 }
 
@@ -217,7 +218,7 @@ void ccl_executor::update_workers() {
 //    }
 //
 //    if (resize_func != NULL)
-//        ccl::global_data::get().atl->atl_set_resize_function((atl_resize_fn_t)resize_func);
+//        ccl::global_data::get().atl->set_resize_function((atl_resize_fn_t)resize_func);
 //
 //    /* pin listener thread together with first worker thread */
 //    auto worker_affinity = ccl::global_data::env().worker_affinity;
@@ -234,7 +235,7 @@ void ccl_executor::update_workers() {
 //}
 
 void ccl_executor::start(ccl_extra_sched* extra_sched) {
-    CCL_ASSERT(extra_sched->internal_type == ccl_sched_internal_unordered_coll,
+    CCL_ASSERT(extra_sched->sched_type == ccl_sched_unordered_coll,
                "should be unordered_coll for now");
 
     extra_sched->set_counter(1);
diff --git a/src/exec/exec.hpp b/src/exec/exec.hpp
index 43dbe9c61..c82350e45 100644
--- a/src/exec/exec.hpp
+++ b/src/exec/exec.hpp
@@ -15,7 +15,7 @@
 */
 #pragma once
 
-#include "atl/atl.h"
+#include "atl/atl_base_comm.hpp"
 #include "coll/coll.hpp"
 #include "common/global/global.hpp"
 #include "common/request/request.hpp"
diff --git a/src/exec/thread/listener.cpp b/src/exec/thread/listener.cpp
index 3c5093860..7bf725764 100644
--- a/src/exec/thread/listener.cpp
+++ b/src/exec/thread/listener.cpp
@@ -31,13 +31,13 @@
 //
 //    while (true) {
 //        /*
-//         * atl_wait_notification return values:
+//         * wait_notification return values:
 //         * 0 - got notification, should do some updates
 //         * 1 - finished by timeout, should check whether thread should be stopped
 //                                    in another case should recall this function
 //         * TODO: replace numbers by enum values
 //         * */
-//        res = global_data.atl->atl_wait_notification();
+//        res = global_data.atl->wait_notification();
 //
 //        if (res == 1) {
 //            if (listener->should_stop.load(std::memory_order_acquire))
@@ -49,7 +49,7 @@
 //        ccl_executor::worker_guard guard = global_data.executor->get_worker_lock();
 //
 //        global_data.reset_resize_dependent_objects();
-//        global_data.atl->atl_update();
+//        global_data.atl->update();
 //        global_data.init_resize_dependent_objects();
 //
 //        global_data.executor->update_workers();
diff --git a/src/exec/thread/worker.cpp b/src/exec/thread/worker.cpp
index e39bfcee4..f3c645242 100644
--- a/src/exec/thread/worker.cpp
+++ b/src/exec/thread/worker.cpp
@@ -174,7 +174,7 @@ ccl::status ccl_worker::process_sched_bin(ccl_sched_bin* bin, size_t& completed_
         for (size_t sched_idx = 0; sched_idx < 1; sched_idx++) {
             ccl_sched* sched = bin->get(sched_idx);
             ccl_comm* comm = sched->coll_param.comm;
-            atl_status_t atl_status = comm->atl->atl_ep_poll(bin->get_atl_ep());
+            atl_status_t atl_status = comm->get_atl_comm()->poll(bin->get_atl_ep());
             CCL_THROW_IF_NOT(atl_status == ATL_STATUS_SUCCESS, "bad status ", atl_status);
         }
     }
diff --git a/src/fusion/fusion.cpp b/src/fusion/fusion.cpp
index d555b90bf..85cadee4d 100644
--- a/src/fusion/fusion.cpp
+++ b/src/fusion/fusion.cpp
@@ -15,7 +15,7 @@
 */
 #include "exec/exec.hpp"
 #include "fusion/fusion.hpp"
-#include "sched/buffer_cache.hpp"
+#include "sched/buffer/buffer_cache.hpp"
 #include "sched/cache/cache.hpp"
 #include "sched/entry/factory/entry_factory.hpp"
 
@@ -95,7 +95,7 @@ void ccl_fusion_manager::reset() {
 }
 
 bool ccl_fusion_manager::can_fuse(ccl_master_sched* sched) {
-    if (atl_wrapper::attr.out.enable_hmem) {
+    if (atl_base_comm::attr.out.enable_hmem) {
         /* TODO: implement fusion with D2D copies */
         return false;
     }
@@ -197,8 +197,7 @@ ccl_master_sched* ccl_fusion_manager::build_sched() {
                                                                                    coll_attr,
                                                                                    comm,
                                                                                    stream);
-                sched = new ccl_master_sched(coll_param);
-                sched->internal_type = ccl_sched_internal_fusion;
+                sched = new ccl_master_sched({ ccl_sched_fusion, coll_param });
             } break;
             default: CCL_FATAL("not supported"); break;
         }
@@ -290,7 +289,7 @@ ccl_master_sched* ccl_fusion_manager::build_sched() {
             size_t global_copy_idx = idx * copies_per_part + copy_idx;
 #ifdef CCL_ENABLE_SYCL
             if (stream && stream->is_sycl_device_stream())
-                entry_factory::make_entry<copy_entry>(
+                entry_factory::create<copy_entry>(
                     part_scheds[idx].get(),
                     ccl_buffer(
                         exec_queue[global_copy_idx]->coll_param.get_send_buf_ptr(
@@ -303,7 +302,7 @@ ccl_master_sched* ccl_fusion_manager::build_sched() {
                     copy_attr(copy_direction::d2h));
             else
 #endif // CCL_ENABLE_SYCL
-                entry_factory::make_entry<copy_entry>(
+                entry_factory::create<copy_entry>(
                     part_scheds[idx].get(),
                     ccl_buffer(
                         exec_queue[global_copy_idx]->coll_param.get_send_buf_ptr(),
@@ -330,7 +329,7 @@ ccl_master_sched* ccl_fusion_manager::build_sched() {
             size_t global_copy_idx = idx * copies_per_part + copy_idx;
 #ifdef CCL_ENABLE_SYCL
             if (stream && stream->is_sycl_device_stream())
-                entry_factory::make_entry<copy_entry>(
+                entry_factory::create<copy_entry>(
                     part_scheds[idx].get(),
                     ccl_buffer(fusion_buf, buffer_size, offset),
                     ccl_buffer(
@@ -343,7 +342,7 @@ ccl_master_sched* ccl_fusion_manager::build_sched() {
                     copy_attr(copy_direction::h2d));
             else
 #endif // CCL_ENABLE_SYCL
-                entry_factory::make_entry<copy_entry>(
+                entry_factory::create<copy_entry>(
                     part_scheds[idx].get(),
                     ccl_buffer(fusion_buf, buffer_size, offset),
                     ccl_buffer(
@@ -356,7 +355,7 @@ ccl_master_sched* ccl_fusion_manager::build_sched() {
             part_scheds[idx]->add_barrier();
 
             offset += exec_queue[global_copy_idx]->coll_param.get_recv_count() * dtype_size;
-            entry_factory::make_entry<function_entry>(
+            entry_factory::create<function_entry>(
                 part_scheds[idx].get(), complete_user_request, exec_queue[global_copy_idx]);
             CCL_THROW_IF_NOT(!exec_queue[global_copy_idx]->is_completed(),
                              "incorrect completion counter");
@@ -369,8 +368,7 @@ ccl_master_sched* ccl_fusion_manager::build_sched() {
         part_scheds[0]->set_finalize_fn(release_fusion_buf_for_cached_sched, fusion_buf);
     }
     else {
-        entry_factory::make_entry<function_entry>(
-            part_scheds[0].get(), release_fusion_buf, fusion_buf);
+        entry_factory::create<function_entry>(part_scheds[0].get(), release_fusion_buf, fusion_buf);
     }
 
     clear_exec_queue();
@@ -420,7 +418,7 @@ void ccl_fusion_manager::execute() {
 
             for (auto it = postponed_queue.begin(); it != postponed_queue.end();) {
                 auto s = *it;
-                if (s->coll_param.dtype.idx() == first_sched->coll_param.dtype.idx() &&
+                if (s->coll_param.dtype == first_sched->coll_param.dtype &&
                     s->coll_param.comm == first_sched->coll_param.comm &&
                     s->coll_param.ctype == first_sched->coll_param.ctype &&
                     s->coll_param.reduction == first_sched->coll_param.reduction &&
diff --git a/src/hwloc/hwloc_wrapper.cpp b/src/hwloc/hwloc_wrapper.cpp
index 514d5bc94..b37ce9c60 100644
--- a/src/hwloc/hwloc_wrapper.cpp
+++ b/src/hwloc/hwloc_wrapper.cpp
@@ -128,11 +128,11 @@ std::string ccl_hwloc_wrapper::to_string() {
         ss << "{\n";
         ss << "  membind_thread_supported: " << membind_thread_supported << "\n";
         for (auto& node : numa_nodes) {
-            ss << "  numa: {"
+            ss << "  numa: { "
                << "idx: " << node.idx << ", os idx: " << node.os_idx
                << ", memory: " << node.mem_in_mb << " MB"
                << ", cores: " << node.core_count << ", cpus: " << node.cpus.size()
-               << ", membind: " << node.membind_support << "}\n";
+               << ", membind: " << node.membind_support << " }\n";
         }
         ss << "}";
     }
@@ -159,6 +159,7 @@ bool ccl_hwloc_wrapper::is_dev_close_by_pci(int domain, int bus, int dev, int fu
     CCL_THROW_IF_NOT(first_non_io);
 
     LOG_DEBUG("first_non_io object: ", obj_to_string(first_non_io));
+    LOG_DEBUG("pci info: [", domain, ":", bus, ":", dev, ":", func, "]");
 
     /* determine if PCI device is "close" to process by checking if process's affinity is included
      * in PCI device's affinity or if PCI device's affinity is included in process's affinity */
diff --git a/src/native_device_api/compiler_ccl_wrappers_dispatcher.hpp b/src/native_device_api/compiler_ccl_wrappers_dispatcher.hpp
index ebf887327..4007ec93d 100644
--- a/src/native_device_api/compiler_ccl_wrappers_dispatcher.hpp
+++ b/src/native_device_api/compiler_ccl_wrappers_dispatcher.hpp
@@ -15,13 +15,14 @@
 */
 #pragma once
 
-#if defined(MULTI_GPU_SUPPORT)
+#if defined(CCL_ENABLE_ZE)
 #include "oneapi/ccl/native_device_api/export_api.hpp"
 #include "oneapi/ccl/native_device_api/l0/declarations.hpp"
 #include "oneapi/ccl/type_traits.hpp"
 
 #ifdef CCL_ENABLE_SYCL
 #include <CL/sycl/backend/level_zero.hpp>
+#include "common/utils/sycl_utils.hpp"
 //static cl::sycl::vector_class<cl::sycl::device> gpu_sycl_devices;
 #endif
 
@@ -69,7 +70,7 @@ template <class ContextType>
     static_assert(
         std::is_same<typename std::remove_cv<ContextType>::type, cl::sycl::context>::value,
         "Invalid ContextType");
-    auto l0_handle_ptr = ctx.template get_native<cl::sycl::backend::level_zero>();
+    auto l0_handle_ptr = sycl::get_native<ccl::utils::get_level_zero_backend()>(ctx);
     if (!l0_handle_ptr) {
         CCL_THROW("failed for sycl context: handle is nullptr");
     }
@@ -93,4 +94,4 @@ template native::ccl_device_driver::device_ptr native::get_runtime_device(
     const cl::sycl::device& device);
 #endif
 
-#endif //#if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
+#endif //#if defined(CCL_ENABLE_ZE)
diff --git a/src/native_device_api/empty/export.cpp b/src/native_device_api/empty/export.cpp
index a2fab125e..648c6cf9b 100644
--- a/src/native_device_api/empty/export.cpp
+++ b/src/native_device_api/empty/export.cpp
@@ -14,7 +14,7 @@
  limitations under the License.
 */
 #include "oneapi/ccl/config.h"
-#if !defined(MULTI_GPU_SUPPORT) and !defined(CCL_ENABLE_SYCL)
+#if !defined(CCL_ENABLE_ZE) and !defined(CCL_ENABLE_SYCL)
 
 #include "oneapi/ccl/native_device_api/empty/export.hpp"
 #include "oneapi/ccl/type_traits.hpp"
@@ -36,4 +36,4 @@ generic_context_type<cl_backend_type::empty_backend>::get() const noexcept {
 }
 } // namespace ccl
 
-#endif //#if !defined(MULTI_GPU_SUPPORT) and !defined(CCL_ENABLE_SYCL)
+#endif //#if !defined(CCL_ENABLE_ZE) and !defined(CCL_ENABLE_SYCL)
diff --git a/src/native_device_api/interop_utils.cpp b/src/native_device_api/interop_utils.cpp
index cba21295b..c1ba658d8 100644
--- a/src/native_device_api/interop_utils.cpp
+++ b/src/native_device_api/interop_utils.cpp
@@ -17,18 +17,19 @@
 #include "common/log/log.hpp"
 #include "common/utils/enums.hpp"
 
-#if defined(MULTI_GPU_SUPPORT)
+#if defined(CCL_ENABLE_ZE)
 #include "oneapi/ccl/native_device_api/l0/primitives.hpp"
 #endif
 
-#if defined(MULTI_GPU_SUPPORT) && defined(CCL_ENABLE_SYCL)
+#if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL)
 #include <CL/sycl/backend/level_zero.hpp>
 #include "oneapi/ccl/native_device_api/l0/primitives.hpp"
+#include "common/utils/sycl_utils.hpp"
 #endif
 
 namespace native {
 namespace detail {
-#if defined(MULTI_GPU_SUPPORT) && defined(CCL_ENABLE_SYCL)
+#if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL)
 
 size_t get_sycl_device_id(const cl::sycl::device& device) {
     if (!device.is_gpu()) {
@@ -37,7 +38,7 @@ size_t get_sycl_device_id(const cl::sycl::device& device) {
     size_t device_id = std::numeric_limits<size_t>::max();
     try {
         // extract native handle L0
-        auto l0_handle = device.template get_native<cl::sycl::backend::level_zero>();
+        auto l0_handle = sycl::get_native<ccl::utils::get_level_zero_backend()>(device);
 
         ze_device_properties_t device_properties;
         ze_result_t ret = zeDeviceGetProperties(l0_handle, &device_properties);
@@ -61,7 +62,7 @@ size_t get_sycl_subdevice_id(const cl::sycl::device& device) {
     size_t subdevice_id = std::numeric_limits<size_t>::max();
     try {
         // extract native handle L0
-        auto l0_handle = device.template get_native<cl::sycl::backend::level_zero>();
+        auto l0_handle = sycl::get_native<ccl::utils::get_level_zero_backend()>(device);
 
         ze_device_properties_t device_properties;
         ze_result_t ret = zeDeviceGetProperties(l0_handle, &device_properties);
diff --git a/src/native_device_api/l0/export.cpp b/src/native_device_api/l0/export.cpp
index 50c5f101f..4d2c4ee06 100644
--- a/src/native_device_api/l0/export.cpp
+++ b/src/native_device_api/l0/export.cpp
@@ -14,7 +14,7 @@
  limitations under the License.
 */
 #include "oneapi/ccl/config.h"
-#if defined(MULTI_GPU_SUPPORT) and !defined(CCL_ENABLE_SYCL)
+#if defined(CCL_ENABLE_ZE) and !defined(CCL_ENABLE_SYCL)
 
 #include "oneapi/ccl/native_device_api/l0/export.hpp"
 #include "common/log/log.hpp"
@@ -112,4 +112,4 @@ generic_platform_type<cl_backend_type::l0>::get() const noexcept {
     return native::get_platform();
 }
 } // namespace ccl
-#endif //MULTI_GPU_SUPPORT
+#endif //CCL_ENABLE_ZE
diff --git a/src/native_device_api/l0/utils.cpp b/src/native_device_api/l0/utils.cpp
index e5f2b6d6d..8878a2e30 100644
--- a/src/native_device_api/l0/utils.cpp
+++ b/src/native_device_api/l0/utils.cpp
@@ -16,16 +16,10 @@
 #include "oneapi/ccl/native_device_api/l0/utils.hpp"
 #include "common/log/log.hpp"
 
-#if defined(MULTI_GPU_SUPPORT)
+#if defined(CCL_ENABLE_ZE)
 #include "oneapi/ccl/native_device_api/l0/device.hpp"
 #include "oneapi/ccl/native_device_api/l0/context.hpp"
 
-#if defined(CCL_ENABLE_SYCL)
-//#ifdef CCL_ENABLE_SYCL
-#include <CL/sycl/backend/level_zero.hpp>
-//static cl::sycl::vector_class<cl::sycl::device> gpu_sycl_devices;
-#endif
-
 namespace native {
 namespace detail {
 
@@ -103,4 +97,4 @@ ccl::device_index_type deserialize_device_path(const uint8_t** data, size_t& siz
 
 } // namespace detail
 } // namespace native
-#endif //#if defined(MULTI_GPU_SUPPORT)
+#endif //#if defined(CCL_ENABLE_ZE)
diff --git a/src/native_device_api/sycl/export.cpp b/src/native_device_api/sycl/export.cpp
index 7c32c47e7..9ae90b75e 100644
--- a/src/native_device_api/sycl/export.cpp
+++ b/src/native_device_api/sycl/export.cpp
@@ -15,7 +15,7 @@
 */
 #include "oneapi/ccl/config.h"
 
-#if defined(CCL_ENABLE_SYCL) and !defined(MULTI_GPU_SUPPORT)
+#if defined(CCL_ENABLE_SYCL) and !defined(CCL_ENABLE_ZE)
 
 #include "oneapi/ccl/native_device_api/sycl/export.hpp"
 #include "oneapi/ccl/type_traits.hpp"
@@ -66,7 +66,7 @@ CCL_BE_API generic_device_type<cl_backend_type::dpcpp_sycl>::generic_device_type
         std::find_if(platforms.begin(), platforms.end(), [](const cl::sycl::platform& pl) {
             return pl.get_info<cl::sycl::info::platform::name>().find("Level-Zero") !=
                    std::string::npos;
-            //or platform.get_backend() == cl::sycl::backend::level_zero
+            //or platform.get_backend() == cl::sycl::backend::ext_oneapi_level_zero
         });
     if (platform_it == platforms.end()) {
         std::stringstream ss;
@@ -181,4 +181,4 @@ generic_platform_type<cl_backend_type::dpcpp_sycl>::get() const noexcept {
     return platform;
 }
 } // namespace ccl
-#endif //CCL_ENABLE_SYCL and !defined(MULTI_GPU_SUPPORT)
+#endif //CCL_ENABLE_SYCL and !defined(CCL_ENABLE_ZE)
diff --git a/src/native_device_api/sycl_l0/export.cpp b/src/native_device_api/sycl_l0/export.cpp
index f27a87dbc..1d635febd 100644
--- a/src/native_device_api/sycl_l0/export.cpp
+++ b/src/native_device_api/sycl_l0/export.cpp
@@ -14,7 +14,7 @@
  limitations under the License.
 */
 #include "oneapi/ccl/config.h"
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
 
 #include "oneapi/ccl/native_device_api/sycl_l0/export.hpp"
 #include "oneapi/ccl/type_traits.hpp"
@@ -66,7 +66,7 @@ generic_device_type<cl_backend_type::dpcpp_sycl_l0>::generic_device_type(
         std::find_if(platforms.begin(), platforms.end(), [](const cl::sycl::platform& pl) {
             return pl.get_info<cl::sycl::info::platform::name>().find("Level-Zero") !=
                    std::string::npos;
-            //or platform.get_backend() == cl::sycl::backend::level_zero
+            //or platform.get_backend() == cl::sycl::backend::ext_oneapi_level_zero
         });
     if (platform_it == platforms.end()) {
         std::stringstream ss;
@@ -181,4 +181,4 @@ generic_platform_type<cl_backend_type::dpcpp_sycl_l0>::get() const noexcept {
     return platform;
 }
 } // namespace ccl
-#endif //#if defined(CCL_ENABLE_SYCL) && defined (MULTI_GPU_SUPPORT)
+#endif //#if defined(CCL_ENABLE_SYCL) && defined (CCL_ENABLE_ZE)
diff --git a/src/parallelizer/parallelizer.cpp b/src/parallelizer/parallelizer.cpp
index 6c1483a2b..2b1228184 100644
--- a/src/parallelizer/parallelizer.cpp
+++ b/src/parallelizer/parallelizer.cpp
@@ -113,7 +113,7 @@ ccl::status ccl_parallelizer::process_deps(ccl_master_sched* sched) {
     }
     sched->sync_partial_scheds();
 
-    entry_factory::make_entry<deps_entry>(deps_sched);
+    entry_factory::create<deps_entry>(deps_sched);
     deps_sched->add_barrier();
 
     return ccl::status::success;
@@ -137,10 +137,8 @@ ccl::status ccl_parallelizer::process_pre_post_copies(ccl_master_sched* sched) {
     sched->get_pre_post_copy_counts(d2h_counts, h2d_counts, reuse_buffers);
 
     if ((coll_type == ccl_coll_allgatherv) &&
-        coll_param.is_inplace(ccl_coll_param::buf_type::device)) {
-        CCL_THROW_IF_NOT(coll_param.device_recv_bufs.size() == 1,
-                         "unexpected device_recv_bufs.size ",
-                         coll_param.device_recv_bufs.size());
+        coll_param.is_inplace(ccl_coll_param::buf_type::device) &&
+        (coll_param.device_recv_bufs.size() == 1)) {
         device_in_buf_offset = std::accumulate(
             coll_param.recv_counts.begin(), coll_param.recv_counts.begin() + my_rank, 0);
         LOG_TRACE("device_in_buf_offset = ", device_in_buf_offset);
@@ -160,7 +158,7 @@ ccl::status ccl_parallelizer::process_pre_post_copies(ccl_master_sched* sched) {
             size_t count = d2h_counts[idx];
             size_t bytes = count * dtype_size;
 
-            entry_factory::make_entry<copy_entry>(
+            entry_factory::create<copy_entry>(
                 part_scheds[sched_idx].get(),
                 ccl_buffer(coll_param.get_send_buf_ptr(idx, ccl_coll_param::buf_type::device),
                            bytes,
@@ -183,7 +181,7 @@ ccl::status ccl_parallelizer::process_pre_post_copies(ccl_master_sched* sched) {
             size_t count = h2d_counts[idx];
             size_t bytes = count * dtype_size;
 
-            entry_factory::make_entry<copy_entry>(
+            entry_factory::create<copy_entry>(
                 part_scheds[sched_idx].get(),
                 ccl_buffer(coll_param.get_recv_buf(idx), bytes),
                 ccl_buffer(coll_param.get_recv_buf_ptr(idx, ccl_coll_param::buf_type::device),
@@ -213,7 +211,7 @@ ccl::status ccl_parallelizer::process_output_event(ccl_master_sched* sched) {
     }
     sched->sync_partial_scheds();
 
-    entry_factory::make_entry<ze_event_signal_entry>(part_scheds[0].get(), sched);
+    entry_factory::create<ze_event_signal_entry>(part_scheds[0].get(), sched);
 
     return ccl::status::success;
 }
@@ -286,7 +284,7 @@ ccl::status ccl_parallelizer::process_base(ccl_master_sched* sched) {
                 if (part_count < max_data_partition_count)
                     part_count = max_data_partition_count;
             }
-            if (ccl_is_topo_ring_algo(selector_param)) {
+            if (ccl_is_device_side_algo(selector_param)) {
                 part_count = 1;
             }
             break;
@@ -311,9 +309,11 @@ ccl::status ccl_parallelizer::process_base(ccl_master_sched* sched) {
         case ccl_coll_allgatherv:
             selector_param.recv_counts = coll_param.recv_counts.data();
             algo.allgatherv = data.algorithm_selector->get<ccl_coll_allgatherv>(selector_param);
+
             if (algo.allgatherv == ccl_coll_allgatherv_direct ||
                 algo.allgatherv == ccl_coll_allgatherv_naive ||
-                algo.allgatherv == ccl_coll_allgatherv_ring) {
+                algo.allgatherv == ccl_coll_allgatherv_ring ||
+                ccl_is_device_side_algo(selector_param)) {
                 part_count = 1;
             }
             else if (algo.allgatherv == ccl_coll_allgatherv_multi_bcast ||
@@ -321,11 +321,13 @@ ccl::status ccl_parallelizer::process_base(ccl_master_sched* sched) {
                 part_count = comm_size;
                 ag_recv_bufs.resize(comm_size);
                 if (algo.allgatherv == ccl_coll_allgatherv_multi_bcast) {
-                    selector_param.ctype = ccl_coll_bcast;
-                    selector_param.count = sched->coll_param.get_send_count();
-                    selector_param.dtype = dtype;
+                    ccl_selector_param bcast_selector_param;
+                    bcast_selector_param.ctype = ccl_coll_bcast;
+                    bcast_selector_param.count = selector_param.count;
+                    bcast_selector_param.dtype = selector_param.dtype;
+                    bcast_selector_param.comm = selector_param.comm;
                     internal_algo.bcast =
-                        data.algorithm_selector->get<ccl_coll_bcast>(selector_param);
+                        data.algorithm_selector->get<ccl_coll_bcast>(bcast_selector_param);
                     if (internal_algo.bcast == ccl_coll_bcast_direct) {
                         /*
                             group all direct bcasts for specific worker together into single schedule w/o
@@ -415,9 +417,11 @@ ccl::status ccl_parallelizer::process_base(ccl_master_sched* sched) {
         case ccl_coll_allgatherv:
             counts[0] = coll_param.get_recv_count(0);
             offsets[0] = 0;
+            selector_param.recv_counts = coll_param.recv_counts.data();
             if (algo.allgatherv == ccl_coll_allgatherv_direct ||
                 algo.allgatherv == ccl_coll_allgatherv_naive ||
-                algo.allgatherv == ccl_coll_allgatherv_ring) {
+                algo.allgatherv == ccl_coll_allgatherv_ring ||
+                ccl_is_device_side_algo(selector_param)) {
             }
             else {
                 for (idx = 1; idx < comm_size; idx++) {
@@ -456,6 +460,7 @@ ccl::status ccl_parallelizer::process_base(ccl_master_sched* sched) {
                 param.dtype = dtype;
                 param.root = coll_param.root;
                 param.comm = comm;
+                param.stream = coll_param.stream;
                 coll_entry_helper::add_coll_entry<ccl_coll_bcast>(part_scheds[idx].get(), param);
             }
             break;
@@ -504,6 +509,7 @@ ccl::status ccl_parallelizer::process_base(ccl_master_sched* sched) {
                 param.dtype = dtype;
                 param.reduction = coll_param.reduction;
                 param.comm = comm;
+                param.stream = coll_param.stream;
                 coll_entry_helper::add_coll_entry<ccl_coll_reduce_scatter>(part_scheds[idx].get(),
                                                                            param);
             }
@@ -519,7 +525,7 @@ ccl::status ccl_parallelizer::process_base(ccl_master_sched* sched) {
                                .get_ptr();
                 main_ctx->part_idx = 0;
                 main_ctx->part_count = 1;
-                entry_factory::make_entry<prologue_entry>(
+                entry_factory::create<prologue_entry>(
                     part_scheds[0].get(),
                     coll_attr.prologue_fn,
                     ccl_buffer(coll_param.get_send_buf_ptr(),
@@ -543,7 +549,7 @@ ccl::status ccl_parallelizer::process_base(ccl_master_sched* sched) {
                     part_ctx->part_idx = idx;
                     part_ctx->part_count = part_count;
 
-                    entry_factory::make_entry<copy_entry>(
+                    entry_factory::create<copy_entry>(
                         part_scheds[idx].get(),
                         ccl_buffer(main_ctx, sizeof(ccl_parallelizer_prologue_ctx)),
                         ccl_buffer(part_ctx, sizeof(ccl_parallelizer_prologue_ctx)),
@@ -596,7 +602,7 @@ ccl::status ccl_parallelizer::process_base(ccl_master_sched* sched) {
             if (coll_attr.prologue_fn && !coll_attr.epilogue_fn) {
                 sched->sync_partial_scheds();
 
-                auto entry = entry_factory::make_entry<copy_entry>(
+                auto entry = entry_factory::create<copy_entry>(
                     part_scheds[0].get(),
                     ccl_buffer(), /* in_buf */
                     ccl_buffer(coll_param.get_recv_buf_ptr(),
@@ -614,7 +620,7 @@ ccl::status ccl_parallelizer::process_base(ccl_master_sched* sched) {
 
             if (coll_attr.epilogue_fn) {
                 sched->sync_partial_scheds();
-                auto entry = entry_factory::make_entry<epilogue_entry>(
+                auto entry = entry_factory::create<epilogue_entry>(
                     part_scheds[0].get(),
                     coll_attr.epilogue_fn,
                     ccl_buffer(coll_param.get_recv_buf_ptr(),
@@ -640,9 +646,11 @@ ccl::status ccl_parallelizer::process_base(ccl_master_sched* sched) {
         }
 
         case ccl_coll_allgatherv: {
+            selector_param.recv_counts = coll_param.recv_counts.data();
             if (algo.allgatherv == ccl_coll_allgatherv_direct ||
                 algo.allgatherv == ccl_coll_allgatherv_naive ||
-                algo.allgatherv == ccl_coll_allgatherv_ring) {
+                algo.allgatherv == ccl_coll_allgatherv_ring ||
+                ccl_is_device_side_algo(selector_param)) {
                 ccl_coll_entry_param param{};
                 param.ctype = ccl_coll_allgatherv;
                 param.send_buf = ccl_buffer(coll_param.get_send_buf_ptr(),
@@ -654,6 +662,7 @@ ccl::status ccl_parallelizer::process_base(ccl_master_sched* sched) {
                 param.recv_counts = coll_param.recv_counts.data();
                 param.dtype = dtype;
                 param.comm = comm;
+                param.stream = coll_param.stream;
                 coll_entry_helper::add_coll_entry<ccl_coll_allgatherv>(part_scheds[0].get(), param);
             }
             else {
@@ -757,7 +766,7 @@ ccl::status ccl_parallelizer::process_base(ccl_master_sched* sched) {
 
                 sched->sync_partial_scheds();
 
-                auto entry = entry_factory::make_entry<sparse_allreduce_completion_entry>(
+                auto entry = entry_factory::create<sparse_allreduce_completion_entry>(
                     part_scheds[0].get(),
                     coll_attr.sparse_allreduce_completion_fn,
                     coll_attr.sparse_allreduce_fn_ctx,
diff --git a/src/sched/buffer_cache.cpp b/src/sched/buffer/buffer_cache.cpp
similarity index 99%
rename from src/sched/buffer_cache.cpp
rename to src/sched/buffer/buffer_cache.cpp
index f53a77c57..c917e61bd 100644
--- a/src/sched/buffer_cache.cpp
+++ b/src/sched/buffer/buffer_cache.cpp
@@ -14,7 +14,7 @@
  limitations under the License.
 */
 #include "common/global/global.hpp"
-#include "sched/buffer_cache.hpp"
+#include "sched/buffer/buffer_cache.hpp"
 
 namespace ccl {
 
diff --git a/src/sched/buffer_cache.hpp b/src/sched/buffer/buffer_cache.hpp
similarity index 100%
rename from src/sched/buffer_cache.hpp
rename to src/sched/buffer/buffer_cache.hpp
diff --git a/src/sched/buffer/buffer_manager.cpp b/src/sched/buffer/buffer_manager.cpp
new file mode 100644
index 000000000..cf1e88507
--- /dev/null
+++ b/src/sched/buffer/buffer_manager.cpp
@@ -0,0 +1,199 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "common/global/global.hpp"
+#include "sched/buffer/buffer_cache.hpp"
+#include "sched/buffer/buffer_manager.hpp"
+
+#ifdef CCL_ENABLE_ZE
+#include "sched/entry/ze/ze_cache.hpp"
+#endif // CCL_ENABLE_ZE
+
+namespace ccl {
+
+std::string to_string(buffer_type type) { // name of a buffer_type for log/error messages
+    switch (type) {
+        case buffer_type::regular: return "regular";
+        case buffer_type::sycl: return "sycl";
+        case buffer_type::ze: return "ze";
+        default: return "unknown"; // buffer_type::unknown and any out-of-range value
+    }
+}
+
+std::string to_string(buffer_place place) { // name of a buffer_place for log/error messages
+    switch (place) {
+        case buffer_place::host: return "host";
+        case buffer_place::device: return "device";
+        case buffer_place::shared: return "shared";
+        default: return "unknown"; // buffer_place::unknown and any out-of-range value
+    }
+}
+
+std::string alloc_param::to_string() const { // "{ key: value, ... }" dump of this request
+    std::stringstream ss; // ccl:: below: this member to_string hides the free overloads
+
+    ss << "{ bytes: " << bytes << ", type: " << ccl::to_string(buf_type)
+       << ", place: " << ccl::to_string(buf_place) << ", is_managed: " << is_managed;
+
+    if (stream) { // stream is optional; printed only when set
+        ss << ", stream: " << stream->to_string();
+    }
+
+#ifdef CCL_ENABLE_SYCL
+    if (hint_ptr) { // printed only in SYCL builds and only when non-null
+        ss << ", hint_ptr: " << hint_ptr;
+    }
+#endif // CCL_ENABLE_SYCL
+
+    ss << "}";
+
+    return ss.str();
+}
+
+std::string dealloc_param::to_string() const { // "{ key: value, ... }" dump of this request
+    std::stringstream ss; // ccl:: below: this member to_string hides the free overloads
+
+    ss << "{ ptr: " << ptr << ", bytes: " << bytes << ", type: " << ccl::to_string(buf_type);
+
+    if (stream) { // stream is optional; printed only when set
+        ss << ", stream: " << stream->to_string();
+    }
+
+    ss << "}";
+
+    return ss.str();
+}
+
+buffer_manager::~buffer_manager() { // hands every still-tracked buffer back to its cache
+    clear(); // reads global_data::get(), so global data must still be usable at this point
+}
+
+void buffer_manager::clear() { // pushes all tracked buffers to their caches, empties the lists
+    for (auto it = regular_buffers.begin(); it != regular_buffers.end(); it++) {
+        global_data::get().buffer_cache->push(instance_idx, it->bytes, it->ptr);
+    }
+    regular_buffers.clear(); // entries were handed to buffer_cache above; stop tracking them
+
+#ifdef CCL_ENABLE_SYCL
+    for (auto it = sycl_buffers.begin(); it != sycl_buffers.end(); it++) {
+        global_data::get().buffer_cache->push(instance_idx, it->bytes, it->ctx, it->ptr);
+    }
+    sycl_buffers.clear(); // same as above, for the context-keyed SYCL overload of push()
+#endif // CCL_ENABLE_SYCL
+
+#ifdef CCL_ENABLE_ZE
+    for (auto it = ze_buffers.begin(); it != ze_buffers.end(); it++) {
+        global_data::get().ze_cache->push(instance_idx,
+                                          it->ctx,
+                                          it->dev,
+                                          ze::default_device_mem_alloc_desc, // same desc as alloc()
+                                          it->bytes,
+                                          0, // same value alloc() passes; presumably alignment - confirm
+                                          it->ptr);
+    }
+    ze_buffers.clear(); // entries were handed to ze_cache above; stop tracking them
+#endif // CCL_ENABLE_ZE
+}
+
+void* buffer_manager::alloc(const alloc_param& param) { // gets a buffer from the matching cache
+    LOG_DEBUG("{ idx: ", instance_idx, ", param: ", param.to_string(), " }");
+
+    void* ptr{}; // stays null when no branch below handles param.buf_type
+    size_t bytes = param.bytes;
+
+    CCL_THROW_IF_NOT(bytes > 0, "unexpected request to allocate zero size buffer");
+    CCL_THROW_IF_NOT(
+        param.buf_type != buffer_type::unknown, "unexpected buf_type ", to_string(param.buf_type));
+    CCL_THROW_IF_NOT(param.buf_place != buffer_place::unknown,
+                     "unexpected buf_place ",
+                     to_string(param.buf_place));
+
+    if (param.buf_type == buffer_type::regular) { // only host placement is accepted here
+        CCL_THROW_IF_NOT(param.buf_place == buffer_place::host,
+                         "unexpected buf_place ",
+                         to_string(param.buf_place));
+        global_data::get().buffer_cache->get(instance_idx, bytes, &ptr);
+        if (param.is_managed) { // tracked so clear() can hand it back to the cache
+            regular_buffers.emplace_back(ptr, bytes);
+        }
+    }
+#ifdef CCL_ENABLE_SYCL
+    else if (param.buf_type == buffer_type::sycl) { // host placement, keyed by SYCL context
+        CCL_THROW_IF_NOT(param.buf_place == buffer_place::host,
+                         "unexpected buf_place ",
+                         to_string(param.buf_place));
+        CCL_THROW_IF_NOT(param.stream, "null stream"); // the context is taken from the stream
+        sycl::context sycl_ctx = param.stream->get_native_stream().get_context();
+        global_data::get().buffer_cache->get(instance_idx, bytes, sycl_ctx, &ptr);
+        if (param.is_managed) { // tracked with its context so clear() can hand it back
+            sycl_buffers.emplace_back(ptr, bytes, sycl_ctx);
+        }
+    }
+#endif // CCL_ENABLE_SYCL
+#ifdef CCL_ENABLE_ZE
+    else if (param.buf_type == buffer_type::ze) { // only device placement is accepted here
+        CCL_THROW_IF_NOT(param.buf_place == buffer_place::device,
+                         "unexpected buf_place ",
+                         to_string(param.buf_place));
+        CCL_THROW_IF_NOT(param.stream, "null stream"); // context and device come from the stream
+
+        auto context = param.stream->get_ze_context();
+        auto device = param.stream->get_ze_device();
+        global_data::get().ze_cache->get(
+            instance_idx, context, device, ze::default_device_mem_alloc_desc, bytes, 0, &ptr);
+        if (param.is_managed) { // tracked with context/device so clear() can hand it back
+            ze_buffers.emplace_back(ptr, bytes, context, device);
+        }
+    }
+#endif // CCL_ENABLE_ZE
+
+    CCL_THROW_IF_NOT(ptr, "null pointer"); // also fires when buf_type's backend is compiled out
+
+    return ptr;
+}
+
+void buffer_manager::dealloc(const dealloc_param& param) { // pushes a buffer to its cache
+    LOG_DEBUG("{ idx: ", instance_idx, ", param: ", param.to_string(), " }");
+
+    void* ptr = param.ptr; // NOTE(review): not erased from the managed lists here - confirm
+    size_t bytes = param.bytes;
+
+    CCL_THROW_IF_NOT(ptr, "unexpected request to deallocate null ptr");
+    CCL_THROW_IF_NOT(bytes > 0, "unexpected request to deallocate zero size buffer");
+    CCL_THROW_IF_NOT(
+        param.buf_type != buffer_type::unknown, "unexpected buf_type ", to_string(param.buf_type));
+
+    if (param.buf_type == buffer_type::regular) { // no stream needed for regular buffers
+        global_data::get().buffer_cache->push(instance_idx, bytes, ptr);
+    }
+#ifdef CCL_ENABLE_SYCL
+    else if (param.buf_type == buffer_type::sycl) {
+        CCL_THROW_IF_NOT(param.stream, "null stream"); // the context is taken from the stream
+        sycl::context sycl_ctx = param.stream->get_native_stream().get_context();
+        ccl::global_data::get().buffer_cache->push(instance_idx, bytes, sycl_ctx, ptr);
+    }
+#endif // CCL_ENABLE_SYCL
+#ifdef CCL_ENABLE_ZE
+    else if (param.buf_type == buffer_type::ze) {
+        CCL_THROW_IF_NOT(param.stream, "null stream"); // context and device come from the stream
+        auto context = param.stream->get_ze_context();
+        auto device = param.stream->get_ze_device();
+        global_data::get().ze_cache->push(
+            instance_idx, context, device, ze::default_device_mem_alloc_desc, bytes, 0, ptr);
+    }
+#endif // CCL_ENABLE_ZE
+} // a buf_type whose backend is compiled out matches no branch and is silently ignored
+
+} // namespace ccl
diff --git a/src/sched/buffer/buffer_manager.hpp b/src/sched/buffer/buffer_manager.hpp
new file mode 100644
index 000000000..7a74c10e1
--- /dev/null
+++ b/src/sched/buffer/buffer_manager.hpp
@@ -0,0 +1,178 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#ifdef CCL_ENABLE_ZE
+#include <ze_api.h>
+#endif // CCL_ENABLE_ZE
+#ifdef CCL_ENABLE_SYCL
+#include <CL/sycl.hpp>
+#endif // CCL_ENABLE_SYCL
+#include <cstddef>
+#include <list>
+#include <string>
+
+#include "common/stream/stream.hpp"
+#include "common/utils/buffer.hpp"
+
+namespace ccl {
+
+// Kind of buffer; buffer_manager::alloc() and dealloc() dispatch on this value.
+enum class buffer_type : int { regular, sycl, ze, unknown };
+
+// Requested memory placement of a buffer.
+enum class buffer_place : int { host, device, shared, unknown };
+
+// String form of the enumerators; dealloc() uses it in an error message.
+std::string to_string(buffer_type type);
+std::string to_string(buffer_place place);
+
+// Describes one allocation request passed to buffer_manager::alloc().
+struct alloc_param {
+    size_t bytes; // requested size in bytes
+    buffer_type buf_type; // which kind of buffer to allocate
+    buffer_place buf_place; // requested placement
+    bool is_managed; // when true, alloc() records the buffer in a manager list
+    ccl_stream* stream; // alloc() reads the ZE context/device from it for ze buffers
+    void* hint_ptr; // optional pointer hint; NOTE(review): its use is not visible here
+
+    alloc_param() = delete;
+
+    // Default type/place are regular/host without CCL_ENABLE_SYCL and
+    // unknown/unknown with it.
+    alloc_param(size_t bytes,
+#ifdef CCL_ENABLE_SYCL
+                buffer_type buf_type = buffer_type::unknown,
+                buffer_place buf_place = buffer_place::unknown,
+#else // CCL_ENABLE_SYCL
+                buffer_type buf_type = buffer_type::regular,
+                buffer_place buf_place = buffer_place::host,
+#endif // CCL_ENABLE_SYCL
+                bool is_managed = true,
+                ccl_stream* stream = nullptr,
+                void* hint_ptr = nullptr)
+            : bytes(bytes),
+              buf_type(buf_type),
+              buf_place(buf_place),
+              is_managed(is_managed),
+              stream(stream),
+              hint_ptr(hint_ptr) {
+    }
+
+    // Same defaults as the constructor above, with hint_ptr set to ptr.
+    alloc_param(size_t bytes, void* ptr) : alloc_param(bytes) {
+        hint_ptr = ptr;
+    }
+
+    // Convenience overload: uses the pointer held by buf as the hint.
+    alloc_param(size_t bytes, ccl_buffer buf) : alloc_param(bytes, buf.get_ptr()) {}
+
+    std::string to_string() const;
+};
+
+// Describes one release request passed to buffer_manager::dealloc().
+struct dealloc_param {
+    void* ptr; // buffer to release; dealloc() throws if null
+    size_t bytes; // buffer size; dealloc() throws if zero
+    buffer_type buf_type; // dealloc() throws if unknown
+    ccl_stream* stream; // dealloc() requires it for sycl and ze buffers
+
+    dealloc_param() = delete;
+
+    dealloc_param(void* ptr, size_t bytes, buffer_type buf_type, ccl_stream* stream = nullptr)
+            : ptr(ptr),
+              bytes(bytes),
+              buf_type(buf_type),
+              stream(stream) {}
+
+    std::string to_string() const;
+};
+
+// Bookkeeping record (pointer and size) for a buffer tracked by buffer_manager.
+struct buffer_desc {
+    void* ptr;
+    size_t bytes;
+
+    buffer_desc() = delete;
+    buffer_desc(void* ptr, size_t bytes) : ptr(ptr), bytes(bytes) {}
+};
+
+#ifdef CCL_ENABLE_SYCL
+// buffer_desc plus the SYCL context associated with the buffer.
+struct sycl_buffer_desc : public buffer_desc {
+    const sycl::context ctx;
+
+    sycl_buffer_desc() = delete;
+    sycl_buffer_desc(void* ptr, size_t bytes, const sycl::context& ctx)
+            : buffer_desc(ptr, bytes),
+              ctx(ctx) {}
+};
+#endif // CCL_ENABLE_SYCL
+
+#ifdef CCL_ENABLE_ZE
+// buffer_desc plus the Level Zero context and device associated with the buffer.
+struct ze_buffer_desc : public buffer_desc {
+    const ze_context_handle_t ctx;
+    const ze_device_handle_t dev;
+
+    ze_buffer_desc() = delete;
+    ze_buffer_desc(void* ptr,
+                   size_t bytes,
+                   const ze_context_handle_t& ctx,
+                   const ze_device_handle_t& dev)
+            : buffer_desc(ptr, bytes),
+              ctx(ctx),
+              dev(dev) {}
+};
+#endif // CCL_ENABLE_ZE
+
+// Hands out and takes back buffers via the caches held in global_data, passing its
+// instance index to every cache call. Copying is disabled.
+class buffer_manager {
+public:
+    buffer_manager() = default;
+    buffer_manager(const buffer_manager&) = delete;
+    buffer_manager& operator=(const buffer_manager&) = delete;
+    ~buffer_manager();
+
+    // Sets the instance index that alloc()/dealloc() pass to the caches.
+    void init(int idx) {
+        instance_idx = idx;
+    }
+
+    void clear();
+
+    // Returns a non-null buffer or throws.
+    void* alloc(const alloc_param& param);
+    // Returns the buffer to its cache; throws on null ptr, zero size or unknown type.
+    void dealloc(const dealloc_param& param);
+
+private:
+    size_t instance_idx{};
+
+    // Tracked buffers, one list per type (alloc() appends managed ze buffers to ze_buffers).
+    std::list<buffer_desc> regular_buffers;
+
+#ifdef CCL_ENABLE_SYCL
+    std::list<sycl_buffer_desc> sycl_buffers;
+#endif // CCL_ENABLE_SYCL
+
+#ifdef CCL_ENABLE_ZE
+    std::list<ze_buffer_desc> ze_buffers;
+#endif // CCL_ENABLE_ZE
+};
+
+} // namespace ccl
diff --git a/src/sched/cache/key.cpp b/src/sched/cache/key.cpp
index 6f232d42f..3ea84b0f4 100644
--- a/src/sched/cache/key.cpp
+++ b/src/sched/cache/key.cpp
@@ -90,7 +90,7 @@ bool ccl_sched_key::check(const ccl_coll_param& param, const ccl_coll_attr& attr
 
     result &= (attr.prologue_fn == f.prologue_fn || attr.epilogue_fn == f.epilogue_fn ||
                attr.reduction_fn == f.reduction_fn || param.ctype == f.ctype ||
-               param.dtype.idx() == f.dtype || param.comm == f.comm);
+               param.dtype == f.dtype || param.comm == f.comm);
 
     switch (f.ctype) {
         case ccl_coll_allgatherv:
diff --git a/src/sched/cache/key.hpp b/src/sched/cache/key.hpp
index e8dae9a40..8fd6161f8 100644
--- a/src/sched/cache/key.hpp
+++ b/src/sched/cache/key.hpp
@@ -57,7 +57,7 @@ class ccl_sched_key {
     bool has_hasher_result = false;
 
     struct ccl_sched_key_inner_fields {
-        ccl_coll_type ctype = ccl_coll_internal;
+        ccl_coll_type ctype = ccl_coll_undefined;
         void* buf1 = nullptr; /* non-data buffer which can be used for caching */
         void* buf2 = nullptr; /* non-data buffer which can be used for caching */
         ccl::datatype dtype = ccl::datatype::int8;
diff --git a/src/sched/entry/chain_call_entry.hpp b/src/sched/entry/chain_call_entry.hpp
index 76ad23949..57af2cf5e 100644
--- a/src/sched/entry/chain_call_entry.hpp
+++ b/src/sched/entry/chain_call_entry.hpp
@@ -35,8 +35,8 @@ class chain_call_entry : public sched_entry {
         if (name) {
             LOG_DEBUG("entry object name: ", name);
         }
-        work_sched.reset(new ccl_extra_sched(sched->coll_param, sched->sched_id));
-        work_sched->coll_param.ctype = ccl_coll_internal;
+        work_sched.reset(new ccl_extra_sched({ sched->sched_id, sched->coll_param }));
+        work_sched->coll_param.ctype = ccl_coll_undefined;
         sched_fill_function(work_sched.get());
     }
 
diff --git a/src/sched/entry/coll/coll_entry.cpp b/src/sched/entry/coll/coll_entry.cpp
index 14467a06c..1a53ad04b 100644
--- a/src/sched/entry/coll/coll_entry.cpp
+++ b/src/sched/entry/coll/coll_entry.cpp
@@ -27,7 +27,7 @@ void coll_entry::start() {
         coll_param.ctype = sched->coll_param.ctype;
         coll_param.comm = sched->coll_param.comm;
         coll_param.stream = sched->coll_param.stream;
-        coll_sched.reset(new ccl_extra_sched(coll_param, sched->sched_id));
+        coll_sched.reset(new ccl_extra_sched({ sched->sched_id, coll_param }, sched->master_sched));
         coll_sched->set_op_id(coll_sched_op_id);
 
         auto res = coll_entry_helper::build_schedule(coll_sched.get(), sched, param);
diff --git a/src/sched/entry/coll/coll_entry_helper.cpp b/src/sched/entry/coll/coll_entry_helper.cpp
index b878d73f8..3d39e2859 100644
--- a/src/sched/entry/coll/coll_entry_helper.cpp
+++ b/src/sched/entry/coll/coll_entry_helper.cpp
@@ -14,25 +14,14 @@
  limitations under the License.
 */
 #include "sched/entry/coll/coll_entry_helper.hpp"
+#include "sched/entry/subsched_entry.hpp"
 
 ccl::status coll_entry_helper::build_schedule(ccl_sched* sched,
                                               const ccl_sched* parent_sched,
                                               const ccl_coll_entry_param& param) {
     ccl::status res = ccl::status::success;
 
-    if (param.ctype == ccl_coll_allreduce || param.ctype == ccl_coll_reduce ||
-        param.ctype == ccl_coll_reduce_scatter) {
-        if (sched != parent_sched) {
-            sched->coll_attr.reduction_fn = parent_sched->coll_attr.reduction_fn;
-            /* required to create ccl_fn_context in reduce/recv_reduce entries */
-            sched->coll_attr.match_id = parent_sched->coll_attr.match_id;
-        }
-    }
-    sched->coll_attr.to_cache = parent_sched->coll_attr.to_cache;
-
-#ifdef CCL_ENABLE_SYCL
-    sched->coll_attr.is_sycl_buf = parent_sched->coll_attr.is_sycl_buf;
-#endif // CCL_ENABLE_SYCL
+    subsched_entry::inherit_params(sched, parent_sched, param.ctype);
 
     sched->hint_algo = param.hint_algo;
 
diff --git a/src/sched/entry/coll/coll_entry_helper.hpp b/src/sched/entry/coll/coll_entry_helper.hpp
index f1d8860f4..837d450d9 100644
--- a/src/sched/entry/coll/coll_entry_helper.hpp
+++ b/src/sched/entry/coll/coll_entry_helper.hpp
@@ -37,13 +37,14 @@ class coll_entry_helper {
         selector_param.dtype = param.dtype;
         selector_param.comm = param.comm;
         selector_param.stream = param.stream;
+        selector_param.buf = (param.send_buf) ? param.send_buf.get_ptr() : param.recv_buf.get_ptr();
         selector_param.is_vector_buf = sched->coll_attr.is_vector_buf;
 #ifdef CCL_ENABLE_SYCL
         selector_param.is_sycl_buf = sched->coll_attr.is_sycl_buf;
 #endif // CCL_ENABLE_SYCL
         selector_param.hint_algo = param.hint_algo;
 
-        if (ccl_is_topo_ring_algo(selector_param)) {
+        if (ccl_is_device_side_algo(selector_param)) {
             sched->strict_order = true;
         }
 
@@ -65,7 +66,7 @@ class coll_entry_helper {
         }
 
         /* for remaining cases use regular coll_entry to get schedule filling offload */
-        return entry_factory::make_entry<coll_entry>(sched, param);
+        return entry_factory::create<coll_entry>(sched, param);
     }
 
     static ccl::status build_schedule(ccl_sched* sched,
diff --git a/src/sched/entry/coll/coll_entry_param.hpp b/src/sched/entry/coll/coll_entry_param.hpp
index 88db2aea5..b0bf58ac8 100644
--- a/src/sched/entry/coll/coll_entry_param.hpp
+++ b/src/sched/entry/coll/coll_entry_param.hpp
@@ -18,17 +18,17 @@
 #include "coll/coll.hpp"
 
 struct ccl_coll_entry_param {
-    ccl_coll_type ctype;
-    ccl_buffer send_buf;
-    ccl_buffer recv_buf;
-    size_t count;
-    size_t send_count;
-    const size_t* send_counts;
-    const size_t* recv_counts;
-    ccl_datatype dtype;
-    ccl::reduction reduction;
-    int root;
-    ccl_comm* comm;
-    ccl_stream* stream;
-    ccl_coll_algo hint_algo;
+    ccl_coll_type ctype{ ccl_coll_last_value };
+    ccl_buffer send_buf{};
+    ccl_buffer recv_buf{};
+    size_t count{};
+    size_t send_count{};
+    const size_t* send_counts{};
+    const size_t* recv_counts{};
+    ccl_datatype dtype{};
+    ccl::reduction reduction{ ccl::reduction::sum };
+    int root{};
+    ccl_comm* comm{};
+    ccl_stream* stream{};
+    ccl_coll_algo hint_algo{};
 };
diff --git a/src/sched/entry/coll/direct/allgatherv_entry.hpp b/src/sched/entry/coll/direct/allgatherv_entry.hpp
index bf9871c0c..03dfe5f4c 100644
--- a/src/sched/entry/coll/direct/allgatherv_entry.hpp
+++ b/src/sched/entry/coll/direct/allgatherv_entry.hpp
@@ -64,13 +64,13 @@ class allgatherv_entry : public base_coll_entry {
         }
 
         LOG_DEBUG("ALLGATHERV entry req ", &req, ", send_bytes ", send_bytes);
-        atl_status_t atl_status = comm->atl->atl_ep_allgatherv(sched->bin->get_atl_ep(),
-                                                               send_buf.get_ptr(send_bytes),
-                                                               send_bytes,
-                                                               recv_buf.get_ptr(sum_recv_bytes),
-                                                               recv_bytes,
-                                                               offsets,
-                                                               &req);
+        atl_status_t atl_status = comm->get_atl_comm()->allgatherv(sched->bin->get_atl_ep(),
+                                                                   send_buf.get_ptr(send_bytes),
+                                                                   send_bytes,
+                                                                   recv_buf.get_ptr(sum_recv_bytes),
+                                                                   recv_bytes,
+                                                                   offsets,
+                                                                   &req);
 
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("ALLGATHERV entry failed. atl_status: ", atl_status_to_str(atl_status));
@@ -80,15 +80,13 @@ class allgatherv_entry : public base_coll_entry {
     }
 
     void update() override {
-        int req_status;
-        atl_status_t atl_status =
-            comm->atl->atl_ep_check(sched->bin->get_atl_ep(), &req_status, &req);
+        atl_status_t atl_status = comm->get_atl_comm()->check(sched->bin->get_atl_ep(), &req);
 
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("ALLGATHERV entry failed. atl_status: ", atl_status_to_str(atl_status));
         }
 
-        if (req_status) {
+        if (req.is_completed) {
             status = ccl_sched_entry_status_complete;
         }
     }
diff --git a/src/sched/entry/coll/direct/allreduce_entry.hpp b/src/sched/entry/coll/direct/allreduce_entry.hpp
index 0def79110..fe6837c5b 100644
--- a/src/sched/entry/coll/direct/allreduce_entry.hpp
+++ b/src/sched/entry/coll/direct/allreduce_entry.hpp
@@ -46,13 +46,13 @@ class allreduce_entry : public base_coll_entry {
         size_t bytes = cnt * dtype.size();
         LOG_DEBUG("ALLREDUCE entry req: ", &req, ", cnt: ", cnt, ", bytes: ", bytes);
         atl_status_t atl_status =
-            comm->atl->atl_ep_allreduce(sched->bin->get_atl_ep(),
-                                        send_buf.get_ptr(bytes),
-                                        recv_buf.get_ptr(bytes),
-                                        cnt,
-                                        static_cast<atl_datatype_t>(dtype.idx()),
-                                        static_cast<atl_reduction_t>(op),
-                                        &req);
+            comm->get_atl_comm()->allreduce(sched->bin->get_atl_ep(),
+                                            send_buf.get_ptr(bytes),
+                                            recv_buf.get_ptr(bytes),
+                                            cnt,
+                                            static_cast<atl_datatype_t>(dtype.idx()),
+                                            static_cast<atl_reduction_t>(op),
+                                            &req);
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("ALLREDUCE entry failed. atl_status: ", atl_status_to_str(atl_status));
         }
@@ -61,15 +61,13 @@ class allreduce_entry : public base_coll_entry {
     }
 
     void update() override {
-        int req_status;
-        atl_status_t atl_status =
-            comm->atl->atl_ep_check(sched->bin->get_atl_ep(), &req_status, &req);
+        atl_status_t atl_status = comm->get_atl_comm()->check(sched->bin->get_atl_ep(), &req);
 
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("ALLREDUCE entry failed. atl_status: ", atl_status_to_str(atl_status));
         }
 
-        if (req_status)
+        if (req.is_completed)
             status = ccl_sched_entry_status_complete;
     }
 
diff --git a/src/sched/entry/coll/direct/alltoall_entry.hpp b/src/sched/entry/coll/direct/alltoall_entry.hpp
index c54fae098..9bc075109 100644
--- a/src/sched/entry/coll/direct/alltoall_entry.hpp
+++ b/src/sched/entry/coll/direct/alltoall_entry.hpp
@@ -45,11 +45,11 @@ class alltoall_entry : public base_coll_entry {
         bytes = cnt * dt_size;
 
         LOG_DEBUG("ALLTOALL entry req ", &req, ", bytes ", bytes);
-        atl_status_t atl_status = comm->atl->atl_ep_alltoall(sched->bin->get_atl_ep(),
-                                                             send_buf.get_ptr(bytes),
-                                                             recv_buf.get_ptr(bytes),
-                                                             bytes,
-                                                             &req);
+        atl_status_t atl_status = comm->get_atl_comm()->alltoall(sched->bin->get_atl_ep(),
+                                                                 send_buf.get_ptr(bytes),
+                                                                 recv_buf.get_ptr(bytes),
+                                                                 bytes,
+                                                                 &req);
 
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("ALLTOALL entry failed. atl_status: ", atl_status_to_str(atl_status));
@@ -59,15 +59,13 @@ class alltoall_entry : public base_coll_entry {
     }
 
     void update() override {
-        int req_status;
-        atl_status_t atl_status =
-            comm->atl->atl_ep_check(sched->bin->get_atl_ep(), &req_status, &req);
+        atl_status_t atl_status = comm->get_atl_comm()->check(sched->bin->get_atl_ep(), &req);
 
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("ALLTOALL entry failed. atl_status: ", atl_status_to_str(atl_status));
         }
 
-        if (req_status) {
+        if (req.is_completed) {
             status = ccl_sched_entry_status_complete;
         }
     }
diff --git a/src/sched/entry/coll/direct/alltoallv_entry.hpp b/src/sched/entry/coll/direct/alltoallv_entry.hpp
index 882a38e4b..0d3ccb69a 100644
--- a/src/sched/entry/coll/direct/alltoallv_entry.hpp
+++ b/src/sched/entry/coll/direct/alltoallv_entry.hpp
@@ -21,7 +21,7 @@
 class alltoallv_entry : public base_coll_entry {
 public:
     static constexpr const char* class_name() noexcept {
-        return "alltoallv";
+        return "ALLTOALLV";
     }
 
     alltoallv_entry() = delete;
@@ -79,14 +79,14 @@ class alltoallv_entry : public base_coll_entry {
 
         LOG_DEBUG("alltoallv entry req ", &req, ", sum_send_bytes ", sum_send_bytes);
 
-        atl_status_t atl_status = comm->atl->atl_ep_alltoallv(sched->bin->get_atl_ep(),
-                                                              send_buf.get_ptr(sum_send_bytes),
-                                                              send_bytes,
-                                                              send_offsets,
-                                                              recv_buf.get_ptr(sum_recv_bytes),
-                                                              recv_bytes,
-                                                              recv_offsets,
-                                                              &req);
+        atl_status_t atl_status = comm->get_atl_comm()->alltoallv(sched->bin->get_atl_ep(),
+                                                                  send_buf.get_ptr(sum_send_bytes),
+                                                                  send_bytes,
+                                                                  send_offsets,
+                                                                  recv_buf.get_ptr(sum_recv_bytes),
+                                                                  recv_bytes,
+                                                                  recv_offsets,
+                                                                  &req);
 
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("alltoallv entry failed. atl_status: ", atl_status_to_str(atl_status));
@@ -96,15 +96,13 @@ class alltoallv_entry : public base_coll_entry {
     }
 
     void update() override {
-        int req_status;
-        atl_status_t atl_status =
-            comm->atl->atl_ep_check(sched->bin->get_atl_ep(), &req_status, &req);
+        atl_status_t atl_status = comm->get_atl_comm()->check(sched->bin->get_atl_ep(), &req);
 
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("alltoallv entry failed. atl_status: ", atl_status_to_str(atl_status));
         }
 
-        if (req_status) {
+        if (req.is_completed) {
             status = ccl_sched_entry_status_complete;
         }
     }
diff --git a/src/sched/entry/coll/direct/barrier_entry.hpp b/src/sched/entry/coll/direct/barrier_entry.hpp
index a5fc457e3..e8fba0973 100644
--- a/src/sched/entry/coll/direct/barrier_entry.hpp
+++ b/src/sched/entry/coll/direct/barrier_entry.hpp
@@ -32,7 +32,7 @@ class barrier_entry : public base_coll_entry {
     void start() override {
         LOG_DEBUG("BARRIER entry req ", &req);
 
-        atl_status_t atl_status = comm->atl->atl_ep_barrier(sched->bin->get_atl_ep(), &req);
+        atl_status_t atl_status = comm->get_atl_comm()->barrier(sched->bin->get_atl_ep(), &req);
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("BARRIER entry failed. atl_status: ", atl_status_to_str(atl_status));
         }
@@ -41,15 +41,13 @@ class barrier_entry : public base_coll_entry {
     }
 
     void update() override {
-        int req_status;
-        atl_status_t atl_status =
-            comm->atl->atl_ep_check(sched->bin->get_atl_ep(), &req_status, &req);
+        atl_status_t atl_status = comm->get_atl_comm()->check(sched->bin->get_atl_ep(), &req);
 
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("BARRIER entry failed. atl_status: ", atl_status_to_str(atl_status));
         }
 
-        if (req_status)
+        if (req.is_completed)
             status = ccl_sched_entry_status_complete;
     }
 
diff --git a/src/sched/entry/coll/direct/bcast_entry.hpp b/src/sched/entry/coll/direct/bcast_entry.hpp
index d41d79b0b..3c867c833 100644
--- a/src/sched/entry/coll/direct/bcast_entry.hpp
+++ b/src/sched/entry/coll/direct/bcast_entry.hpp
@@ -44,7 +44,7 @@ class bcast_entry : public base_coll_entry {
         size_t bytes = cnt * dtype.size();
         LOG_DEBUG("BCAST entry req ", &req, ", bytes ", bytes);
 
-        atl_status_t atl_status = comm->atl->atl_ep_bcast(
+        atl_status_t atl_status = comm->get_atl_comm()->bcast(
             sched->bin->get_atl_ep(), buf.get_ptr(bytes), bytes, root, &req);
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("BCAST entry failed. atl_status: ", atl_status_to_str(atl_status));
@@ -54,15 +54,13 @@ class bcast_entry : public base_coll_entry {
     }
 
     void update() override {
-        int req_status;
-        atl_status_t atl_status =
-            comm->atl->atl_ep_check(sched->bin->get_atl_ep(), &req_status, &req);
+        atl_status_t atl_status = comm->get_atl_comm()->check(sched->bin->get_atl_ep(), &req);
 
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("BCAST entry failed. atl_status: ", atl_status_to_str(atl_status));
         }
 
-        if (req_status) {
+        if (req.is_completed) {
             status = ccl_sched_entry_status_complete;
         }
     }
diff --git a/src/sched/entry/coll/direct/reduce_entry.hpp b/src/sched/entry/coll/direct/reduce_entry.hpp
index a537b2637..0b6276e0a 100644
--- a/src/sched/entry/coll/direct/reduce_entry.hpp
+++ b/src/sched/entry/coll/direct/reduce_entry.hpp
@@ -47,14 +47,15 @@ class reduce_entry : public base_coll_entry {
     void start() override {
         LOG_DEBUG("REDUCE entry req ", &req, ", cnt ", cnt);
         size_t bytes = cnt * dtype.size();
-        atl_status_t atl_status = comm->atl->atl_ep_reduce(sched->bin->get_atl_ep(),
-                                                           send_buf.get_ptr(bytes),
-                                                           recv_buf.get_ptr(bytes),
-                                                           cnt,
-                                                           root,
-                                                           static_cast<atl_datatype_t>(dtype.idx()),
-                                                           static_cast<atl_reduction_t>(op),
-                                                           &req);
+        atl_status_t atl_status =
+            comm->get_atl_comm()->reduce(sched->bin->get_atl_ep(),
+                                         send_buf.get_ptr(bytes),
+                                         recv_buf.get_ptr(bytes),
+                                         cnt,
+                                         root,
+                                         static_cast<atl_datatype_t>(dtype.idx()),
+                                         static_cast<atl_reduction_t>(op),
+                                         &req);
 
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("REDUCE entry failed. atl_status: ", atl_status_to_str(atl_status));
@@ -64,15 +65,13 @@ class reduce_entry : public base_coll_entry {
     }
 
     void update() override {
-        int req_status;
-        atl_status_t atl_status =
-            comm->atl->atl_ep_check(sched->bin->get_atl_ep(), &req_status, &req);
+        atl_status_t atl_status = comm->get_atl_comm()->check(sched->bin->get_atl_ep(), &req);
 
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("REDUCE entry failed. atl_status: ", atl_status_to_str(atl_status));
         }
 
-        if (req_status)
+        if (req.is_completed)
             status = ccl_sched_entry_status_complete;
     }
 
diff --git a/src/sched/entry/coll/direct/reduce_scatter_entry.hpp b/src/sched/entry/coll/direct/reduce_scatter_entry.hpp
index ed29f28b9..089edf6b5 100644
--- a/src/sched/entry/coll/direct/reduce_scatter_entry.hpp
+++ b/src/sched/entry/coll/direct/reduce_scatter_entry.hpp
@@ -51,13 +51,13 @@ class reduce_scatter_entry : public base_coll_entry {
         size_t recv_bytes = recv_cnt * dtype.size();
 
         atl_status_t atl_status =
-            comm->atl->atl_ep_reduce_scatter(sched->bin->get_atl_ep(),
-                                             send_buf.get_ptr(send_bytes),
-                                             recv_buf.get_ptr(recv_bytes),
-                                             recv_cnt,
-                                             static_cast<atl_datatype_t>(dtype.idx()),
-                                             static_cast<atl_reduction_t>(op),
-                                             &req);
+            comm->get_atl_comm()->reduce_scatter(sched->bin->get_atl_ep(),
+                                                 send_buf.get_ptr(send_bytes),
+                                                 recv_buf.get_ptr(recv_bytes),
+                                                 recv_cnt,
+                                                 static_cast<atl_datatype_t>(dtype.idx()),
+                                                 static_cast<atl_reduction_t>(op),
+                                                 &req);
 
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("REDUCE_SCATTER entry failed. atl_status: ", atl_status_to_str(atl_status));
@@ -67,15 +67,13 @@ class reduce_scatter_entry : public base_coll_entry {
     }
 
     void update() override {
-        int req_status;
-        atl_status_t atl_status =
-            comm->atl->atl_ep_check(sched->bin->get_atl_ep(), &req_status, &req);
+        atl_status_t atl_status = comm->get_atl_comm()->check(sched->bin->get_atl_ep(), &req);
 
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("REDUCE_SCATTER entry failed. atl_status: ", atl_status_to_str(atl_status));
         }
 
-        if (req_status)
+        if (req.is_completed)
             status = ccl_sched_entry_status_complete;
     }
 
diff --git a/src/sched/entry/copy/copy_entry.cpp b/src/sched/entry/copy/copy_entry.cpp
index 6d350ad19..1318f0ef3 100644
--- a/src/sched/entry/copy/copy_entry.cpp
+++ b/src/sched/entry/copy/copy_entry.cpp
@@ -18,7 +18,8 @@
 
 #ifdef CCL_ENABLE_SYCL
 #include <CL/sycl.hpp>
-#include <CL/sycl/backend/level_zero.hpp>
+#include <CL/sycl/backend_types.hpp>
+#include "common/utils/sycl_utils.hpp"
 #endif // CCL_ENABLE_SYCL
 
 copy_entry::copy_entry(ccl_sched* sched,
@@ -28,11 +29,11 @@ copy_entry::copy_entry(ccl_sched* sched,
                        const ccl_datatype& dtype,
                        copy_attr attr)
         :
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
           ze_copy_entry(sched, in_buf, out_buf, count, dtype, attr),
-#else
+#else // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
           sched_entry(sched),
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
           sched(sched),
           in_buf(in_buf),
           out_buf(out_buf),
@@ -105,7 +106,7 @@ void copy_entry::start() {
     }
 
 #ifdef CCL_ENABLE_SYCL
-    if (q->get_backend() != cl::sycl::backend::level_zero || is_sycl_buf) {
+    if (q->get_backend() != ccl::utils::get_level_zero_backend() || is_sycl_buf) {
         ctype = copy_type::sycl;
         if (!is_sycl_buf) {
             if ((in_ptr_type != sycl::usm::alloc::device) &&
@@ -115,18 +116,24 @@ void copy_entry::start() {
             }
         }
 
-        copier = sycl_copier(
-            attr.direction, in_buf, out_buf, count, dtype, is_sycl_buf, attr.in_buf_offset);
+        copier = sycl_copier(attr.direction,
+                             in_buf,
+                             out_buf,
+                             count,
+                             dtype,
+                             is_sycl_buf,
+                             attr.in_buf_offset,
+                             attr.out_buf_offset);
         copier.set_queue(q);
         ccl_tuple_for_each_indexed<ccl_sycl_buffer_one_dim_types>(copier);
         status = ccl_sched_entry_status_started;
     }
-#ifdef MULTI_GPU_SUPPORT
+#ifdef CCL_ENABLE_ZE
     else {
         ctype = copy_type::ze;
         ze_copy_entry::start(); // status
     }
-#endif // MULTI_GPU_SUPPORT
+#endif // CCL_ENABLE_ZE
 #endif // CCL_ENABLE_SYCL
 }
 
@@ -137,17 +144,18 @@ void copy_entry::update() {
             status = ccl_sched_entry_status_complete;
         }
     }
-#ifdef MULTI_GPU_SUPPORT
+#ifdef CCL_ENABLE_ZE
     else {
         ze_copy_entry::update();
     }
-#endif // MULTI_GPU_SUPPORT
+#endif // CCL_ENABLE_ZE
 #endif // CCL_ENABLE_SYCL
 }
 
 void copy_entry::do_regular_copy() {
     size_t bytes = dtype.size() * count;
-    auto comp_status = ccl_comp_copy(in_buf.get_ptr(bytes), out_buf.get_ptr(bytes), count, dtype);
+    auto comp_status =
+        ccl_comp_copy(in_buf.get_ptr(bytes), out_buf.get_ptr(bytes), bytes, attr.use_nontemporal);
     CCL_ASSERT(comp_status == ccl::status::success, "bad status ", comp_status);
     status = ccl_sched_entry_status_complete;
 }
diff --git a/src/sched/entry/copy/copy_entry.hpp b/src/sched/entry/copy/copy_entry.hpp
index 4d0ce48bb..290545b31 100644
--- a/src/sched/entry/copy/copy_entry.hpp
+++ b/src/sched/entry/copy/copy_entry.hpp
@@ -18,17 +18,17 @@
 #include "sched/entry/copy/copy_helper.hpp"
 #include "sched/entry/entry.hpp"
 
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
-#include "sched/entry/gpu/ze_copy_entry.hpp"
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+#include "sched/entry/ze/ze_copy_entry.hpp"
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 
 enum class copy_type : int { regular, sycl, ze };
 
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
 class copy_entry : public ze_copy_entry,
 #else
 class copy_entry : public sched_entry,
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
                    public postponed_fields<copy_entry,
                                            ccl_sched_entry_field_in_buf,
                                            ccl_sched_entry_field_cnt,
@@ -70,6 +70,8 @@ class copy_entry : public sched_entry,
                            out_buf,
                            ", in_buf_offset ",
                            attr.in_buf_offset,
+                           ", out_buf_offset ",
+                           attr.out_buf_offset,
                            "\n");
     }
 
@@ -80,10 +82,12 @@ class copy_entry : public sched_entry,
     const size_t count;
     const ccl_datatype dtype;
     copy_attr attr;
-    copy_type ctype{ copy_type::regular };
 
 #ifdef CCL_ENABLE_SYCL
     sycl_copier copier{};
+#ifdef CCL_ENABLE_ZE
+    copy_type ctype{ copy_type::regular };
+#endif // CCL_ENABLE_ZE
 #endif // CCL_ENABLE_SYCL
 
     void do_regular_copy();
diff --git a/src/sched/entry/copy/copy_helper.cpp b/src/sched/entry/copy/copy_helper.cpp
index f5d2db4e3..8db75901f 100644
--- a/src/sched/entry/copy/copy_helper.cpp
+++ b/src/sched/entry/copy/copy_helper.cpp
@@ -19,19 +19,19 @@ copy_attr::copy_attr(int peer_rank,
                      size_t peer_buf_idx,
                      copy_direction direction,
                      ccl_comm* map_comm,
-                     size_t in_buf_offset)
+                     size_t in_buf_offset,
+                     size_t out_buf_offset)
         : peer_rank(peer_rank),
           peer_buf_idx(peer_buf_idx),
           direction(direction),
           map_comm(map_comm),
-          in_buf_offset(in_buf_offset) {}
+          in_buf_offset(in_buf_offset),
+          out_buf_offset(out_buf_offset) {}
 
-copy_attr::copy_attr(copy_direction direction, size_t in_buf_offset)
-        : peer_rank(ccl_comm::invalid_rank),
-          peer_buf_idx(0),
-          direction(direction),
-          map_comm(nullptr),
-          in_buf_offset(in_buf_offset) {}
+copy_attr::copy_attr(copy_direction direction, size_t in_buf_offset, size_t out_buf_offset)
+        : direction(direction),
+          in_buf_offset(in_buf_offset),
+          out_buf_offset(out_buf_offset) {}
 
 using copy_direction_str_enum =
     utils::enum_to_str<utils::enum_to_underlying(copy_direction::d2d) + 1>;
diff --git a/src/sched/entry/copy/copy_helper.hpp b/src/sched/entry/copy/copy_helper.hpp
index 1e3666885..5222e40a3 100644
--- a/src/sched/entry/copy/copy_helper.hpp
+++ b/src/sched/entry/copy/copy_helper.hpp
@@ -27,19 +27,24 @@ enum class copy_direction { undefined, h2h, d2h, h2d, d2d };
 std::string to_string(copy_direction val);
 
 struct copy_attr {
-    int peer_rank;
-    size_t peer_buf_idx;
-    copy_direction direction;
-    ccl_comm* map_comm;
-    size_t in_buf_offset;
-
-    copy_attr(int peer_rank = ccl_comm::invalid_rank,
-              size_t peer_buf_idx = 0,
-              copy_direction direction = copy_direction::undefined,
+    int peer_rank = ccl_comm::invalid_rank;
+    size_t peer_buf_idx = 0;
+    copy_direction direction = copy_direction::undefined;
+    ccl_comm* map_comm = nullptr;
+    size_t in_buf_offset = 0;
+    size_t out_buf_offset = 0;
+    bool use_nontemporal = false;
+
+    copy_attr() {}
+
+    copy_attr(int peer_rank,
+              size_t peer_buf_idx,
+              copy_direction direction,
               ccl_comm* map_comm = nullptr,
-              size_t in_buf_offset = 0);
+              size_t in_buf_offset = 0,
+              size_t out_buf_offset = 0);
 
-    copy_attr(copy_direction direction, size_t in_buf_offset = 0);
+    copy_attr(copy_direction direction, size_t in_buf_offset = 0, size_t out_buf_offset = 0);
 };
 
 #ifdef CCL_ENABLE_SYCL
@@ -52,14 +57,16 @@ struct sycl_copier {
                 size_t count,
                 const ccl_datatype& dtype,
                 bool is_sycl_buf = false,
-                size_t in_buf_offset = 0)
+                size_t in_buf_offset = 0,
+                size_t out_buf_offset = 0)
             : direction(direction),
               in_buf(in_buf),
               out_buf(out_buf),
               count(count),
               dtype(dtype),
               is_sycl_buf(is_sycl_buf),
-              in_buf_offset(in_buf_offset) {}
+              in_buf_offset(in_buf_offset),
+              out_buf_offset(out_buf_offset) {}
 
     bool is_completed() {
         return (e.get_info<sycl::info::event::command_execution_status>() ==
@@ -91,7 +98,8 @@ struct sycl_copier {
             if (direction == copy_direction::d2d) {
                 CCL_THROW_IF_NOT(!is_sycl_buf, "D2D + SYCL buffer");
                 e = q->submit([&](sycl::handler& h) {
-                    h.memcpy(out_buf_ptr,
+                    h.memcpy(static_cast<typename specific_sycl_buffer::value_type*>(out_buf_ptr) +
+                                 out_buf_offset,
                              static_cast<typename specific_sycl_buffer::value_type*>(in_buf_ptr) +
                                  in_buf_offset,
                              bytes);
@@ -114,6 +122,8 @@ struct sycl_copier {
                       count,
                       ", in_buf_offset: ",
                       in_buf_offset,
+                      ", out_buf_offset: ",
+                      out_buf_offset,
                       ", dtype_size: ",
                       dtype.size(),
                       ", bytes: ",
@@ -145,13 +155,15 @@ struct sycl_copier {
                     auto& dst_buf = (direction == copy_direction::h2d) ? *device_buf_ptr : host_buf;
                     auto src_buf_acc = src_buf.template get_access<sycl::access::mode::read>(
                         h, count, in_buf_offset);
-                    auto dst_buf_acc = dst_buf.template get_access<sycl::access::mode::write>(h);
+                    auto dst_buf_acc = dst_buf.template get_access<sycl::access::mode::write>(
+                        h, count, out_buf_offset);
                     h.copy(src_buf_acc, dst_buf_acc);
                 });
             }
             else {
                 /* don't do special cast, provided USM pointer can be used as is in copy kernel */
-                e = q->memcpy(out_buf_ptr,
+                e = q->memcpy(static_cast<typename specific_sycl_buffer::value_type*>(out_buf_ptr) +
+                                  out_buf_offset,
                               static_cast<typename specific_sycl_buffer::value_type*>(in_buf_ptr) +
                                   in_buf_offset,
                               bytes);
@@ -178,6 +190,7 @@ struct sycl_copier {
     bool is_sycl_buf;
     sycl::queue* q;
     size_t in_buf_offset;
+    size_t out_buf_offset;
     sycl::event e;
 };
 
diff --git a/src/sched/entry/deps_entry.hpp b/src/sched/entry/deps_entry.hpp
index 8e23ebb69..61afbe7e2 100644
--- a/src/sched/entry/deps_entry.hpp
+++ b/src/sched/entry/deps_entry.hpp
@@ -13,6 +13,12 @@ class deps_entry : public sched_entry {
 
     void start() override {
         status = ccl_sched_entry_status_started;
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+        if (ccl::global_data::env().enable_kernel_profile && sched->coll_param.stream) {
+            sched->master_sched->get_kernel_timer().set_deps_start_time(
+                ccl::ze::calculate_global_time(sched->coll_param.stream->get_ze_device()));
+        }
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
     }
 
     void update() override {
@@ -29,6 +35,12 @@ class deps_entry : public sched_entry {
 
         if (all_completed) {
             status = ccl_sched_entry_status_complete;
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+            if (ccl::global_data::env().enable_kernel_profile && sched->coll_param.stream) {
+                sched->master_sched->get_kernel_timer().set_deps_end_time(
+                    ccl::ze::calculate_global_time(sched->coll_param.stream->get_ze_device()));
+            }
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
         }
     }
 
diff --git a/src/sched/entry/deregister_entry.hpp b/src/sched/entry/deregister_entry.hpp
index 26515829a..774d99b06 100644
--- a/src/sched/entry/deregister_entry.hpp
+++ b/src/sched/entry/deregister_entry.hpp
@@ -36,7 +36,7 @@ class deregister_entry : public sched_entry {
         std::list<atl_mr_t*>::iterator it;
         for (it = mr_list.begin(); it != mr_list.end(); it++) {
             LOG_DEBUG("deregister mr ", *it);
-            atl_status = comm->atl->atl_mr_dereg(*it);
+            atl_status = comm->get_atl_comm()->mr_dereg(*it);
             if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
                 CCL_THROW("DEREGISTER entry failed. atl_status: ", atl_status_to_str(atl_status));
             }
diff --git a/src/sched/entry/entry.hpp b/src/sched/entry/entry.hpp
index 5ca1b0bcb..9d3d4f2e2 100644
--- a/src/sched/entry/entry.hpp
+++ b/src/sched/entry/entry.hpp
@@ -15,7 +15,7 @@
 */
 #pragma once
 
-#include "atl/atl.h"
+#include "atl/atl_base_comm.hpp"
 #include "common/datatype/datatype.hpp"
 #include "common/utils/utils.hpp"
 #include "sched/sched_timer.hpp"
@@ -79,6 +79,9 @@ class alignas(CACHELINE_SIZE) sched_entry {
 
     ccl::sched_timer timer;
 
+    virtual void init(){};
+    virtual void finalize(){};
+
 protected:
     virtual void start() = 0;
     virtual void update();
diff --git a/src/sched/entry/factory/chunked_entry_factory.cpp b/src/sched/entry/factory/chunked_entry_factory.cpp
index 44060dd17..db3e6d4cb 100644
--- a/src/sched/entry/factory/chunked_entry_factory.cpp
+++ b/src/sched/entry/factory/chunked_entry_factory.cpp
@@ -26,7 +26,7 @@ void make_chunked_send_entry(ccl_sched* sched,
         "send",
         dtype,
         cnt,
-        make_entry<send_entry>(chunk_sched, buf + chunk_offset, chunk_size, dtype, dst, comm),
+        create<send_entry>(chunk_sched, buf + chunk_offset, chunk_size, dtype, dst, comm),
         { chunk_sched = sched; });
 }
 
@@ -40,33 +40,31 @@ void make_chunked_recv_entry(ccl_sched* sched,
         "recv",
         dtype,
         cnt,
-        make_entry<recv_entry>(chunk_sched, buf + chunk_offset, chunk_size, dtype, src, comm),
+        create<recv_entry>(chunk_sched, buf + chunk_offset, chunk_size, dtype, src, comm),
         { chunk_sched = sched; });
 }
 
 void make_chunked_recv_reduce_entry(ccl_sched* sched,
                                     ccl_buffer inout_buf,
                                     size_t cnt,
-                                    size_t* out_cnt,
                                     const ccl_datatype& dtype,
                                     ccl::reduction reduction_op,
                                     int src,
-                                    ccl_buffer comm_buf,
                                     ccl_comm* comm,
+                                    ccl_buffer comm_buf,
                                     ccl_recv_reduce_result_buf_type result_buf_type) {
     CCL_CHUNKED_ENTRY_FUNCTION("recv_reduce",
                                dtype,
                                cnt,
-                               make_entry<recv_reduce_entry>(chunk_sched,
-                                                             inout_buf + chunk_offset,
-                                                             chunk_size,
-                                                             out_cnt,
-                                                             dtype,
-                                                             reduction_op,
-                                                             src,
-                                                             comm_buf + chunk_offset,
-                                                             comm,
-                                                             result_buf_type),
+                               create<recv_reduce_entry>(chunk_sched,
+                                                         inout_buf + chunk_offset,
+                                                         chunk_size,
+                                                         dtype,
+                                                         reduction_op,
+                                                         src,
+                                                         comm,
+                                                         comm_buf + chunk_offset,
+                                                         result_buf_type),
                                { chunk_sched = sched; });
 }
 
@@ -81,7 +79,7 @@ void make_chunked_send_entry(std::vector<ccl_sched*>& scheds,
         "send",
         dtype,
         cnt,
-        make_entry<send_entry>(chunk_sched, buf + chunk_offset, chunk_size, dtype, dst, comm),
+        create<send_entry>(chunk_sched, buf + chunk_offset, chunk_size, dtype, dst, comm),
         { chunk_sched = scheds[(first_sched_idx + chunk_idx) % scheds.size()]; });
 }
 
@@ -96,7 +94,7 @@ void make_chunked_recv_entry(std::vector<ccl_sched*>& scheds,
         "recv",
         dtype,
         cnt,
-        make_entry<recv_entry>(chunk_sched, buf + chunk_offset, chunk_size, dtype, src, comm),
+        create<recv_entry>(chunk_sched, buf + chunk_offset, chunk_size, dtype, src, comm),
         { chunk_sched = scheds[(first_sched_idx + chunk_idx) % scheds.size()]; });
 }
 
@@ -110,7 +108,7 @@ void make_chunked_copy_entry(std::vector<ccl_sched*>& scheds,
         "copy",
         dtype,
         cnt,
-        make_entry<copy_entry>(
+        create<copy_entry>(
             chunk_sched, in_buf + chunk_offset, out_buf + chunk_offset, chunk_size, dtype),
         { chunk_sched = scheds[(first_sched_idx + chunk_idx) % scheds.size()]; });
 }
diff --git a/src/sched/entry/factory/entry_factory.h b/src/sched/entry/factory/entry_factory.h
index ba971b265..c08f4ab83 100644
--- a/src/sched/entry/factory/entry_factory.h
+++ b/src/sched/entry/factory/entry_factory.h
@@ -25,19 +25,19 @@
 // declares interface for all entries creations
 namespace entry_factory {
 template <class EntryType, class... Arguments>
-EntryType* make_entry(ccl_sched* sched, Arguments&&... args);
+EntryType* create(ccl_sched* sched, Arguments&&... args);
 
 namespace detail {
 template <class EntryType>
 struct entry_creator {
     template <class T, class... U>
-    friend T* make_entry(ccl_sched* sched, U&&... args);
+    friend T* create(ccl_sched* sched, U&&... args);
 
     template <class T, ccl_sched_add_mode mode, class... U>
-    friend T* make_entry(ccl_sched* sched, U&&... args);
+    friend T* create(ccl_sched* sched, U&&... args);
 
     template <ccl_sched_add_mode mode, class... Arguments>
-    static EntryType* create(ccl_sched* sched, Arguments&&... args) {
+    static EntryType* make_entry(ccl_sched* sched, Arguments&&... args) {
         return static_cast<EntryType*>(sched->add_entry(
             std::unique_ptr<EntryType>(new EntryType(sched, std::forward<Arguments>(args)...)),
             ccl_sched_base::add_entry_mode_t<mode>()));
diff --git a/src/sched/entry/factory/entry_factory.hpp b/src/sched/entry/factory/entry_factory.hpp
index 90f8c51af..083011b92 100644
--- a/src/sched/entry/factory/entry_factory.hpp
+++ b/src/sched/entry/factory/entry_factory.hpp
@@ -35,6 +35,7 @@
 #include "sched/entry/probe_entry.hpp"
 #include "sched/entry/prologue_entry.hpp"
 #include "sched/entry/recv_entry.hpp"
+#include "sched/entry/recv_copy_entry.hpp"
 #include "sched/entry/recv_reduce_entry.hpp"
 #include "sched/entry/reduce_local_entry.hpp"
 #include "sched/entry/register_entry.hpp"
@@ -45,51 +46,33 @@
 #include "sched/entry/wait_value_entry.hpp"
 #include "sched/entry/write_entry.hpp"
 
-#if defined(MULTI_GPU_SUPPORT) && defined(CCL_ENABLE_SYCL)
-#include "sched/entry/gpu/ze_allreduce_entry.hpp"
-#include "sched/entry/gpu/ze_copy_entry.hpp"
-#include "sched/entry/gpu/ze_handle_exchange_entry.hpp"
-#include "sched/entry/gpu/ze_event_signal_entry.hpp"
-#include "sched/entry/gpu/ze_event_wait_entry.hpp"
-#include "sched/entry/gpu/ze_reduce_entry.hpp"
-#endif // MULTI_GPU_SUPPORT && CCL_ENABLE_SYCL
+#if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL)
+#include "sched/entry/ze/allreduce/ze_a2a_allreduce_entry.hpp"
+#include "sched/entry/ze/allreduce/ze_onesided_allreduce_entry.hpp"
+#include "sched/entry/ze/allreduce/ze_ring_allreduce_entry.hpp"
+#include "sched/entry/ze/ze_a2a_allgatherv_entry.hpp"
+#include "sched/entry/ze/ze_a2a_gatherv_entry.hpp"
+#include "sched/entry/ze/ze_a2a_reduce_scatter_entry.hpp"
+#include "sched/entry/ze/ze_barrier_entry.hpp"
+#include "sched/entry/ze/ze_copy_entry.hpp"
+#include "sched/entry/ze/ze_handle_exchange_entry.hpp"
+#include "sched/entry/ze/ze_event_signal_entry.hpp"
+#include "sched/entry/ze/ze_event_wait_entry.hpp"
+#include "sched/entry/ze/ze_onesided_reduce_entry.hpp"
+#include "sched/entry/ze/ze_reduce_local_entry.hpp"
+#endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL
 
 #include "sched/sched.hpp"
 
 namespace entry_factory {
-/* generic interface for entry creation */
+
 template <class EntryType, class... Arguments>
-EntryType* make_entry(ccl_sched* sched, Arguments&&... args) {
-    LOG_DEBUG("creating ", EntryType::class_name(), " entry");
-    EntryType* new_entry = detail::entry_creator<EntryType>::template create<
+EntryType* create(ccl_sched* sched, Arguments&&... args) {
+    LOG_DEBUG("creating: ", EntryType::class_name(), " entry");
+    EntryType* new_entry = detail::entry_creator<EntryType>::template make_entry<
         ccl_sched_add_mode::ccl_sched_add_mode_last_value>(sched, std::forward<Arguments>(args)...);
-    LOG_DEBUG("created: ", EntryType::class_name(), ", entry: ", new_entry, ", for sched: ", sched);
+    LOG_DEBUG("created: ", EntryType::class_name(), ", entry: ", new_entry, ", sched: ", sched);
     return new_entry;
 }
 
-template <class EntryType, ccl_sched_add_mode mode, class... Arguments>
-EntryType* make_ordered_entry(ccl_sched* sched, Arguments&&... args) {
-    LOG_DEBUG("creating ", EntryType::class_name(), " entry, use mode: ", to_string(mode));
-    return detail::entry_creator<EntryType>::template create<mode>(
-        sched, std::forward<Arguments>(args)...);
-}
-
-/* Example for non-standard entry 'my_non_standard_entry' creation
-    namespace detail
-    {
-        template <>
-        class entry_creator<my_non_standard_entry>
-        {
-            public:
-            static my_non_standard_entry* create(/ *** specific parameters for construction *** /)
-            {
-                auto &&new_entry = std::unique_ptr<my_non_standard_entry>(
-                            new my_non_standard_entry(/ *** specific parameters for construction *** /));
-
-                //Add custom contruction/registration logic, if needed
-
-                return static_cast<my_non_standard_entry*>(sched->add_entry(std::move(new_entry)));
-            }
-        };
-    }*/
 } // namespace entry_factory
diff --git a/src/sched/entry/gpu/ze_base_entry.cpp b/src/sched/entry/gpu/ze_base_entry.cpp
deleted file mode 100644
index a8998634b..000000000
--- a/src/sched/entry/gpu/ze_base_entry.cpp
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "common/stream/stream.hpp"
-#include "sched/queue/queue.hpp"
-
-#include "sched/entry/gpu/ze_base_entry.hpp"
-#include "sched/entry/gpu/ze_cache.hpp"
-#include "sched/entry/gpu/ze_call.hpp"
-#include "ze_primitives.hpp"
-
-#include <CL/sycl/backend/level_zero.hpp>
-
-using namespace ccl;
-using namespace ccl::ze;
-
-ze_base_entry::ze_base_entry(ccl_sched *sched, ccl_comm *comm, uint32_t add_event_count)
-        : sched_entry(sched),
-          sched(sched),
-          comm(comm),
-          add_event_count(add_event_count) {
-    CCL_THROW_IF_NOT(sched, "no sched");
-    if (!comm) {
-        comm = sched->coll_param.comm;
-    }
-    CCL_THROW_IF_NOT(comm, "no comm");
-    comm_rank = comm->rank();
-    comm_size = comm->size();
-}
-
-void ze_base_entry::init(init_mode ze_init_mode) {
-    if (is_initialized) {
-        return;
-    }
-    worker_idx = sched->queue->get_idx();
-
-    CCL_THROW_IF_NOT(sched->coll_param.stream, "null stream");
-
-    LOG_DEBUG("getting a native stream");
-    auto native_stream = sched->coll_param.stream->get_native_stream(worker_idx);
-    if (native_stream->get_backend() != sycl::backend::level_zero) {
-        CCL_THROW("unsupported sycl backend");
-    }
-
-    auto sycl_device = native_stream->get_device();
-    device = sycl_device.template get_native<sycl::backend::level_zero>();
-
-    auto sycl_context = native_stream->get_context();
-    context = sycl_context.template get_native<sycl::backend::level_zero>();
-
-    /* get queue properties */
-    uint32_t num_queue_groups;
-    get_num_queue_groups(device, &num_queue_groups);
-
-    ze_queue_properties_t queue_props;
-    get_queues_properties(device, num_queue_groups, &queue_props);
-
-    /* init compute queue, list */
-    if (init_mode::compute & ze_init_mode) {
-        LOG_DEBUG("compute init mode is enabled");
-        get_comp_primitives(queue_props, comp_primitives);
-        init_primitives(comp_primitives);
-    }
-
-    /* init copy queue, list */
-    if (init_mode::copy & ze_init_mode) {
-        LOG_DEBUG("copy init mode is enabled");
-        get_copy_primitives(queue_props, copy_primitives, ze_init_mode);
-        init_primitives(copy_primitives);
-    }
-
-    /* create event pool */
-    event_pool_desc = default_event_pool_desc;
-    event_pool_desc.count = 1 + add_event_count; // at least one event to track progress
-    global_data::get().ze_cache->get(worker_idx, context, event_pool_desc, &event_pool);
-    LOG_DEBUG("get event pool: { max event count: ", event_pool_desc.count, " }");
-
-    /* create event */
-    ze_event_desc_t event_desc = default_event_desc;
-    event_desc.signal = ZE_EVENT_SCOPE_FLAG_SUBDEVICE;
-    event_desc.wait = ZE_EVENT_SCOPE_FLAG_SUBDEVICE;
-    event_desc.index = 0;
-    ZE_CALL(zeEventCreate, (event_pool, &event_desc, &entry_event));
-
-    is_initialized = true;
-}
-
-void ze_base_entry::finalize() {
-    if (!is_initialized) {
-        return;
-    }
-    ZE_CALL(zeEventDestroy, (entry_event));
-
-    /* event pool */
-    global_data::get().ze_cache->push(worker_idx, context, event_pool_desc, event_pool);
-
-    if (comp_primitives.list && comp_primitives.queue) {
-        LOG_DEBUG("push from cache for compute list and queue");
-        /* list */
-        global_data::get().ze_cache->push(
-            worker_idx, context, device, comp_primitives.list_desc, comp_primitives.list);
-
-        /* queue */
-        global_data::get().ze_cache->push(
-            worker_idx, context, device, comp_primitives.queue_desc, comp_primitives.queue);
-    }
-
-    if (copy_primitives.list && copy_primitives.queue) {
-        LOG_DEBUG("push from cache for copy list and queue");
-        /* copy list */
-        global_data::get().ze_cache->push(
-            worker_idx, context, device, copy_primitives.list_desc, copy_primitives.list);
-
-        /* copy queue */
-        global_data::get().ze_cache->push(
-            worker_idx, context, device, copy_primitives.queue_desc, copy_primitives.queue);
-    }
-
-    is_initialized = false;
-}
-
-void ze_base_entry::start() {
-    CCL_THROW_IF_NOT(entry_event, "no entry event");
-    ZE_CALL(zeEventHostReset, (entry_event));
-
-    if (comp_primitives.list && comp_primitives.queue) {
-        LOG_DEBUG("execute compute command list");
-        ZE_CALL(zeCommandQueueExecuteCommandLists,
-                (comp_primitives.queue, 1, &comp_primitives.list, nullptr));
-    }
-
-    if (copy_primitives.list && copy_primitives.queue) {
-        LOG_DEBUG("execute copy command list");
-        ZE_CALL(zeCommandQueueExecuteCommandLists,
-                (copy_primitives.queue, 1, &copy_primitives.list, nullptr));
-    }
-
-    if (((global_data::env().ze_serialize_mode & ze_call::serialize_mode::block)) != 0) {
-        LOG_DEBUG("wait until command lists are executed");
-        if (copy_primitives.queue)
-            ZE_CALL(zeHostSynchronize, (copy_primitives.queue));
-        if (comp_primitives.queue)
-            ZE_CALL(zeHostSynchronize, (comp_primitives.queue));
-    }
-}
-
-void ze_base_entry::update() {
-    ze_result_t query_status;
-
-    if (global_data::env().kernel_debug == 0) {
-        query_status = zeEventQueryStatus(entry_event);
-    }
-    else {
-        if (copy_primitives.queue)
-            query_status = zeHostSynchronize(copy_primitives.queue);
-        if (comp_primitives.queue)
-            query_status = zeHostSynchronize(comp_primitives.queue);
-    }
-
-    if (query_status == ZE_RESULT_SUCCESS) {
-        LOG_DEBUG("command list complete");
-        status = ccl_sched_entry_status_complete;
-    }
-    else if (query_status == ZE_RESULT_NOT_READY) {
-        // just return in case if the kernel is not ready yet, will check again on the next iteration
-        return;
-    }
-    else {
-        CCL_THROW("error at zeEventQueryStatus");
-    }
-}
-
-ze_command_list_handle_t ze_base_entry::get_copy_list() {
-    ze_command_list_handle_t list = nullptr;
-    if (copy_primitives.list) {
-        list = copy_primitives.list;
-        LOG_DEBUG("copy list is returned");
-    }
-    else {
-        list = comp_primitives.list;
-        LOG_DEBUG("compute list is returned");
-    }
-    CCL_THROW_IF_NOT(list, "command list is invalid");
-    return list;
-}
-
-void ze_base_entry::get_comp_primitives(const ze_queue_properties_t &queue_props,
-                                        cmd_primitives &comp_primitives) {
-    uint32_t ordinal, queue_index;
-    get_comp_queue_ordinal(device, queue_props, &ordinal);
-    get_queue_index(queue_props, ordinal, comm_rank, &queue_index);
-
-    comp_primitives.queue_desc.ordinal = ordinal;
-    comp_primitives.queue_desc.index = queue_index;
-    comp_primitives.list_desc.commandQueueGroupOrdinal = ordinal;
-}
-
-void ze_base_entry::get_copy_primitives(const ze_queue_properties_t &queue_props,
-                                        cmd_primitives &copy_primitives,
-                                        init_mode ze_init_mode) {
-    uint32_t ordinal, queue_index;
-    get_copy_queue_ordinal(device, queue_props, &ordinal);
-
-    // TODO: index depends on rank's changing, when > 1 queues are created,
-    // the index is still the same for different queues, that's the issue.
-    // WA is adding optional counter, which says the order number of a queue.
-    // Need to think, how we'd calculate the index for every queue.
-    // Hang in case of CCL_KERNEL_1S_USE_COPY_OPS=1 CCL_ZE_COPY_ENGINE=none
-    if (ze_init_mode == (init_mode::copy | init_mode::compute)) {
-        get_queue_index(queue_props, ordinal, comm_rank + 1, &queue_index);
-    }
-    else {
-        get_queue_index(queue_props, ordinal, comm_rank, &queue_index);
-    }
-
-    copy_primitives.queue_desc.ordinal = ordinal;
-    copy_primitives.queue_desc.index = queue_index;
-    copy_primitives.list_desc.commandQueueGroupOrdinal = ordinal;
-}
-
-void ze_base_entry::init_primitives(cmd_primitives &cmd_primitives) {
-    global_data::get().ze_cache->get(
-        worker_idx, context, device, cmd_primitives.queue_desc, &cmd_primitives.queue);
-    LOG_DEBUG("get queue: { ordinal: ",
-              cmd_primitives.queue_desc.ordinal,
-              ", index: ",
-              cmd_primitives.queue_desc.index,
-              " }");
-
-    global_data::get().ze_cache->get(
-        worker_idx, context, device, cmd_primitives.list_desc, &cmd_primitives.list);
-    LOG_DEBUG("get list: { ordinal: ", cmd_primitives.list_desc.commandQueueGroupOrdinal, " }");
-}
diff --git a/src/sched/entry/gpu/ze_reduce_entry.cpp b/src/sched/entry/gpu/ze_reduce_entry.cpp
deleted file mode 100644
index 066442898..000000000
--- a/src/sched/entry/gpu/ze_reduce_entry.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "common/comm/l0/modules/kernel_utils.hpp"
-#include "common/stream/stream.hpp"
-#include "sched/entry/gpu/ze_cache.hpp"
-#include "sched/entry/gpu/ze_primitives.hpp"
-#include "sched/entry/gpu/ze_reduce_entry.hpp"
-#include "sched/queue/queue.hpp"
-
-#include <string>
-
-using namespace ccl;
-using namespace ccl::ze;
-
-ze_reduce_entry::ze_reduce_entry(ccl_sched* sched,
-                                 ccl_buffer send_buf,
-                                 ccl_buffer recv_buf,
-                                 size_t cnt,
-                                 const ccl_datatype& dtype,
-                                 reduction op,
-                                 int root,
-                                 ccl_comm* comm)
-        : ze_base_entry(sched, comm, 2 /* request additional events */),
-          send_buf(send_buf),
-          recv_buf(recv_buf),
-          cnt(cnt),
-          dtype(dtype),
-          op(op),
-          root(root),
-          buf_size_bytes(dtype.size() * cnt),
-          is_initialized(false),
-          empty_kernel_event(nullptr),
-          empty_kernel(nullptr),
-          empty_kernel_name("empty_kernel") {}
-
-ze_reduce_entry::~ze_reduce_entry() {
-    finalize();
-}
-
-void ze_reduce_entry::init() {
-    if (is_initialized) {
-        return;
-    }
-
-    LOG_DEBUG("initialization");
-
-    init_mode init_mode_type;
-    if (global_data::env().enable_kernel_1s_copy_ops) {
-        init_mode_type = (init_mode::copy | init_mode::compute);
-    }
-    else {
-        init_mode_type = init_mode::compute;
-    }
-
-    CCL_THROW_IF_NOT(comm_rank == root, "unexpected comm_rank ", comm_rank, ", expected ", root);
-
-    ze_base_entry::init(init_mode_type);
-
-    /* create kernels */
-    ccl_buffer right_send_buf;
-    int peer_rank = (comm_rank + 1) % comm_size;
-    sched->get_memory().handle_manager.get(peer_rank, 0, right_send_buf, comm);
-    LOG_DEBUG(
-        "get IPC pointers from ", peer_rank, " by ", root, ", right_send_buf: ", right_send_buf);
-
-    send_buf_ptr = send_buf.get_ptr();
-    recv_buf_ptr = recv_buf.get_ptr();
-    // TODO: in place case check! diff idx for handle_mngr
-
-    right_send_buf_ptr = right_send_buf.get_ptr();
-
-    ze_kernel_args_t reduce_local_kernel_args = { { sizeof(comm_rank), &comm_rank },
-                                                  { sizeof(comm_size), &comm_size },
-                                                  { sizeof(cnt), &cnt },
-                                                  { sizeof(send_buf_ptr), &send_buf_ptr },
-                                                  { sizeof(tmp_buf_ptr), &tmp_buf_ptr },
-                                                  { sizeof(recv_buf_ptr), &recv_buf_ptr } };
-
-    ccl::global_data::get().ze_cache->get(context, device, "kernels.spv", &module);
-
-    device_mem_alloc_desc = default_device_mem_alloc_desc;
-    ccl::global_data::get().ze_cache->get(worker_idx,
-                                          context,
-                                          device,
-                                          device_mem_alloc_desc,
-                                          buf_size_bytes,
-                                          0, /*alignment*/
-                                          &tmp_buf_ptr);
-
-    main_kernel_name =
-        "reduce_local_outofplace_kernel_" + to_string(dtype.idx()) + "_" + ccl_reduction_to_str(op);
-    LOG_DEBUG("get kernel: name: ", main_kernel_name);
-    ccl::global_data::get().ze_cache->get(worker_idx, module, main_kernel_name, &main_kernel);
-
-    auto& main_kernel_args = reduce_local_kernel_args;
-    LOG_DEBUG("kernel ", main_kernel, " args:\n", to_string(main_kernel_args));
-    set_kernel_args(main_kernel, main_kernel_args);
-
-    ze_group_size_t group_size;
-    get_suggested_group_size(main_kernel, cnt, &group_size);
-    LOG_DEBUG("suggested group size: ", to_string(group_size));
-
-    get_suggested_group_count(group_size, cnt, &group_count);
-    LOG_DEBUG("suggested group count: ", to_string(group_count));
-
-    ZE_CALL(zeKernelSetGroupSize,
-            (main_kernel, group_size.groupSizeX, group_size.groupSizeY, group_size.groupSizeZ));
-
-    if (ccl::global_data::env().enable_kernel_1s_ipc_wa) {
-        LOG_DEBUG("get kernel: name: ", empty_kernel_name);
-        ccl::global_data::get().ze_cache->get(worker_idx, module, empty_kernel_name, &empty_kernel);
-        CCL_THROW_IF_NOT(empty_kernel, "null empty_kernel");
-        /* use allreduce_kernel_args since they have pointers to peer mem */
-        set_kernel_args(empty_kernel, main_kernel_args);
-    }
-
-    ze_event_desc_t event_desc = default_event_desc;
-    event_desc.signal = ZE_EVENT_SCOPE_FLAG_SUBDEVICE;
-    event_desc.wait = ZE_EVENT_SCOPE_FLAG_SUBDEVICE;
-
-    uint32_t last_event_idx = 1; // 0 is used to track entry progress
-
-    if (empty_kernel) {
-        LOG_DEBUG("create event for empty kernel");
-        event_desc.index = last_event_idx++;
-        ZE_CALL(zeEventCreate, (event_pool, &event_desc, &empty_kernel_event));
-    }
-
-    event_desc.index = last_event_idx++;
-    ZE_CALL(zeEventCreate, (event_pool, &event_desc, &copy_from_peer_event));
-
-    LOG_DEBUG("real event count: ", last_event_idx);
-
-    /* do appends */
-    if (empty_kernel) {
-        LOG_DEBUG("append empty kernel");
-        ze_group_count_t empty_group_count = { 1, 1, 1 };
-        ZE_CALL(zeCommandListAppendLaunchKernel,
-                (comp_primitives.list,
-                 empty_kernel,
-                 &empty_group_count,
-                 empty_kernel_event,
-                 0,
-                 nullptr));
-    }
-
-    LOG_DEBUG("one-sided multi-phase algorithm");
-
-    ZE_CALL(zeCommandListAppendMemoryCopy,
-            (ze_base_entry::get_copy_list(),
-             tmp_buf_ptr,
-             right_send_buf_ptr,
-             buf_size_bytes,
-             copy_from_peer_event,
-             (empty_kernel_event) ? 1 : 0,
-             &empty_kernel_event));
-
-    ZE_CALL(
-        zeCommandListAppendLaunchKernel,
-        (comp_primitives.list, main_kernel, &group_count, entry_event, 1, &copy_from_peer_event));
-
-    ZE_CALL(zeCommandListClose, (comp_primitives.list));
-    if (global_data::env().enable_kernel_1s_copy_ops) {
-        ZE_CALL(zeCommandListClose, (ze_base_entry::copy_primitives.list));
-    }
-
-    is_initialized = true;
-
-    LOG_DEBUG("initialization complete");
-}
-
-void ze_reduce_entry::start() {
-    init();
-
-    if (is_initialized && status == ccl_sched_entry_status_not_started) {
-        reset_sync_objects();
-    }
-
-    size_t kernel_counter = 0;
-    if (ccl::global_data::env().enable_kernel_sync) {
-        kernel_counter = ccl::global_data::get().kernel_counter++;
-    }
-
-    if (kernel_counter == 0) {
-        ze_base_entry::start();
-        status = ccl_sched_entry_status_started;
-    }
-    else {
-        ccl::global_data::get().kernel_counter--;
-        status = ccl_sched_entry_status_again;
-    }
-}
-
-void ze_reduce_entry::update() {
-    ze_base_entry::update();
-    if (status == ccl_sched_entry_status_complete && !sched->coll_attr.to_cache) {
-        finalize();
-    }
-
-    if (ccl::global_data::env().enable_kernel_sync && ccl::global_data::get().kernel_counter > 0) {
-        ccl::global_data::get().kernel_counter--;
-    }
-}
-
-void ze_reduce_entry::finalize() {
-    if (!is_initialized) {
-        return;
-    }
-
-    LOG_DEBUG("finalization");
-
-    /* events */
-    LOG_DEBUG("copy event finalization");
-    ZE_CALL(zeEventDestroy, (copy_from_peer_event));
-    /* device mem */
-    ccl::global_data::get().ze_cache->push(worker_idx,
-                                           context,
-                                           device,
-                                           device_mem_alloc_desc,
-                                           buf_size_bytes,
-                                           0, /*alignment*/
-                                           tmp_buf_ptr);
-
-    /* kernels */
-    if (empty_kernel_event) {
-        ZE_CALL(zeEventDestroy, (empty_kernel_event));
-        ccl::global_data::get().ze_cache->push(worker_idx, module, empty_kernel_name, empty_kernel);
-    }
-    ccl::global_data::get().ze_cache->push(worker_idx, module, main_kernel_name, main_kernel);
-
-    ze_base_entry::finalize();
-
-    is_initialized = false;
-
-    LOG_DEBUG("finalization complete");
-}
-
-void ze_reduce_entry::reset_sync_objects() {
-    if (empty_kernel_event) {
-        ZE_CALL(zeEventHostReset, (empty_kernel_event));
-    }
-    ZE_CALL(zeEventHostReset, (copy_from_peer_event));
-}
diff --git a/src/sched/entry/probe_entry.hpp b/src/sched/entry/probe_entry.hpp
index 972e680d4..2f4112045 100644
--- a/src/sched/entry/probe_entry.hpp
+++ b/src/sched/entry/probe_entry.hpp
@@ -33,9 +33,8 @@ class probe_entry : public sched_entry {
               comm(comm) {}
 
     void start() override {
-        int global_src = comm->get_global_rank(src);
-        atl_tag = comm->atl->tag->create(
-            global_src, sched->get_comm_id(), sched->sched_id, sched->get_op_id());
+        atl_tag = comm->get_atl_comm()->tag->create(
+            src, sched->get_comm_id(), sched->sched_id, sched->get_op_id());
         LOG_DEBUG("PROBE entry src ", src, ", tag ", atl_tag);
         status = ccl_sched_entry_status_started;
     }
@@ -44,10 +43,8 @@ class probe_entry : public sched_entry {
         int found = 0;
         size_t len = 0;
 
-        int global_src = comm->get_global_rank(src);
-
         atl_status_t atl_status =
-            comm->atl->atl_ep_probe(sched->bin->get_atl_ep(), global_src, atl_tag, &found, &len);
+            comm->get_atl_comm()->probe(sched->bin->get_atl_ep(), src, atl_tag, &found, &len);
 
         update_status(atl_status);
 
diff --git a/src/sched/entry/recv_copy_entry.cpp b/src/sched/entry/recv_copy_entry.cpp
new file mode 100644
index 000000000..8aef87a74
--- /dev/null
+++ b/src/sched/entry/recv_copy_entry.cpp
@@ -0,0 +1,58 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "comp/comp.hpp"
+#include "sched/entry/entry.hpp"
+#include "sched/entry/recv_copy_entry.hpp"
+#include "sched/queue/queue.hpp"
+
+void recv_copy_entry::start() {
+    atl_tag = comm->get_atl_comm()->tag->create(
+        src, sched->get_comm_id(), sched->sched_id, sched->get_op_id());
+    LOG_DEBUG("starting RECV in RECV_COPY entry, src ",
+              src,
+              ", tag ",
+              atl_tag,
+              ", req ",
+              &req,
+              ", bytes ",
+              bytes);
+
+    atl_status_t atl_status = comm->get_atl_comm()->recv(
+        sched->bin->get_atl_ep(), recv_buf.get_ptr(bytes), bytes, src, atl_tag, &req);
+
+    update_status(atl_status);
+}
+
+void recv_copy_entry::update() {
+    atl_status_t atl_status = comm->get_atl_comm()->check(sched->bin->get_atl_ep(), &req);
+
+    if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
+        CCL_THROW("RECV_COPY entry failed. atl_status: ", atl_status_to_str(atl_status));
+    }
+
+    if (!req.is_completed) {
+        return;
+    }
+
+    LOG_DEBUG("completed RECV in RECV_COPY entry, req=", &req, ", starting COPY");
+
+    auto comp_status = ccl_comp_copy(
+        recv_buf.get_ptr(bytes), copy_buf.get_ptr(bytes), bytes, attr.use_nontemporal);
+    CCL_ASSERT(comp_status == ccl::status::success, "bad status ", comp_status);
+
+    status = ccl_sched_entry_status_complete;
+    LOG_DEBUG("completed COPY in RECV_COPY entry");
+}
diff --git a/src/sched/entry/recv_copy_entry.hpp b/src/sched/entry/recv_copy_entry.hpp
new file mode 100644
index 000000000..4c4ed653d
--- /dev/null
+++ b/src/sched/entry/recv_copy_entry.hpp
@@ -0,0 +1,80 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "sched/entry/copy/copy_helper.hpp"
+#include "sched/entry/entry.hpp"
+
+class recv_copy_entry final : public sched_entry {
+public:
+    static constexpr const char* class_name() noexcept {
+        return "RECV_COPY";
+    }
+
+    recv_copy_entry() = delete;
+    recv_copy_entry(ccl_sched* sched,
+                    ccl_buffer recv_buf,
+                    ccl_buffer copy_buf,
+                    size_t bytes,
+                    int src,
+                    ccl_comm* comm,
+                    copy_attr attr)
+            : sched_entry(sched),
+              recv_buf(recv_buf),
+              copy_buf(copy_buf),
+              bytes(bytes),
+              src(src),
+              comm(comm),
+              attr(attr) {}
+
+    void start() override;
+    void update() override;
+
+    const char* name() const override {
+        return class_name();
+    }
+
+protected:
+    void dump_detail(std::stringstream& str) const override {
+        ccl_logger::format(str,
+                           ", recv_buf ",
+                           recv_buf,
+                           ", copy_buf ",
+                           copy_buf,
+                           ", bytes ",
+                           bytes,
+                           ", src ",
+                           src,
+                           ", atl_tag ",
+                           atl_tag,
+                           ", comm_id ",
+                           sched->get_comm_id(),
+                           ", req ",
+                           &req,
+                           "\n");
+    }
+
+private:
+    ccl_buffer recv_buf;
+    ccl_buffer copy_buf;
+    size_t bytes;
+    int src;
+    ccl_comm* comm;
+    copy_attr attr;
+
+    uint64_t atl_tag = 0;
+    atl_req_t req{};
+};
diff --git a/src/sched/entry/recv_entry.hpp b/src/sched/entry/recv_entry.hpp
index 4172f5e96..2346bc1a9 100644
--- a/src/sched/entry/recv_entry.hpp
+++ b/src/sched/entry/recv_entry.hpp
@@ -46,37 +46,33 @@ class recv_entry : public sched_entry,
         if (status == ccl_sched_entry_status_started) {
             size_t bytes = cnt * dtype.size();
             LOG_DEBUG("cancel RECV entry src ", src, ", req ", &req, ", bytes ", bytes);
-            comm->atl->atl_ep_cancel(sched->bin->get_atl_ep(), &req);
+            comm->get_atl_comm()->cancel(sched->bin->get_atl_ep(), &req);
         }
     }
 
     void start() override {
         update_fields();
 
-        int global_src = comm->get_global_rank(src);
-        atl_tag = comm->atl->tag->create(
-            global_src, sched->get_comm_id(), sched->sched_id, sched->get_op_id());
+        atl_tag = comm->get_atl_comm()->tag->create(
+            src, sched->get_comm_id(), sched->sched_id, sched->get_op_id());
         size_t bytes = cnt * dtype.size();
 
-        LOG_DEBUG(
-            "RECV entry src ", global_src, ", tag ", atl_tag, ", req ", &req, ", bytes ", bytes);
+        LOG_DEBUG("RECV entry src ", src, ", tag ", atl_tag, ", req ", &req, ", bytes ", bytes);
 
-        atl_status_t atl_status = comm->atl->atl_ep_recv(
-            sched->bin->get_atl_ep(), buf.get_ptr(bytes), bytes, global_src, atl_tag, &req);
+        atl_status_t atl_status = comm->get_atl_comm()->recv(
+            sched->bin->get_atl_ep(), buf.get_ptr(bytes), bytes, src, atl_tag, &req);
 
         update_status(atl_status);
     }
 
     void update() override {
-        int req_status;
-        atl_status_t atl_status =
-            comm->atl->atl_ep_check(sched->bin->get_atl_ep(), &req_status, &req);
+        atl_status_t atl_status = comm->get_atl_comm()->check(sched->bin->get_atl_ep(), &req);
 
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("RECV entry failed. atl_status: ", atl_status_to_str(atl_status));
         }
 
-        if (req_status) {
+        if (req.is_completed) {
             LOG_DEBUG("RECV entry done, src ", src);
             status = ccl_sched_entry_status_complete;
         }
diff --git a/src/sched/entry/recv_reduce_entry.hpp b/src/sched/entry/recv_reduce_entry.hpp
index 0239d9d32..bcad0e99c 100644
--- a/src/sched/entry/recv_reduce_entry.hpp
+++ b/src/sched/entry/recv_reduce_entry.hpp
@@ -34,36 +34,36 @@ class recv_reduce_entry final : public sched_entry {
     recv_reduce_entry(ccl_sched* sched,
                       ccl_buffer inout_buf,
                       size_t cnt,
-                      size_t* out_cnt,
                       const ccl_datatype& dtype,
                       ccl::reduction reduction_op,
                       int src,
-                      ccl_buffer comm_buf,
                       ccl_comm* comm,
+                      ccl_buffer comm_buf = ccl_buffer(),
                       ccl_recv_reduce_result_buf_type result_buf_type = ccl_recv_reduce_local_buf)
             : sched_entry(sched),
               inout_buf(inout_buf),
               in_cnt(cnt),
-              out_cnt(out_cnt),
               dtype(dtype),
               op(reduction_op),
               src(src),
-              comm_buf(comm_buf),
               comm(comm),
+              comm_buf(comm_buf),
               result_buf_type(result_buf_type),
               fn(sched->coll_attr.reduction_fn) {
-        CCL_ASSERT(op != ccl::reduction::custom || fn,
-                   "custom reduction requires user provided callback");
-
-        CCL_ASSERT(
+        CCL_THROW_IF_NOT(op != ccl::reduction::custom || fn,
+                         "custom reduction requires user provided callback",
+                         ", op ",
+                         ccl_reduction_to_str(op),
+                         ", fn ",
+                         fn);
+
+        CCL_THROW_IF_NOT(
             (result_buf_type == ccl_recv_reduce_local_buf && inout_buf.get_ptr() != nullptr) ||
                 (result_buf_type == ccl_recv_reduce_comm_buf && comm_buf.get_ptr() != nullptr),
             "result buffer should be non null");
 
-        if (comm_buf.get_ptr() == nullptr || comm_buf == inout_buf) {
-            size_t comm_buf_size = in_cnt * dtype.size();
-            this->comm_buf.set(CCL_MALLOC(comm_buf_size, "recv_reduce.comm_buf"), comm_buf_size);
-            own_comm_buff = true;
+        if ((comm_buf.get_ptr() == nullptr || comm_buf == inout_buf) && in_cnt) {
+            this->comm_buf = sched->alloc_buffer({ in_cnt * dtype.size(), inout_buf });
         }
     }
 
@@ -72,21 +72,16 @@ class recv_reduce_entry final : public sched_entry {
             size_t bytes = in_cnt * dtype.size();
             LOG_DEBUG(
                 "cancel RECV in RECV_REDUCE entry, src ", src, ", req ", &req, ", bytes", bytes);
-            comm->atl->atl_ep_cancel(sched->bin->get_atl_ep(), &req);
-        }
-
-        if (own_comm_buff) {
-            CCL_FREE(comm_buf.get_ptr());
+            comm->get_atl_comm()->cancel(sched->bin->get_atl_ep(), &req);
         }
     }
 
     void start() override {
-        int global_src = comm->get_global_rank(src);
-        atl_tag = comm->atl->tag->create(
-            global_src, sched->get_comm_id(), sched->sched_id, sched->get_op_id());
+        atl_tag = comm->get_atl_comm()->tag->create(
+            src, sched->get_comm_id(), sched->sched_id, sched->get_op_id());
         size_t bytes = in_cnt * dtype.size();
         LOG_DEBUG("starting RECV in RECV_REDUCE entry, src ",
-                  global_src,
+                  src,
                   ", tag ",
                   atl_tag,
                   ", req ",
@@ -94,48 +89,48 @@ class recv_reduce_entry final : public sched_entry {
                   ", bytes ",
                   bytes);
 
-        atl_status_t atl_status = comm->atl->atl_ep_recv(
-            sched->bin->get_atl_ep(), comm_buf.get_ptr(bytes), bytes, global_src, atl_tag, &req);
+        atl_status_t atl_status = comm->get_atl_comm()->recv(
+            sched->bin->get_atl_ep(), comm_buf.get_ptr(bytes), bytes, src, atl_tag, &req);
 
         update_status(atl_status);
     }
 
     void update() override {
-        int req_status;
-        atl_status_t atl_status =
-            comm->atl->atl_ep_check(sched->bin->get_atl_ep(), &req_status, &req);
+        atl_status_t atl_status = comm->get_atl_comm()->check(sched->bin->get_atl_ep(), &req);
 
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("RECV_REDUCE entry failed. atl_status: ", atl_status_to_str(atl_status));
         }
 
-        if (req_status) {
-            LOG_DEBUG("completed RECV in RECV_REDUCE entry, req=", &req, ", starting REDUCE");
-            size_t bytes = in_cnt * dtype.size();
-            size_t offset = inout_buf.get_offset();
+        if (!req.is_completed) {
+            return;
+        }
+
+        LOG_DEBUG("completed RECV in RECV_REDUCE entry, req=", &req, ", starting REDUCE");
+        size_t bytes = in_cnt * dtype.size();
+        size_t offset = inout_buf.get_offset();
 
-            const ccl::fn_context context = { sched->coll_attr.match_id.c_str(), offset };
+        const ccl::fn_context context = { sched->coll_attr.match_id.c_str(), offset };
 
-            ccl_buffer reduce_in_buf =
-                (result_buf_type == ccl_recv_reduce_local_buf) ? comm_buf : inout_buf;
+        ccl_buffer reduce_in_buf =
+            (result_buf_type == ccl_recv_reduce_local_buf) ? comm_buf : inout_buf;
 
-            ccl_buffer reduce_inout_buf =
-                (result_buf_type == ccl_recv_reduce_local_buf) ? inout_buf : comm_buf;
+        ccl_buffer reduce_inout_buf =
+            (result_buf_type == ccl_recv_reduce_local_buf) ? inout_buf : comm_buf;
 
-            ccl::status comp_status = ccl_comp_reduce(sched,
-                                                      reduce_in_buf.get_ptr(bytes),
-                                                      in_cnt,
-                                                      reduce_inout_buf.get_ptr(bytes),
-                                                      out_cnt,
-                                                      dtype,
-                                                      op,
-                                                      fn,
-                                                      &context);
+        ccl::status comp_status = ccl_comp_reduce(sched,
+                                                  reduce_in_buf.get_ptr(bytes),
+                                                  in_cnt,
+                                                  reduce_inout_buf.get_ptr(bytes),
+                                                  nullptr, /* out_count */
+                                                  dtype,
+                                                  op,
+                                                  fn,
+                                                  &context);
 
-            CCL_ASSERT(comp_status == ccl::status::success, "bad status ", comp_status);
-            status = ccl_sched_entry_status_complete;
-            LOG_DEBUG("completed REDUCE in RECV_REDUCE entry");
-        }
+        CCL_ASSERT(comp_status == ccl::status::success, "bad status ", comp_status);
+        status = ccl_sched_entry_status_complete;
+        LOG_DEBUG("completed REDUCE in RECV_REDUCE entry");
     }
 
     const char* name() const override {
@@ -151,20 +146,18 @@ class recv_reduce_entry final : public sched_entry {
                            inout_buf,
                            ", in_cnt ",
                            in_cnt,
-                           ", out_cnt ",
-                           out_cnt,
                            ", op ",
                            ccl_reduction_to_str(op),
                            ", red_fn  ",
                            fn,
                            ", src ",
                            src,
-                           ", comm_buf ",
-                           comm_buf,
                            ", atl_tag ",
                            atl_tag,
                            ", comm_id ",
                            sched->get_comm_id(),
+                           ", comm_buf ",
+                           comm_buf,
                            ", result_buf_type ",
                            result_buf_type,
                            ", req ",
@@ -175,13 +168,11 @@ class recv_reduce_entry final : public sched_entry {
 private:
     ccl_buffer inout_buf;
     size_t in_cnt;
-    size_t* out_cnt;
     ccl_datatype dtype;
     ccl::reduction op;
     int src;
-    ccl_buffer comm_buf;
     ccl_comm* comm;
-    bool own_comm_buff = false;
+    ccl_buffer comm_buf;
     ccl_recv_reduce_result_buf_type result_buf_type;
     uint64_t atl_tag = 0;
     ccl::reduction_fn fn;
diff --git a/src/sched/entry/reduce_local_entry.cpp b/src/sched/entry/reduce_local_entry.cpp
index b5f52d77a..658138b4f 100644
--- a/src/sched/entry/reduce_local_entry.cpp
+++ b/src/sched/entry/reduce_local_entry.cpp
@@ -13,89 +13,35 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "sched/entry/reduce_local_entry.hpp"
-
-#include "common/comm/l0/modules/kernel_utils.hpp"
+#include "comp/comp.hpp"
 #include "common/datatype/datatype.hpp"
 #include "common/stream/stream.hpp"
-#include "common/utils/sycl_utils.hpp"
-#include "sched/entry/gpu/ze_primitives.hpp"
-#include "sched/entry/gpu/ze_cache.hpp"
+#include "sched/entry/reduce_local_entry.hpp"
 #include "sched/queue/queue.hpp"
 
-#include <string>
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+#include "common/utils/sycl_utils.hpp"
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 
 using namespace ccl;
-using namespace ccl::ze;
-
-void reduce_local_entry::init() {
-    if (ze_base_entry::is_initialized) {
-        return;
-    }
-
-    LOG_DEBUG("initialization");
-
-    ze_base_entry::init(init_mode::compute);
-
-    ccl::global_data::get().ze_cache->get(context, device, "kernels.spv", &module);
-
-    kernel_name =
-        "reduce_local_inplace_kernel_" + to_string(dtype.idx()) + "_" + ccl_reduction_to_str(op);
-    ccl::global_data::get().ze_cache->get(worker_idx, module, kernel_name, &kernel);
-    LOG_DEBUG("get kernel: name: ", kernel_name);
-
-    ze_group_size_t group_size;
-    get_suggested_group_size(kernel, in_cnt, &group_size);
-    LOG_DEBUG("suggested group size: ", to_string(group_size));
-
-    get_suggested_group_count(group_size, in_cnt, &group_count);
-    LOG_DEBUG("suggested group count: ", to_string(group_count));
-
-    ZE_CALL(zeKernelSetGroupSize,
-            (kernel, group_size.groupSizeX, group_size.groupSizeY, group_size.groupSizeZ));
-
-    size_t bytes = in_cnt * dtype.size();
-    in_buf_ptr = in_buf.get_ptr(bytes);
-    inout_buf_ptr = inout_buf.get_ptr(bytes);
-    ze_kernel_args_t kernel_args = { { sizeof(in_cnt), &in_cnt },
-                                     { sizeof(in_buf_ptr), &in_buf_ptr },
-                                     { sizeof(inout_buf_ptr), &inout_buf_ptr } };
 
-    LOG_DEBUG("kernel ", kernel, " args:\n", to_string(kernel_args));
-    set_kernel_args(kernel, kernel_args);
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
 
-    ZE_CALL(zeCommandListAppendLaunchKernel,
-            (ze_base_entry::comp_primitives.list,
-             kernel,
-             &group_count,
-             ze_base_entry::entry_event,
-             0,
-             nullptr));
-    ZE_CALL(zeCommandListClose, (ze_base_entry::comp_primitives.list));
-
-    LOG_DEBUG("initialization complete");
-}
-
-void reduce_local_entry::update() {
-    CCL_THROW_IF_NOT(use_device);
-
-    ze_base_entry::update();
-    if (status == ccl_sched_entry_status_complete && !sched->coll_attr.to_cache) {
-        finalize();
-    }
-}
+using namespace ccl::ze;
 
 void reduce_local_entry::check_use_device() {
     use_device = false;
-    ccl_stream* stream = (ccl_stream*)sched->coll_param.stream;
-    if (fn || !stream)
+    ccl_stream* stream = sched->coll_param.stream;
+    if (fn || !stream) {
         return;
+    }
 
     size_t bytes = in_cnt * dtype.size();
-    sycl::queue* q = stream->get_native_stream(sched->queue->get_idx());
-    CCL_THROW_IF_NOT(q, "null sycl queue");
-    auto in_ptr_type = sycl::get_pointer_type(in_buf.get_ptr(bytes), q->get_context());
-    auto inout_ptr_type = sycl::get_pointer_type(inout_buf.get_ptr(bytes), q->get_context());
+    auto sycl_stream = stream->get_native_stream(worker_idx);
+    CCL_THROW_IF_NOT(sycl_stream, "null sycl queue");
+    auto in_ptr_type = sycl::get_pointer_type(in_buf.get_ptr(bytes), sycl_stream->get_context());
+    auto inout_ptr_type =
+        sycl::get_pointer_type(inout_buf.get_ptr(bytes), sycl_stream->get_context());
 
     LOG_DEBUG("in_ptr_type: ",
               ccl::utils::usm_type_to_str(in_ptr_type),
@@ -112,23 +58,65 @@ void reduce_local_entry::check_use_device() {
 }
 
 void reduce_local_entry::start_on_device() {
-    init();
-
-    ze_base_entry::start();
-    status = ccl_sched_entry_status_started;
+    ze_reduce_local_entry::start();
 }
 
-void reduce_local_entry::finalize() {
-    if (!ze_base_entry::is_initialized) {
-        return;
-    }
-
-    LOG_DEBUG("finalization");
-
-    // kernel cache
-    ccl::global_data::get().ze_cache->push(worker_idx, module, kernel_name, kernel);
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+
+reduce_local_entry::reduce_local_entry(ccl_sched* sched,
+                                       const ccl_buffer in_buf,
+                                       size_t in_cnt,
+                                       ccl_buffer inout_buf,
+                                       size_t* out_cnt,
+                                       const ccl_datatype& dtype,
+                                       ccl::reduction op)
+        :
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+          ze_reduce_local_entry(sched, in_buf, in_cnt, inout_buf, out_cnt, dtype, op),
+#else // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+          sched_entry(sched),
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+          in_buf(in_buf),
+          in_cnt(in_cnt),
+          inout_buf(inout_buf),
+          out_cnt(out_cnt),
+          dtype(dtype),
+          op(op),
+          fn(sched->coll_attr.reduction_fn) {
+    CCL_THROW_IF_NOT(op != ccl::reduction::custom || fn,
+                     "custom reduction requires user provided callback",
+                     ", op ",
+                     ccl_reduction_to_str(op),
+                     ", fn ",
+                     fn);
+}
 
-    ze_base_entry::finalize();
+void reduce_local_entry::start_on_host() {
+    size_t bytes = in_cnt * dtype.size();
+    size_t offset = inout_buf.get_offset();
+    const fn_context context = { sched->coll_attr.match_id.c_str(), offset };
+    ccl::status comp_status = ccl_comp_reduce(sched,
+                                              in_buf.get_ptr(bytes),
+                                              in_cnt,
+                                              inout_buf.get_ptr(bytes),
+                                              const_cast<size_t*>(out_cnt),
+                                              dtype,
+                                              op,
+                                              fn,
+                                              &context);
+    CCL_ASSERT(comp_status == ccl::status::success, "bad status ", comp_status);
+
+    status = ccl_sched_entry_status_complete;
+}
 
-    LOG_DEBUG("finalization complete");
+void reduce_local_entry::start() {
+    check_use_device();
+    if (use_device) {
+        LOG_DEBUG("start on device");
+        start_on_device();
+    }
+    else {
+        LOG_DEBUG("start on host");
+        start_on_host();
+    }
 }
diff --git a/src/sched/entry/reduce_local_entry.hpp b/src/sched/entry/reduce_local_entry.hpp
index 2a6686296..30d4012a2 100644
--- a/src/sched/entry/reduce_local_entry.hpp
+++ b/src/sched/entry/reduce_local_entry.hpp
@@ -14,96 +14,47 @@
  limitations under the License.
 */
 #pragma once
+
 #include "common/global/global.hpp"
-#include "comp/comp.hpp"
 #include "sched/entry/entry.hpp"
 
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
-#include "sched/entry/gpu/ze_base_entry.hpp"
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+#include "sched/entry/ze/ze_reduce_local_entry.hpp"
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
-class reduce_local_entry : public ze_base_entry {
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+class reduce_local_entry : public ze_reduce_local_entry {
 #else
 class reduce_local_entry : public sched_entry {
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 public:
     static constexpr const char* class_name() noexcept {
         return "REDUCE_LOCAL";
     }
 
-    reduce_local_entry() = delete;
-    reduce_local_entry(ccl_sched* sched,
-                       const ccl_buffer in_buf,
-                       size_t in_cnt,
-                       ccl_buffer inout_buf,
-                       size_t* out_cnt,
-                       const ccl_datatype& dtype,
-                       ccl::reduction reduction_op)
-            :
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
-              ze_base_entry(sched),
-#else // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
-              sched_entry(sched),
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
-              in_buf(in_buf),
-              in_cnt(in_cnt),
-              inout_buf(inout_buf),
-              out_cnt(out_cnt),
-              dtype(dtype),
-              op(reduction_op),
-              fn(sched->coll_attr.reduction_fn),
-              use_device(false) {
-        CCL_THROW_IF_NOT(op != ccl::reduction::custom || fn,
-                         "custom reduction requires user provided callback");
+    const char* name() const noexcept override {
+        return class_name();
     }
 
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
-    ~reduce_local_entry() override {
-        finalize();
-    }
-    void init();
-    void finalize();
-    void update() override;
+    reduce_local_entry() = delete;
+    explicit reduce_local_entry(ccl_sched* sched,
+                                const ccl_buffer in_buf,
+                                size_t in_cnt,
+                                ccl_buffer inout_buf,
+                                size_t* out_cnt,
+                                const ccl_datatype& dtype,
+                                ccl::reduction op);
+
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
     void check_use_device();
     void start_on_device();
-#else // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#else // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
     void check_use_device() {}
     void start_on_device() {}
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
-    void start_on_host() {
-        size_t bytes = in_cnt * dtype.size();
-        size_t offset = inout_buf.get_offset();
-        const ccl::fn_context context = { sched->coll_attr.match_id.c_str(), offset };
-        ccl::status comp_status = ccl_comp_reduce(sched,
-                                                  in_buf.get_ptr(bytes),
-                                                  in_cnt,
-                                                  inout_buf.get_ptr(bytes),
-                                                  out_cnt,
-                                                  dtype,
-                                                  op,
-                                                  fn,
-                                                  &context);
-        CCL_ASSERT(comp_status == ccl::status::success, "bad status ", comp_status);
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 
-        status = ccl_sched_entry_status_complete;
-    }
-
-    void start() override {
-        check_use_device();
-        if (use_device) {
-            LOG_DEBUG("start on device");
-            start_on_device();
-        }
-        else {
-            LOG_DEBUG("start on host");
-            start_on_host();
-        }
-    }
-
-    const char* name() const noexcept override {
-        return class_name();
-    }
+    void start_on_host();
+    void start() override;
 
 protected:
     void dump_detail(std::stringstream& str) const override {
@@ -126,22 +77,13 @@ class reduce_local_entry : public sched_entry {
     }
 
 private:
-    ccl_buffer in_buf;
-    size_t in_cnt;
-    ccl_buffer inout_buf;
-    size_t* out_cnt;
-    ccl_datatype dtype;
-    ccl::reduction op;
-    ccl::reduction_fn fn;
-    void* in_buf_ptr;
-    void* inout_buf_ptr;
-
-    bool use_device;
+    const ccl_buffer in_buf; // input operand of the reduction
+    const size_t in_cnt; // number of dtype elements taken from in_buf
+    const ccl_buffer inout_buf; // passed to ccl_comp_reduce as the in/out operand
+    size_t* const out_cnt; // fixed pointer, mutable pointee: handed to ccl_comp_reduce as size_t*
+    const ccl_datatype dtype; // element type of both buffers
+    const ccl::reduction op; // reduction kind; custom requires fn to be set
+    const ccl::reduction_fn fn; // copied from sched->coll_attr.reduction_fn at construction
 
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
-    ze_module_handle_t module;
-    ze_kernel_handle_t kernel;
-    std::string kernel_name;
-    ze_group_count_t group_count;
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+    bool use_device{};
 };
diff --git a/src/sched/entry/register_entry.hpp b/src/sched/entry/register_entry.hpp
index 431cefdfc..b82865d2a 100644
--- a/src/sched/entry/register_entry.hpp
+++ b/src/sched/entry/register_entry.hpp
@@ -42,7 +42,7 @@ class register_entry : public sched_entry {
         CCL_THROW_IF_NOT(
             size > 0 && ptr && mr, "incorrect input, size ", size, ", ptr ", ptr, " mr ", mr);
 
-        atl_status_t atl_status = comm->atl->atl_mr_reg(ptr.get_ptr(size), size, mr);
+        atl_status_t atl_status = comm->get_atl_comm()->mr_reg(ptr.get_ptr(size), size, mr);
 
         sched->add_memory_region(*mr);
 
diff --git a/src/sched/entry/send_entry.hpp b/src/sched/entry/send_entry.hpp
index 5e3ddd6c2..c0d93bd26 100644
--- a/src/sched/entry/send_entry.hpp
+++ b/src/sched/entry/send_entry.hpp
@@ -44,18 +44,14 @@ class send_entry : public sched_entry,
               comm(comm) {}
 
     void start_send() {
-        int global_dst = comm->get_global_rank(dst);
-        int global_rank = comm->get_global_rank(comm->rank());
-
-        atl_tag = comm->atl->tag->create(
-            global_rank, sched->get_comm_id(), sched->sched_id, sched->get_op_id());
+        atl_tag = comm->get_atl_comm()->tag->create( // tag inputs: own rank, comm id, sched id, op id
+            comm->rank(), sched->get_comm_id(), sched->sched_id, sched->get_op_id());
         size_t bytes = cnt * dtype.size();
 
-        LOG_DEBUG(
-            "SEND entry dst ", global_dst, ", tag ", atl_tag, ", req ", &req, ", bytes ", bytes);
+        LOG_DEBUG("SEND entry dst ", dst, ", tag ", atl_tag, ", req ", &req, ", bytes ", bytes);
 
-        atl_status_t atl_status = comm->atl->atl_ep_send(
-            sched->bin->get_atl_ep(), send_buf.get_ptr(bytes), bytes, dst, atl_tag, &req);
+        atl_status_t atl_status = comm->get_atl_comm()->send( // completion is later read from req
+            sched->bin->get_atl_ep(), send_buf.get_ptr(bytes), bytes, dst, atl_tag, &req);
 
         update_status(atl_status);
     }
@@ -79,7 +75,7 @@ class send_entry : public sched_entry,
             (ccl::global_data::env().atl_send_proxy != ccl_atl_send_proxy_none) &&
             (proxy_mode == proxy_copy_mode::unknown)) {
             sycl::usm::alloc ptr_type = sycl::usm::alloc::unknown;
-            if (sched->coll_param.stream->get_type() == stream_type::gpu) {
+            if (sched->coll_param.stream->is_gpu()) {
                 auto sycl_queue = sched->coll_param.stream->get_native_stream();
                 ptr_type = sycl::get_pointer_type(buf.get_ptr(), sycl_queue.get_context());
             }
@@ -89,12 +85,15 @@ class send_entry : public sched_entry,
 
         if (proxy_mode == proxy_copy_mode::enabled) {
             if (!proxy_buf) {
-                ccl_sched_buf_type buf_type =
+                ccl::buffer_type buf_type =
                     (ccl::global_data::env().atl_send_proxy == ccl_atl_send_proxy_regular)
-                        ? ccl_sched_buf_system
-                        : ccl_sched_buf_runtime;
-                send_buf = proxy_buf = sched->alloc_buffer(cnt * dtype.size(), buf_type);
+                        ? ccl::buffer_type::regular
+                        : ccl::buffer_type::sycl;
+                ccl::alloc_param alloc_param(
+                    cnt * dtype.size(), buf_type, ccl::buffer_place::host, 1);
+                proxy_buf = sched->alloc_buffer(alloc_param);
             }
+
             if (!proxy_copy_entry) {
                 proxy_copy_entry =
                     std::shared_ptr<copy_entry>(new copy_entry(sched, buf, proxy_buf, cnt, dtype));
@@ -106,6 +105,8 @@ class send_entry : public sched_entry,
                 status = ccl_sched_entry_status_again;
                 return;
             }
+
+            send_buf = proxy_buf;
         }
 #endif // CCL_ENABLE_SYCL
 
@@ -113,16 +114,13 @@ class send_entry : public sched_entry,
     }
 
     void update() override {
-        int req_status;
-
-        atl_status_t atl_status =
-            comm->atl->atl_ep_check(sched->bin->get_atl_ep(), &req_status, &req);
+        atl_status_t atl_status = comm->get_atl_comm()->check(sched->bin->get_atl_ep(), &req);
 
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("SEND entry failed. atl_status: ", atl_status_to_str(atl_status));
         }
 
-        if (req_status) {
+        if (req.is_completed) {
             LOG_DEBUG("SEND entry done, dst ", dst);
             status = ccl_sched_entry_status_complete;
         }
diff --git a/src/sched/entry/subsched_entry.hpp b/src/sched/entry/subsched_entry.hpp
index bcfee7697..47cf9a7b6 100644
--- a/src/sched/entry/subsched_entry.hpp
+++ b/src/sched/entry/subsched_entry.hpp
@@ -38,20 +38,36 @@ class subsched_entry : public sched_entry {
             LOG_DEBUG("subsched name: ", subsched_name);
         }
 
-        subsched.reset(new ccl_extra_sched(sched->coll_param, sched->sched_id));
-        subsched->coll_param.ctype = ccl_coll_internal;
+        subsched.reset(new ccl_extra_sched({ sched->sched_id, sched->coll_param }));
+
+        inherit_params(subsched.get(), sched, sched->coll_param.ctype);
+
+        subsched->coll_param.ctype = ccl_coll_undefined;
         subsched->set_op_id(this->op_id);
-        subsched->flow_control.set_max_credits(sched->flow_control.get_max_credits());
 
-        if (sched->coll_param.ctype == ccl_coll_allreduce ||
-            sched->coll_param.ctype == ccl_coll_reduce ||
-            sched->coll_param.ctype == ccl_coll_reduce_scatter) {
-            subsched->coll_attr.reduction_fn = sched->coll_attr.reduction_fn;
+        fill_fn(subsched.get());
+    }
+
+    static void inherit_params(ccl_sched* sched, // schedule that receives the attributes
+                               const ccl_sched* parent_sched, // schedule the attributes are read from
+                               ccl_coll_type ctype) { // selects the reduction-only attributes below
+        if (sched == parent_sched) { // same schedule: nothing to copy
+            return;
+        }
+
+        if (ctype == ccl_coll_allreduce || ctype == ccl_coll_reduce ||
+            ctype == ccl_coll_reduce_scatter) { // reduction collectives only
+            sched->coll_attr.reduction_fn = parent_sched->coll_attr.reduction_fn;
             /* required to create ccl_fn_context in reduce/recv_reduce entries */
-            subsched->coll_attr.match_id = sched->coll_attr.match_id;
+            sched->coll_attr.match_id = parent_sched->coll_attr.match_id;
         }
+        sched->coll_attr.to_cache = parent_sched->coll_attr.to_cache; // copied for every ctype
 
-        fill_fn(subsched.get());
+#ifdef CCL_ENABLE_SYCL
+        sched->coll_attr.is_sycl_buf = parent_sched->coll_attr.is_sycl_buf;
+#endif // CCL_ENABLE_SYCL
+
+        sched->flow_control.set_max_credits(parent_sched->flow_control.get_max_credits());
     }
 
     ~subsched_entry() {
diff --git a/src/sched/entry/write_entry.hpp b/src/sched/entry/write_entry.hpp
index 443e32350..3a039509a 100644
--- a/src/sched/entry/write_entry.hpp
+++ b/src/sched/entry/write_entry.hpp
@@ -50,7 +50,7 @@ class write_entry : public sched_entry,
     ~write_entry() {
         if (status == ccl_sched_entry_status_started) {
             LOG_DEBUG("cancel WRITE entry dst ", dst, ", req ", &req);
-            comm->atl->atl_ep_cancel(sched->bin->get_atl_ep(), &req);
+            comm->get_atl_comm()->cancel(sched->bin->get_atl_ep(), &req); // entry still in started state
         }
     }
 
@@ -66,30 +66,26 @@ class write_entry : public sched_entry,
             return;
         }
 
-        int global_dst = comm->get_global_rank(dst);
-
         size_t bytes = cnt * dtype.size();
-        atl_status_t atl_status = comm->atl->atl_ep_write(sched->bin->get_atl_ep(),
-                                                          src_buf.get_ptr(bytes),
-                                                          bytes,
-                                                          src_mr,
-                                                          (uint64_t)dst_mr->buf + dst_buf_off,
-                                                          dst_mr->remote_key,
-                                                          global_dst,
-                                                          &req);
+        atl_status_t atl_status = comm->get_atl_comm()->write(sched->bin->get_atl_ep(),
+                                                              src_buf.get_ptr(bytes),
+                                                              bytes,
+                                                              src_mr,
+                                                              (uint64_t)dst_mr->buf + dst_buf_off,
+                                                              dst_mr->remote_key,
+                                                              dst,
+                                                              &req);
         update_status(atl_status);
     }
 
     void update() override {
-        int req_status;
-        atl_status_t atl_status =
-            comm->atl->atl_ep_check(sched->bin->get_atl_ep(), &req_status, &req);
+        atl_status_t atl_status = comm->get_atl_comm()->check(sched->bin->get_atl_ep(), &req);
 
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("WRITE entry failed. atl_status: ", atl_status_to_str(atl_status));
         }
 
-        if (req_status) {
+        if (req.is_completed) {
             LOG_DEBUG("WRITE entry done, dst ", dst);
             status = ccl_sched_entry_status_complete;
         }
diff --git a/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.cpp b/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.cpp
new file mode 100644
index 000000000..15f2631f5
--- /dev/null
+++ b/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.cpp
@@ -0,0 +1,176 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "common/stream/stream.hpp"
+#include "sched/entry/ze/allreduce/ze_a2a_allreduce_entry.hpp"
+#include "sched/entry/ze/ze_a2a_allgatherv_entry.hpp"
+#include "sched/entry/ze/ze_a2a_reduce_scatter_entry.hpp"
+#include "sched/entry/ze/ze_cache.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
+#include "sched/queue/queue.hpp"
+
+#include <algorithm>
+#include <string>
+
+using namespace ccl;
+using namespace ccl::ze;
+
+ze_a2a_allreduce_entry::ze_a2a_allreduce_entry(ccl_sched* sched,
+                                               ccl_buffer send_buf,
+                                               ccl_buffer recv_buf,
+                                               size_t cnt,
+                                               const ccl_datatype& dtype,
+                                               reduction op,
+                                               ccl_comm* comm,
+                                               std::vector<ze_event_handle_t> wait_events,
+                                               size_t send_buf_idx,
+                                               size_t recv_buf_idx)
+        : ze_base_entry(sched,
+                        (init_mode::compute | init_mode::copy), // both kinds of lists are used
+                        comm,
+                        comm->size() * event_group_count, // event_group_count events per rank
+                        wait_events),
+          send_buf(send_buf),
+          recv_buf(recv_buf),
+          cnt(cnt),
+          dtype(dtype),
+          op(op),
+          send_buf_idx(send_buf_idx),
+          recv_buf_idx(recv_buf_idx),
+          peer_count(comm->size() - 1) { // every rank of comm except this one
+    const size_t elems_per_rank = cnt / comm->size();
+    // with cnt < comm size only ranks below cnt have an element to process
+    const bool has_data = (elems_per_rank > 0) || (static_cast<size_t>(comm->rank()) < cnt);
+    skip_entry = !has_data || ((comm->size() == 1) && (send_buf == recv_buf));
+    if (skip_entry) {
+        // skip entry init and finalize
+        sched->get_memory().ze_entries.pop_back();
+    }
+}
+
+void ze_a2a_allreduce_entry::init_ze_hook() { // fills the lists: reduce-scatter, then allgatherv
+    /* get peer buffers: one send and one recv IPC buffer per peer */
+    std::vector<ccl_buffer> peer_send_bufs(peer_count);
+    std::vector<ccl_buffer> peer_recv_bufs(peer_count);
+
+    for (int i = 0; i < peer_count; ++i) {
+        int peer_rank = (comm_rank + i + 1) % comm->size(); // cyclic order, next rank first
+        sched->get_memory().handle_manager.get(peer_rank, send_buf_idx, peer_send_bufs[i], comm);
+        CCL_THROW_IF_NOT(peer_send_bufs[i].get_ptr(), "null IPC buffer is received");
+        sched->get_memory().handle_manager.get(peer_rank, recv_buf_idx, peer_recv_bufs[i], comm);
+        CCL_THROW_IF_NOT(peer_recv_bufs[i].get_ptr(), "null IPC buffer is received");
+    }
+
+    size_t main_block_count = cnt / comm_size; // elements per rank (integer division)
+    if (main_block_count == 0 && static_cast<size_t>(comm_rank) < cnt) {
+        main_block_count = 1; // cnt < comm_size: ranks below cnt take a single element
+    }
+
+    size_t block_count = main_block_count;
+    if (comm_rank == comm_size - 1) {
+        block_count += cnt - main_block_count * comm_size; // last rank also takes the remainder
+    }
+
+    CCL_THROW_IF_NOT(main_block_count > 0, "wrong segment count");
+
+    /* alloc temp buffer: block_count elements per peer, ze buffer placed on the device */
+    size_t tmp_buf_bytes = peer_count * block_count * dtype.size();
+    ccl::alloc_param alloc_param(tmp_buf_bytes, buffer_type::ze, buffer_place::device);
+    void* tmp_buf = sched->alloc_buffer(alloc_param).get_ptr();
+
+    LOG_DEBUG("rank ",
+              comm_rank,
+              ", main_block_count: ",
+              main_block_count,
+              ", block_count: ",
+              block_count,
+              ", tmp buf size: ",
+              tmp_buf_bytes,
+              ", cnt: ",
+              cnt);
+
+    /* copy peer segments to temp buffer */
+    size_t main_block_bytes = main_block_count * dtype.size();
+    size_t block_bytes = block_count * dtype.size();
+
+    pre_copy_events.resize(peer_count); // one event per peer
+    for (auto& event : pre_copy_events) {
+        event = ze_base_entry::create_event();
+    }
+
+    kernel_events.resize(peer_count); // one event per peer
+    for (auto& event : kernel_events) {
+        event = ze_base_entry::create_event();
+    }
+
+    barrier_event = ze_base_entry::create_event();
+
+    ze_a2a_reduce_scatter_entry::fill_list(ze_base_entry::get_copy_list(),
+                                           ze_base_entry::get_comp_list(),
+                                           send_buf.get_ptr(),
+                                           tmp_buf,
+                                           peer_send_bufs,
+                                           peer_count,
+                                           comm_rank,
+                                           block_count,
+                                           comm_rank * main_block_bytes, // rank * main block size, in bytes
+                                           pre_copy_events,
+                                           kernels,
+                                           kernel_events,
+                                           barrier_event,
+                                           dtype,
+                                           module,
+                                           device,
+                                           context,
+                                           op,
+                                           worker_idx);
+
+    post_copy_events.resize(comm_size); // one event per rank of the communicator
+    for (auto& event : post_copy_events) {
+        event = ze_base_entry::create_event();
+    }
+    // NOTE(review): kernel_events is empty when peer_count == 0, so back() below would be UB - confirm unreachable
+    ze_a2a_allgatherv_entry::fill_list(ze_base_entry::get_copy_list(),
+                                       tmp_buf,
+                                       recv_buf.get_ptr(),
+                                       peer_recv_bufs,
+                                       peer_count,
+                                       block_bytes,
+                                       comm_rank * main_block_bytes, // rank * main block size, in bytes
+                                       false,
+                                       post_copy_events,
+                                       kernel_events.back());
+}
+
+void ze_a2a_allreduce_entry::start() {
+    if (!skip_entry) {
+        ze_base_entry::start();
+        return;
+    }
+    // skipped entry: only signal the entry event and report completion
+    ZE_CALL(zeEventHostSignal, (ze_base_entry::entry_event));
+    status = ccl_sched_entry_status_complete;
+}
+
+void ze_a2a_allreduce_entry::update() {
+    for (const auto& copy_event : post_copy_events) { // events given to the allgatherv stage
+        if (ze_base_entry::is_event_completed(copy_event)) {
+            continue;
+        }
+        return; // at least one post-copy event is not completed yet
+    }
+    ZE_CALL(zeEventHostSignal, (ze_base_entry::entry_event)); // all post-copy events completed
+    ze_base_entry::update();
+}
diff --git a/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.hpp b/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.hpp
new file mode 100644
index 000000000..959977287
--- /dev/null
+++ b/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.hpp
@@ -0,0 +1,101 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "common/utils/buffer.hpp"
+#include "comp/comp.hpp"
+#include "sched/entry/ze/ze_base_entry.hpp"
+
+#include <atomic>
+#include <sstream>
+
+class ze_a2a_allreduce_entry : public ze_base_entry { // reduce-scatter stage, then allgatherv stage
+public:
+    static constexpr const char* class_name() noexcept { // fixed name of this entry type
+        return "ZE_A2A_ALLREDUCE";
+    }
+
+    const char* name() const noexcept override {
+        return class_name();
+    }
+
+    std::string name_ext() const override { // "<name> size: <cnt>"
+        std::stringstream out;
+        out << name() << " ";
+        out << "size: " << cnt;
+        return out.str();
+    }
+
+    ze_a2a_allreduce_entry() = delete;
+    explicit ze_a2a_allreduce_entry(ccl_sched* sched,
+                                    ccl_buffer send_buf,
+                                    ccl_buffer recv_buf,
+                                    size_t cnt, // number of dtype elements
+                                    const ccl_datatype& dtype,
+                                    ccl::reduction op,
+                                    ccl_comm* comm,
+                                    std::vector<ze_event_handle_t> wait_events = {}, // forwarded to ze_base_entry
+                                    size_t send_buf_idx = 0, // handle_manager index of peers' send buffers
+                                    size_t recv_buf_idx = 1); // handle_manager index of peers' recv buffers
+
+    void init_ze_hook() override; // fetches peer buffers, creates events, fills the command lists
+
+    void start() override; // a skipped entry completes here without starting the base entry
+    void update() override; // finishes once every post-copy event has completed
+
+protected:
+    void dump_detail(std::stringstream& str) const override {
+        ccl_logger::format(str,
+                           "dt ",
+                           ccl::global_data::get().dtypes->name(dtype),
+                           ", cnt ",
+                           cnt,
+                           ", send_buf ",
+                           send_buf,
+                           ", recv_buf ",
+                           recv_buf,
+                           ", op ",
+                           ccl_reduction_to_str(op),
+                           ", comm_id ",
+                           sched->get_comm_id(),
+                           ", context ",
+                           context,
+                           "\n");
+    }
+
+private:
+    static constexpr size_t event_group_count{ 3 }; // copy + kernel + copy
+
+    const ccl_buffer send_buf;
+    const ccl_buffer recv_buf;
+    const size_t cnt; // element count of the whole allreduce
+    const ccl_datatype dtype;
+    const ccl::reduction op;
+
+    const size_t send_buf_idx; // index used to look up peers' send buffers
+    const size_t recv_buf_idx; // index used to look up peers' recv buffers
+
+    const int peer_count; // comm->size() - 1
+
+    std::vector<ze_event_handle_t> pre_copy_events; // peer_count events, reduce-scatter stage
+    std::vector<ze_event_handle_t> post_copy_events; // comm_size events, allgatherv stage
+    ze_event_handle_t barrier_event{};
+
+    std::vector<ze_kernel> kernels;
+    std::vector<ze_event_handle_t> kernel_events; // peer_count events; the last one gates allgatherv
+
+    bool skip_entry{}; // set in the constructor when this rank has nothing to do
+};
diff --git a/src/sched/entry/gpu/ze_allreduce_entry.cpp b/src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.cpp
similarity index 50%
rename from src/sched/entry/gpu/ze_allreduce_entry.cpp
rename to src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.cpp
index 822547c27..3c4fcc076 100644
--- a/src/sched/entry/gpu/ze_allreduce_entry.cpp
+++ b/src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.cpp
@@ -13,11 +13,10 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "common/comm/l0/modules/kernel_utils.hpp"
 #include "common/stream/stream.hpp"
-#include "sched/entry/gpu/ze_primitives.hpp"
-#include "sched/entry/gpu/ze_cache.hpp"
-#include "sched/entry/gpu/ze_allreduce_entry.hpp"
+#include "sched/entry/ze/allreduce/ze_onesided_allreduce_entry.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
+#include "sched/entry/ze/ze_cache.hpp"
 #include "sched/queue/queue.hpp"
 
 #include <string>
@@ -25,14 +24,21 @@
 using namespace ccl;
 using namespace ccl::ze;
 
-ze_allreduce_entry::ze_allreduce_entry(ccl_sched* sched,
-                                       ccl_buffer send_buf,
-                                       ccl_buffer recv_buf,
-                                       size_t cnt,
-                                       const ccl_datatype& dtype,
-                                       reduction op,
-                                       ccl_comm* comm)
-        : ze_base_entry(sched, comm, local_events_count /* request additional events */),
+ze_onesided_allreduce_entry::ze_onesided_allreduce_entry(ccl_sched* sched,
+                                                         ccl_buffer send_buf,
+                                                         ccl_buffer recv_buf,
+                                                         size_t cnt,
+                                                         const ccl_datatype& dtype,
+                                                         reduction op,
+                                                         ccl_comm* comm,
+                                                         std::vector<ze_event_handle_t> wait_events)
+        : ze_base_entry(sched,
+                        global_data::env().enable_kernel_1s_copy_ops
+                            ? (init_mode::compute | init_mode::copy)
+                            : init_mode::compute,
+                        comm,
+                        3 /* request additional events */,
+                        wait_events),
           send_buf(send_buf),
           recv_buf(recv_buf),
           cnt(cnt),
@@ -40,27 +46,7 @@ ze_allreduce_entry::ze_allreduce_entry(ccl_sched* sched,
           op(op),
           buf_size_bytes(dtype.size() * cnt) {}
 
-ze_allreduce_entry::~ze_allreduce_entry() {
-    finalize();
-}
-
-void ze_allreduce_entry::init() {
-    if (ze_base_entry::is_initialized) {
-        return;
-    }
-
-    LOG_DEBUG("initialization");
-
-    init_mode init_mode_type;
-    if (global_data::env().enable_kernel_1s_copy_ops) {
-        init_mode_type = (init_mode::copy | init_mode::compute);
-    }
-    else {
-        init_mode_type = init_mode::compute;
-    }
-
-    ze_base_entry::init(init_mode_type);
-
+void ze_onesided_allreduce_entry::init_ze_hook() {
     /* create kernels */
     ccl_buffer right_send_buf;
     ccl_buffer right_recv_buf;
@@ -79,42 +65,29 @@ void ze_allreduce_entry::init() {
     right_send_buf_ptr = right_send_buf.get_ptr();
     right_recv_buf_ptr = right_recv_buf.get_ptr();
 
-    ze_kernel_args_t allreduce_kernel_args = { { sizeof(comm_rank), &comm_rank },
-                                               { sizeof(comm_size), &comm_size },
-                                               { sizeof(cnt), &cnt },
-                                               { sizeof(send_buf_ptr), &send_buf_ptr },
-                                               { sizeof(recv_buf_ptr), &recv_buf_ptr },
-                                               { sizeof(right_send_buf_ptr), &right_send_buf_ptr },
-                                               { sizeof(right_recv_buf_ptr),
-                                                 &right_recv_buf_ptr } };
-
-    ze_kernel_args_t reduce_local_kernel_args = { { sizeof(comm_rank), &comm_rank },
-                                                  { sizeof(comm_size), &comm_size },
-                                                  { sizeof(cnt), &cnt },
-                                                  { sizeof(send_buf_ptr), &send_buf_ptr },
-                                                  { sizeof(tmp_buf_ptr), &tmp_buf_ptr },
-                                                  { sizeof(recv_buf_ptr), &recv_buf_ptr } };
-
-    global_data::get().ze_cache->get(context, device, "kernels.spv", &module);
+    void* tmp_buf_ptr{};
 
     if (global_data::env().enable_kernel_1s_copy_ops) {
         main_kernel_name = "reduce_local_outofplace_kernel_";
-        device_mem_alloc_desc = default_device_mem_alloc_desc;
-        global_data::get().ze_cache->get(worker_idx,
-                                         context,
-                                         device,
-                                         device_mem_alloc_desc,
-                                         buf_size_bytes,
-                                         0, /*alignment*/
-                                         &tmp_buf_ptr);
+        ccl::alloc_param alloc_param(buf_size_bytes, buffer_type::ze, buffer_place::device);
+        tmp_buf_ptr = sched->alloc_buffer(alloc_param).get_ptr();
     }
     else {
         main_kernel_name = "allreduce_kernel_";
     }
+
     main_kernel_name += to_string(dtype.idx()) + "_" + ccl_reduction_to_str(op);
     LOG_DEBUG("get kernel: name: ", main_kernel_name);
+
+    global_data::get().ze_cache->get(context, device, "kernels.spv", &module);
     global_data::get().ze_cache->get(worker_idx, module, main_kernel_name, &main_kernel);
 
+    ze_kernel_args_t allreduce_kernel_args{ &comm_rank,         &comm_size,    &cnt,
+                                            &send_buf_ptr,      &recv_buf_ptr, &right_send_buf_ptr,
+                                            &right_recv_buf_ptr };
+    ze_kernel_args_t reduce_local_kernel_args{ &comm_rank,    &comm_size,   &cnt,
+                                               &send_buf_ptr, &tmp_buf_ptr, &recv_buf_ptr };
+
     auto& main_kernel_args = (global_data::env().enable_kernel_1s_copy_ops)
                                  ? reduce_local_kernel_args
                                  : allreduce_kernel_args;
@@ -139,33 +112,21 @@ void ze_allreduce_entry::init() {
         set_kernel_args(empty_kernel, allreduce_kernel_args);
     }
 
-    ze_event_desc_t event_desc = default_event_desc;
-    event_desc.signal = ZE_EVENT_SCOPE_FLAG_SUBDEVICE;
-    event_desc.wait = ZE_EVENT_SCOPE_FLAG_SUBDEVICE;
-
-    uint32_t last_event_idx = 1; // 0 is used to track entry progress
-
     if (empty_kernel) {
-        LOG_DEBUG("create event for empty kernel");
-        event_desc.index = last_event_idx++;
-        ZE_CALL(zeEventCreate, (event_pool, &event_desc, &empty_kernel_event));
+        empty_kernel_event = ze_base_entry::create_event();
     }
 
     if (global_data::env().enable_kernel_1s_copy_ops) {
-        event_desc.index = last_event_idx++;
-        ZE_CALL(zeEventCreate, (event_pool, &event_desc, &copy_from_peer_event));
-        event_desc.index = last_event_idx++;
-        ZE_CALL(zeEventCreate, (event_pool, &event_desc, &reduce_local_kernel_event));
+        copy_from_peer_event = ze_base_entry::create_event();
+        reduce_local_kernel_event = ze_base_entry::create_event();
     }
 
-    LOG_DEBUG("real event count: ", last_event_idx);
-
     /* do appends */
     if (empty_kernel) {
         LOG_DEBUG("append empty kernel");
         ze_group_count_t empty_group_count = { 1, 1, 1 };
         ZE_CALL(zeCommandListAppendLaunchKernel,
-                (ze_base_entry::comp_primitives.list,
+                (ze_base_entry::get_comp_list(),
                  empty_kernel,
                  &empty_group_count,
                  empty_kernel_event,
@@ -186,7 +147,7 @@ void ze_allreduce_entry::init() {
                  &empty_kernel_event));
 
         ZE_CALL(zeCommandListAppendLaunchKernel,
-                (ze_base_entry::comp_primitives.list,
+                (ze_base_entry::get_comp_list(),
                  main_kernel,
                  &group_count,
                  reduce_local_kernel_event,
@@ -205,28 +166,23 @@ void ze_allreduce_entry::init() {
     else {
         LOG_DEBUG("one-sided monolithic algorithm");
         ZE_CALL(zeCommandListAppendLaunchKernel,
-                (ze_base_entry::comp_primitives.list,
+                (ze_base_entry::get_comp_list(),
                  main_kernel,
                  &group_count,
                  ze_base_entry::entry_event,
                  (empty_kernel_event) ? 1 : 0,
                  &empty_kernel_event));
     }
-
-    ZE_CALL(zeCommandListClose, (ze_base_entry::comp_primitives.list));
-    if (global_data::env().enable_kernel_1s_copy_ops) {
-        ZE_CALL(zeCommandListClose, (ze_base_entry::copy_primitives.list));
-    }
-    LOG_DEBUG("initialization complete");
 }
 
-void ze_allreduce_entry::start() {
-    init();
-
-    if (ze_base_entry::is_initialized && status == ccl_sched_entry_status_not_started) {
-        reset_sync_objects();
+void ze_onesided_allreduce_entry::finalize_ze_hook() {
+    if (empty_kernel_event) {
+        global_data::get().ze_cache->push(worker_idx, module, empty_kernel_name, empty_kernel);
     }
+    global_data::get().ze_cache->push(worker_idx, module, main_kernel_name, main_kernel);
+}
 
+void ze_onesided_allreduce_entry::start() {
     size_t kernel_counter = 0;
     if (global_data::env().enable_kernel_sync) {
         kernel_counter = global_data::get().kernel_counter++;
@@ -234,7 +190,6 @@ void ze_allreduce_entry::start() {
 
     if (kernel_counter == 0) {
         ze_base_entry::start();
-        status = ccl_sched_entry_status_started;
     }
     else {
         global_data::get().kernel_counter--;
@@ -242,58 +197,10 @@ void ze_allreduce_entry::start() {
     }
 }
 
-void ze_allreduce_entry::update() {
+void ze_onesided_allreduce_entry::update() {
     ze_base_entry::update();
-    if (status == ccl_sched_entry_status_complete && !sched->coll_attr.to_cache) {
-        finalize();
-    }
 
     if (global_data::env().enable_kernel_sync && global_data::get().kernel_counter > 0) {
         global_data::get().kernel_counter--;
     }
 }
-
-void ze_allreduce_entry::finalize() {
-    if (!ze_base_entry::is_initialized) {
-        return;
-    }
-
-    LOG_DEBUG("finalization");
-
-    /* events */
-    if (global_data::env().enable_kernel_1s_copy_ops) {
-        LOG_DEBUG("copy ops finalization");
-        ZE_CALL(zeEventDestroy, (copy_from_peer_event));
-        ZE_CALL(zeEventDestroy, (reduce_local_kernel_event));
-        /* device mem */
-        global_data::get().ze_cache->push(worker_idx,
-                                          context,
-                                          device,
-                                          device_mem_alloc_desc,
-                                          buf_size_bytes,
-                                          0, /*alignment*/
-                                          tmp_buf_ptr);
-    }
-
-    /* kernels */
-    if (empty_kernel_event) {
-        ZE_CALL(zeEventDestroy, (empty_kernel_event));
-        global_data::get().ze_cache->push(worker_idx, module, empty_kernel_name, empty_kernel);
-    }
-    global_data::get().ze_cache->push(worker_idx, module, main_kernel_name, main_kernel);
-
-    ze_base_entry::finalize();
-
-    LOG_DEBUG("finalization complete");
-}
-
-void ze_allreduce_entry::reset_sync_objects() {
-    if (empty_kernel_event) {
-        ZE_CALL(zeEventHostReset, (empty_kernel_event));
-    }
-
-    if (global_data::env().enable_kernel_1s_copy_ops) {
-        ZE_CALL(zeEventHostReset, (copy_from_peer_event));
-        ZE_CALL(zeEventHostReset, (reduce_local_kernel_event));
-    }
-}
diff --git a/src/sched/entry/gpu/ze_allreduce_entry.hpp b/src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.hpp
similarity index 71%
rename from src/sched/entry/gpu/ze_allreduce_entry.hpp
rename to src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.hpp
index 6231e44c0..309e42631 100644
--- a/src/sched/entry/gpu/ze_allreduce_entry.hpp
+++ b/src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.hpp
@@ -17,41 +17,43 @@
 
 #include "common/utils/buffer.hpp"
 #include "comp/comp.hpp"
-#include "sched/entry/gpu/ze_base_entry.hpp"
+#include "sched/entry/ze/ze_base_entry.hpp"
 
 #include <atomic>
 #include <sstream>
 
-class ze_allreduce_entry : public ze_base_entry {
+class ze_onesided_allreduce_entry : public ze_base_entry {
 public:
     static constexpr const char* class_name() noexcept {
-        return "ZE_ALLREDUCE";
+        return "ZE_1S_ALLREDUCE";
     }
 
     const char* name() const noexcept override {
         return class_name();
     }
 
-    ze_allreduce_entry() = delete;
-    explicit ze_allreduce_entry(ccl_sched* sched,
-                                ccl_buffer send_buf,
-                                ccl_buffer recv_buf,
-                                size_t cnt,
-                                const ccl_datatype& dtype,
-                                ccl::reduction op,
-                                ccl_comm* comm);
-    ~ze_allreduce_entry();
+    virtual std::string name_ext() const override {
+        std::stringstream out;
+        out << name() << " ";
+        out << "size: " << cnt;
+        return out.str();
+    }
 
-    void init();
-    void start() override;
-    void update() override;
-    void finalize();
+    ze_onesided_allreduce_entry() = delete;
+    explicit ze_onesided_allreduce_entry(ccl_sched* sched,
+                                         ccl_buffer send_buf,
+                                         ccl_buffer recv_buf,
+                                         size_t cnt,
+                                         const ccl_datatype& dtype,
+                                         ccl::reduction op,
+                                         ccl_comm* comm,
+                                         std::vector<ze_event_handle_t> wait_events = {});
 
-    void reset_sync_objects();
+    void init_ze_hook() override;
+    void finalize_ze_hook() override;
 
-    bool is_strict_order_satisfied() override {
-        return (status >= ccl_sched_entry_status_complete);
-    }
+    void start() override;
+    void update() override;
 
 protected:
     void dump_detail(std::stringstream& str) const override {
@@ -74,15 +76,12 @@ class ze_allreduce_entry : public ze_base_entry {
     }
 
 private:
-    static constexpr uint32_t local_events_count{ 3 };
-
     const ccl_buffer send_buf;
     const ccl_buffer recv_buf;
     void* send_buf_ptr{};
     void* recv_buf_ptr{};
     void* right_send_buf_ptr{};
     void* right_recv_buf_ptr{};
-    void* tmp_buf_ptr{};
     const unsigned long cnt;
     const ccl_datatype dtype;
     const ccl::reduction op;
@@ -92,8 +91,6 @@ class ze_allreduce_entry : public ze_base_entry {
     ze_event_handle_t copy_from_peer_event{};
     ze_event_handle_t reduce_local_kernel_event{};
 
-    ze_module_handle_t module{};
-
     ze_group_count_t group_count{};
 
     ze_kernel_handle_t main_kernel{};
@@ -101,6 +98,4 @@ class ze_allreduce_entry : public ze_base_entry {
 
     ze_kernel_handle_t empty_kernel{};
     std::string empty_kernel_name{ "empty_kernel" };
-
-    ze_device_mem_alloc_desc_t device_mem_alloc_desc;
 };
diff --git a/src/sched/entry/ze/allreduce/ze_ring_allreduce_entry.cpp b/src/sched/entry/ze/allreduce/ze_ring_allreduce_entry.cpp
new file mode 100644
index 000000000..e7d032e65
--- /dev/null
+++ b/src/sched/entry/ze/allreduce/ze_ring_allreduce_entry.cpp
@@ -0,0 +1,562 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "common/stream/stream.hpp"
+#include "sched/entry/ze/allreduce/ze_ring_allreduce_entry.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
+#include "sched/entry/ze/ze_cache.hpp"
+#include "sched/queue/queue.hpp"
+
+#include <string>
+
+using namespace ccl;
+using namespace ccl::ze;
+
+ze_ring_allreduce_entry::ze_ring_allreduce_entry(ccl_sched* sched,
+                                                 ccl_buffer send_buf,
+                                                 ccl_buffer recv_buf,
+                                                 ccl_buffer tmp_buf,
+                                                 size_t cnt,
+                                                 const ccl_datatype& dtype,
+                                                 reduction op,
+                                                 ccl_comm* comm,
+                                                 size_t recv_buf_idx,
+                                                 size_t tmp_buf_idx)
+        : ze_base_entry(
+              sched,
+              (comm->size() == 1) ? init_mode::copy : (init_mode::compute | init_mode::copy),
+              comm,
+              (comm->size() - 1) * event_group_count),
+          send_buf(send_buf),
+          recv_buf(recv_buf),
+          tmp_buf(tmp_buf),
+          cnt(cnt),
+          dtype(dtype),
+          op(op),
+          recv_buf_idx(recv_buf_idx),
+          tmp_buf_idx(tmp_buf_idx),
+          stage_iter_count(comm->size() - 1),
+          total_iter_count(stage_iter_count * 2) {
+    // a single rank working in place has nothing to copy or reduce
+    skip_entry = (comm->size() == 1) && (send_buf == recv_buf);
+    if (!skip_entry) {
+        atl_ops_init();
+        return;
+    }
+    // skip entry init and finalize
+    sched->get_memory().ze_entries.pop_back();
+}
+
+void ze_ring_allreduce_entry::atl_ops_init() {
+    // ring neighbours: sync flags arrive from the left peer and are sent to the right peer
+    right_peer = (comm_rank + 1) % comm_size;
+    left_peer = (comm_size + comm_rank - 1) % comm_size;
+    send_tags.resize(total_iter_count);
+    recv_tags.resize(total_iter_count);
+    sync_send_flags.resize(total_iter_count, comm_rank);
+
+    // one tag pair per iteration; both are built from the rank that receives the message
+    for (int iter = 0; iter < total_iter_count; ++iter) {
+        send_tags[iter] = comm->get_atl_comm()->tag->create(
+            right_peer, sched->get_comm_id(), sched->sched_id,
+            sched->get_op_id() + iter + op_id_offset);
+        recv_tags[iter] = comm->get_atl_comm()->tag->create(
+            comm_rank, sched->get_comm_id(), sched->sched_id,
+            sched->get_op_id() + iter + op_id_offset);
+    }
+
+    LOG_DEBUG("atl_ops_init completed");
+}
+
+void ze_ring_allreduce_entry::recv_sync_flag(int idx) {
+    // posts a receive for the idx-th sync flag of the left peer; check_atl_req() polls it later
+    auto flag_ptr = &sync_recv_flags[idx];
+    auto flag_bytes = sizeof(sync_recv_flags[idx]);
+    auto from_rank = left_peer;
+    auto recv_tag = recv_tags.at(idx);
+    atl_req_t* request = &recv_reqs[idx];
+
+    // the left peer has to be another, valid rank of this communicator
+    CCL_THROW_IF_NOT((left_peer != comm_rank) && (left_peer < comm_size),
+                     "unexpected src ", from_rank,
+                     ", my rank ", comm_rank,
+                     ", left peer ", left_peer);
+
+    LOG_DEBUG("start recv: { src: ", from_rank, ", tag: ", recv_tag, ", bytes: ", flag_bytes, "}");
+    auto status = comm->get_atl_comm()->recv(
+        sched->bin->get_atl_ep(), flag_ptr, flag_bytes, from_rank, recv_tag, request);
+    CCL_THROW_IF_NOT(status == ATL_STATUS_SUCCESS, "atl status: ", atl_status_to_str(status));
+}
+
+void ze_ring_allreduce_entry::send_sync_flag(int idx) {
+    // starts sending the idx-th sync flag to the right peer; completion is polled
+    // later through check_atl_req()
+    auto flag_ptr = &sync_send_flags[idx];
+    auto flag_bytes = sizeof(sync_send_flags[idx]);
+    auto to_rank = right_peer;
+    auto send_tag = send_tags.at(idx);
+    atl_req_t* request = &send_reqs[idx];
+
+    // the right peer has to be another, valid rank of this communicator
+    CCL_THROW_IF_NOT((right_peer != comm_rank) && (right_peer < comm_size),
+                     "unexpected dst ", to_rank,
+                     ", my rank ", comm_rank,
+                     ", right peer ", right_peer);
+
+    LOG_DEBUG("start send: { dst: ", to_rank,
+              ", tag: ", send_tag,
+              ", bytes: ", flag_bytes,
+              ", value: ", sync_send_flags[idx],
+              "}");
+
+    // the payload is this rank's id (filled in by atl_ops_init()); the receiving
+    // side compares it with its own left peer
+    auto status = comm->get_atl_comm()->send(
+        sched->bin->get_atl_ep(), flag_ptr, flag_bytes, to_rank, send_tag, request);
+    CCL_THROW_IF_NOT(status == ATL_STATUS_SUCCESS, "atl status: ", atl_status_to_str(status));
+}
+
+bool ze_ring_allreduce_entry::check_atl_req(atl_req_t* req) {
+    if (req->is_completed)
+        return true; // already finished: there is nothing left to poll
+    auto status = comm->get_atl_comm()->check(sched->bin->get_atl_ep(), req);
+    CCL_THROW_IF_NOT(status == ATL_STATUS_SUCCESS, "atl status: ", atl_status_to_str(status));
+    return req->is_completed; // state as refreshed by the check() call above
+}
+
+void ze_ring_allreduce_entry::validate_sync_flags(int limit) {
+    for (int i = 0; i < total_iter_count; ++i) {
+        int value = sync_send_flags[i]; // outgoing flags are created holding this rank
+        CCL_THROW_IF_NOT(value == comm_rank);
+        value = sync_recv_flags[i];
+        if (i < limit) // only the first `limit` incoming flags must come from the left peer
+            CCL_THROW_IF_NOT(value == left_peer);
+    }
+}
+
+void ze_ring_allreduce_entry::init_ze_hook() {
+    // Records the whole ring allreduce on the copy and compute lists; update() drives it later.
+    size_t dtype_size = dtype.size();
+    bool inplace = (send_buf == recv_buf);
+    if (comm_size == 1) { // single rank: the result is a plain copy of the send buffer
+        ZE_CALL(zeCommandListAppendMemoryCopy,
+                (ze_base_entry::get_copy_list(),
+                 recv_buf.get_ptr(),
+                 send_buf.get_ptr(),
+                 cnt * dtype_size,
+                 ze_base_entry::entry_event,
+                 0,
+                 nullptr));
+        return;
+    }
+    // each *_wait event gates one appended command, each *_signal event reports its completion
+    rs_copy_signal_events.resize(stage_iter_count);
+    rs_copy_wait_events.resize(stage_iter_count);
+    rs_reduce_signal_events.resize(stage_iter_count);
+    rs_reduce_wait_events.resize(stage_iter_count);
+    ag_copy_signal_events.resize(stage_iter_count);
+    ag_copy_wait_events.resize(stage_iter_count);
+
+    global_data::get().ze_cache->get(context, device, "kernels.spv", &module);
+    std::string kernel_name =
+        "reduce_local_inplace_kernel_" + to_string(dtype.idx()) + "_" + ccl_reduction_to_str(op);
+    kernels.reserve(stage_iter_count);
+
+    for (int i = 0; i < stage_iter_count; ++i) {
+        rs_copy_signal_events[i] = ze_base_entry::create_event();
+        rs_copy_wait_events[i] = ze_base_entry::create_event();
+        rs_reduce_signal_events[i] = ze_base_entry::create_event();
+        rs_reduce_wait_events[i] = ze_base_entry::create_event();
+        ag_copy_signal_events[i] = ze_base_entry::create_event();
+        ag_copy_wait_events[i] = ze_base_entry::create_event();
+        kernels.emplace_back(module, kernel_name, worker_idx);
+    }
+
+    send_buf_ptr = send_buf.get_ptr();
+    recv_buf_ptr = recv_buf.get_ptr();
+    tmp_buf_ptr = tmp_buf.get_ptr();
+
+    ccl_buffer right_recv_buf;
+    int peer_rank = (comm_rank + 1) % comm_size;
+    sched->get_memory().handle_manager.get(peer_rank, recv_buf_idx, right_recv_buf, comm);
+    right_recv_buf_ptr = right_recv_buf.get_ptr();
+    // in-place mode stages incoming blocks in the right peer's tmp buffer, not its recv buffer
+    if (inplace) {
+        ccl_buffer right_tmp_buf;
+        sched->get_memory().handle_manager.get(peer_rank, tmp_buf_idx, right_tmp_buf, comm);
+        right_tmp_buf_ptr = right_tmp_buf.get_ptr();
+    }
+
+    // reduce_scatter stage
+    // each iteration copies one block to the right peer, then reduces the preceding block locally
+    size_t main_block_count = cnt / comm_size;
+    int block_idx = (comm_size + comm_rank - 1) % comm_size;
+    // byte offsets are kept in size_t: an int would overflow for buffers of 2 GiB and more
+    for (int i = 0; i < stage_iter_count; ++i) {
+        size_t block_count = main_block_count;
+        if (block_idx == (comm_size - 1))
+            block_count += cnt % comm_size;
+        size_t copy_offset = main_block_count * dtype_size * block_idx;
+
+        LOG_DEBUG("reduce_scatter: { my rank: ",
+                  comm->rank(),
+                  ", iter: ",
+                  i,
+                  ", copy_offset: ",
+                  copy_offset,
+                  ", block_count: ",
+                  block_count,
+                  " }");
+
+        void* src = nullptr;
+        void* dst = nullptr;
+        if (inplace) {
+            src = recv_buf_ptr;
+            dst = right_tmp_buf_ptr;
+        }
+        else {
+            src = (i == 0) ? send_buf_ptr : recv_buf_ptr;
+            dst = right_recv_buf_ptr;
+        }
+        src = (char*)src + copy_offset;
+        dst = (char*)dst + copy_offset;
+
+        ZE_CALL(zeCommandListAppendMemoryCopy,
+                (ze_base_entry::get_copy_list(),
+                 dst,
+                 src,
+                 block_count * dtype_size,
+                 rs_copy_signal_events[i],
+                 1,
+                 &rs_copy_wait_events[i]));
+
+        block_idx = (block_idx + comm_size - 1) % comm_size;
+        block_count = main_block_count;
+        if (block_idx == (comm_size - 1))
+            block_count += cnt % comm_size;
+        size_t kernel_offset = main_block_count * dtype_size * block_idx;
+
+        LOG_DEBUG("reduce_scatter: { my rank: ",
+                  comm->rank(),
+                  ", iter: ",
+                  i,
+                  ", kernel_offset: ",
+                  kernel_offset,
+                  ", block_count: ",
+                  block_count,
+                  " }");
+
+        void* input_buf = (inplace) ? tmp_buf_ptr : send_buf_ptr;
+        input_buf = (char*)input_buf + kernel_offset;
+        void* output_buf = (char*)recv_buf_ptr + kernel_offset;
+
+        kernels[i].set_args({ &block_count, &input_buf, &output_buf });
+        kernels[i].calculate_group_size(block_count);
+
+        ZE_CALL(zeCommandListAppendLaunchKernel,
+                (ze_base_entry::get_comp_list(),
+                 kernels[i].get_kernel(),
+                 kernels[i].get_group_count(),
+                 rs_reduce_signal_events[i],
+                 1,
+                 &rs_reduce_wait_events[i]));
+    }
+
+    // allgather stage
+    // the blocks are forwarded to the right peer's recv buffer with copies only
+    for (int i = 0; i < stage_iter_count; ++i) {
+        size_t block_count = main_block_count;
+        if (block_idx == (comm_size - 1))
+            block_count += cnt % comm_size;
+
+        size_t copy_offset = main_block_count * dtype_size * block_idx;
+
+        LOG_DEBUG("allgather: { my rank: ",
+                  comm->rank(),
+                  ", iter: ",
+                  i,
+                  ", copy offset: ",
+                  copy_offset,
+                  ", block_count: ",
+                  block_count,
+                  " }");
+        void* src = (char*)recv_buf_ptr + copy_offset;
+        void* dst = (char*)right_recv_buf_ptr + copy_offset;
+
+        ZE_CALL(zeCommandListAppendMemoryCopy,
+                (ze_base_entry::get_copy_list(),
+                 dst,
+                 src,
+                 block_count * dtype_size,
+                 ag_copy_signal_events[i],
+                 1,
+                 &ag_copy_wait_events[i]));
+
+        block_idx = (block_idx + comm_size - 1) % comm_size;
+    }
+}
+
+void ze_ring_allreduce_entry::finalize_ze_hook() {
+    kernels.clear(); // drops the per-iteration kernel objects emplaced by init_ze_hook()
+}
+
+void ze_ring_allreduce_entry::start() {
+    if (skip_entry) { // in-place single-rank case: signal the entry event and complete at once
+        ZE_CALL(zeEventHostSignal, (ze_base_entry::entry_event));
+        status = ccl_sched_entry_status_complete;
+        return;
+    }
+
+    reset_fields();
+    // post every sync-flag receive up front; the matching sends are issued from update()
+    for (int i = 0; i < total_iter_count; ++i) {
+        recv_sync_flag(i);
+    }
+
+    ze_base_entry::start();
+    // sanity checks: no request and no event is expected to be complete at this point
+    for (int i = 0; i < total_iter_count; ++i) {
+        CCL_THROW_IF_NOT(!send_reqs[i].is_completed);
+        CCL_THROW_IF_NOT(!recv_reqs[i].is_completed);
+    }
+
+    for (int i = 0; i < stage_iter_count; ++i) {
+        CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(rs_copy_signal_events[i]));
+        CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(rs_copy_wait_events[i]));
+        CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(rs_reduce_signal_events[i]));
+        CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(rs_reduce_wait_events[i]));
+        CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(ag_copy_signal_events[i]));
+        CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(ag_copy_wait_events[i]));
+    }
+}
+
+void ze_ring_allreduce_entry::update() { // host-side progress; returns early and resumes later
+    if (comm_size == 1) { // single rank: only the plain copy appended in init_ze_hook()
+        ze_base_entry::update();
+        return;
+    }
+
+    if (iter_idx > 0) {
+        validate_sync_flags(iter_idx - 1);
+    }
+    // reduce_scatter stage: run iterations in order, returning whenever one has to wait
+    while (!is_rs_completed && (iter_idx < stage_iter_count)) {
+        for (int i = iter_idx + 1; i < stage_iter_count; ++i) {
+            CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(rs_copy_signal_events[i]));
+            CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(rs_copy_wait_events[i]));
+            CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(rs_reduce_signal_events[i]));
+            CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(rs_reduce_wait_events[i]));
+            CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(ag_copy_signal_events[i]));
+            CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(ag_copy_wait_events[i]));
+            CCL_THROW_IF_NOT(!send_reqs[i].is_completed);
+            CCL_THROW_IF_NOT(!recv_reqs[i].is_completed);
+        }
+        // release this iteration's copy to the right peer by host-signalling its wait event
+        if (!rs_copy_started[iter_idx]) {
+            CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(rs_copy_wait_events[iter_idx]));
+            CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(rs_copy_signal_events[iter_idx]));
+
+            if (iter_idx > 0) {
+                CCL_THROW_IF_NOT(
+                    ze_base_entry::is_event_completed(rs_reduce_signal_events[iter_idx - 1]));
+                CCL_THROW_IF_NOT(
+                    ze_base_entry::is_event_completed(rs_reduce_wait_events[iter_idx - 1]));
+                CCL_THROW_IF_NOT(recv_reqs[iter_idx - 1].is_completed);
+            }
+
+            ZE_CALL(zeEventHostSignal, (rs_copy_wait_events[iter_idx]));
+            rs_copy_started[iter_idx] = true;
+
+            CCL_THROW_IF_NOT(ze_base_entry::is_event_completed(rs_copy_wait_events[iter_idx]));
+        }
+        // once the copy has finished, tell the right peer by sending the sync flag
+        if (!rs_sync_sent[iter_idx] &&
+            (ze_base_entry::is_event_completed(rs_copy_signal_events[iter_idx]))) {
+            send_sync_flag(iter_idx);
+            rs_sync_sent[iter_idx] = true;
+        }
+        // when the left peer's flag has arrived, release the reduce kernel of this iteration
+        if (!rs_reduce_started[iter_idx]) {
+            auto is_recv_completed = check_atl_req(&recv_reqs[iter_idx]);
+            if (is_recv_completed) {
+                CCL_THROW_IF_NOT(sync_recv_flags[iter_idx] == left_peer,
+                                 "iter ",
+                                 iter_idx,
+                                 ", expected ",
+                                 left_peer,
+                                 ", got ",
+                                 sync_recv_flags[iter_idx]);
+                CCL_THROW_IF_NOT(
+                    !ze_base_entry::is_event_completed(rs_reduce_wait_events[iter_idx]));
+                CCL_THROW_IF_NOT(
+                    !ze_base_entry::is_event_completed(rs_reduce_signal_events[iter_idx]));
+
+                ZE_CALL(zeEventHostSignal, (rs_reduce_wait_events[iter_idx]));
+                rs_reduce_started[iter_idx] = true;
+
+                CCL_THROW_IF_NOT(
+                    ze_base_entry::is_event_completed(rs_reduce_wait_events[iter_idx]));
+            }
+            else {
+                return;
+            }
+        }
+        // the iteration ends when the reduce is done and the flag send has completed
+        if ((ze_base_entry::is_event_completed(rs_reduce_signal_events[iter_idx])) &&
+            rs_sync_sent[iter_idx] && check_atl_req(&send_reqs[iter_idx])) {
+            LOG_DEBUG("completed reduce_scatter iter ", iter_idx);
+
+            CCL_THROW_IF_NOT(ze_base_entry::is_event_completed(rs_copy_signal_events[iter_idx]));
+            CCL_THROW_IF_NOT(ze_base_entry::is_event_completed(rs_copy_wait_events[iter_idx]));
+            CCL_THROW_IF_NOT(ze_base_entry::is_event_completed(rs_reduce_signal_events[iter_idx]));
+            CCL_THROW_IF_NOT(ze_base_entry::is_event_completed(rs_reduce_wait_events[iter_idx]));
+            CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(ag_copy_signal_events[iter_idx]));
+            CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(ag_copy_wait_events[iter_idx]));
+            CCL_THROW_IF_NOT(send_reqs[iter_idx].is_completed);
+            CCL_THROW_IF_NOT(recv_reqs[iter_idx].is_completed);
+
+            validate_sync_flags(iter_idx);
+
+            iter_idx++;
+        }
+        else {
+            return;
+        }
+    }
+    // first time past the loop: mark reduce_scatter done and restart the iteration counter
+    if (!is_rs_completed) {
+        is_rs_completed = true;
+        iter_idx = 0;
+
+        for (int i = 0; i < stage_iter_count; ++i) {
+            CCL_THROW_IF_NOT(ze_base_entry::is_event_completed(rs_copy_signal_events[i]));
+            CCL_THROW_IF_NOT(ze_base_entry::is_event_completed(rs_copy_wait_events[i]));
+            CCL_THROW_IF_NOT(ze_base_entry::is_event_completed(rs_reduce_signal_events[i]));
+            CCL_THROW_IF_NOT(ze_base_entry::is_event_completed(rs_reduce_wait_events[i]));
+            CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(ag_copy_signal_events[i]));
+            CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(ag_copy_wait_events[i]));
+            CCL_THROW_IF_NOT(send_reqs[i].is_completed);
+            CCL_THROW_IF_NOT(recv_reqs[i].is_completed);
+        }
+    }
+
+    validate_sync_flags(stage_iter_count);
+    // allgather stage: same copy / notify sequence, with flag slots offset by stage_iter_count
+    while (!is_ag_completed && (iter_idx < stage_iter_count)) {
+        for (int i = iter_idx + 1; i < stage_iter_count; ++i) {
+            CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(ag_copy_signal_events[i]));
+            CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(ag_copy_wait_events[i]));
+            CCL_THROW_IF_NOT(!send_reqs[i + stage_iter_count].is_completed);
+            CCL_THROW_IF_NOT(!recv_reqs[i + stage_iter_count].is_completed);
+        }
+
+        if (!ag_copy_started[iter_idx]) {
+            CCL_THROW_IF_NOT(!ze_base_entry::is_event_completed(ag_copy_wait_events[iter_idx]));
+            if (iter_idx > 0) {
+                CCL_THROW_IF_NOT(
+                    ze_base_entry::is_event_completed(ag_copy_signal_events[iter_idx - 1]));
+                CCL_THROW_IF_NOT(
+                    ze_base_entry::is_event_completed(ag_copy_wait_events[iter_idx - 1]));
+                CCL_THROW_IF_NOT(recv_reqs[stage_iter_count + iter_idx - 1].is_completed);
+            }
+
+            ZE_CALL(zeEventHostSignal, (ag_copy_wait_events[iter_idx]));
+            ag_copy_started[iter_idx] = true;
+
+            CCL_THROW_IF_NOT(ze_base_entry::is_event_completed(ag_copy_wait_events[iter_idx]));
+        }
+
+        if (!ag_sync_sent[iter_idx] &&
+            (ze_base_entry::is_event_completed(ag_copy_signal_events[iter_idx]))) {
+            send_sync_flag(iter_idx + stage_iter_count);
+            ag_sync_sent[iter_idx] = true;
+        }
+
+        auto is_send_completed =
+            ag_sync_sent[iter_idx] && check_atl_req(&send_reqs[iter_idx + stage_iter_count]);
+        auto is_recv_completed = check_atl_req(&recv_reqs[iter_idx + stage_iter_count]);
+        if (is_send_completed && is_recv_completed) {
+            LOG_DEBUG("completed allgatherv iter ", iter_idx);
+
+            CCL_THROW_IF_NOT(sync_recv_flags[iter_idx + stage_iter_count] == left_peer);
+            CCL_THROW_IF_NOT(ze_base_entry::is_event_completed(ag_copy_signal_events[iter_idx]));
+            CCL_THROW_IF_NOT(ze_base_entry::is_event_completed(ag_copy_wait_events[iter_idx]));
+            CCL_THROW_IF_NOT(send_reqs[iter_idx + stage_iter_count].is_completed);
+            CCL_THROW_IF_NOT(recv_reqs[iter_idx + stage_iter_count].is_completed);
+
+            validate_sync_flags(iter_idx);
+
+            ++iter_idx;
+        }
+        else {
+            return;
+        }
+    }
+
+    is_ag_completed = true;
+
+    for (int i = 0; i < stage_iter_count; ++i) {
+        CCL_THROW_IF_NOT(ze_base_entry::is_event_completed(rs_copy_signal_events[i]));
+        CCL_THROW_IF_NOT(ze_base_entry::is_event_completed(rs_copy_wait_events[i]));
+        CCL_THROW_IF_NOT(ze_base_entry::is_event_completed(rs_reduce_signal_events[i]));
+        CCL_THROW_IF_NOT(ze_base_entry::is_event_completed(rs_reduce_wait_events[i]));
+        CCL_THROW_IF_NOT(ze_base_entry::is_event_completed(ag_copy_signal_events[i]));
+        CCL_THROW_IF_NOT(ze_base_entry::is_event_completed(ag_copy_wait_events[i]));
+
+        CCL_THROW_IF_NOT(send_reqs[i].is_completed);
+        CCL_THROW_IF_NOT(recv_reqs[i].is_completed);
+        CCL_THROW_IF_NOT(send_reqs[i + stage_iter_count].is_completed);
+        CCL_THROW_IF_NOT(recv_reqs[i + stage_iter_count].is_completed);
+    }
+    validate_sync_flags(total_iter_count);
+    // all iterations are done: complete the entry event from the host
+    ZE_CALL(zeEventHostSignal, (ze_base_entry::entry_event));
+    ze_base_entry::update();
+}
+
+void ze_ring_allreduce_entry::reset_fields() {
+    // puts the per-run progress state back to its initial values
+    // (start() calls this before every execution of the entry)
+    if (comm_size == 1) {
+        // the single-rank path keeps no ring state
+        return;
+    }
+
+    // both stages start over from iteration 0
+    is_rs_completed = is_ag_completed = false;
+    iter_idx = 0;
+
+    // ATL requests: drop the old ones and start from freshly value-initialized requests
+    recv_reqs.clear();
+    recv_reqs.resize(total_iter_count);
+    send_reqs.clear();
+    send_reqs.resize(total_iter_count);
+
+    // assign() sizes a still-empty container on the first call and overwrites
+    // every element on later calls, so first use and reuse share one code path;
+    // the element counts never change between calls
+
+    // nothing has been received from the left peer yet
+    sync_recv_flags.assign(total_iter_count, ccl_comm::invalid_rank);
+
+    // no sync flag has been sent in either stage
+    rs_sync_sent.assign(stage_iter_count, false);
+    ag_sync_sent.assign(stage_iter_count, false);
+
+    // no copy or reduce has been released yet
+    rs_copy_started.assign(stage_iter_count, false);
+    rs_reduce_started.assign(stage_iter_count, false);
+    ag_copy_started.assign(stage_iter_count, false);
+}
diff --git a/src/sched/entry/ze/allreduce/ze_ring_allreduce_entry.hpp b/src/sched/entry/ze/allreduce/ze_ring_allreduce_entry.hpp
new file mode 100644
index 000000000..43a7c667c
--- /dev/null
+++ b/src/sched/entry/ze/allreduce/ze_ring_allreduce_entry.hpp
@@ -0,0 +1,145 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "common/utils/buffer.hpp"
+#include "comp/comp.hpp"
+#include "sched/entry/ze/ze_base_entry.hpp"
+
+class ze_ring_allreduce_entry : public ze_base_entry { // ze_base_entry subclass reporting itself as ZE_RING_ALLREDUCE
+public:
+    static constexpr const char* class_name() noexcept { // fixed identifier of this entry type
+        return "ZE_RING_ALLREDUCE";
+    }
+
+    const char* name() const noexcept override { // same string as class_name()
+        return class_name();
+    }
+
+    virtual std::string name_ext() const override { // name(), a space, then "size: <cnt>"
+        std::stringstream out;
+        out << name() << " ";
+        out << "size: " << cnt;
+        return out.str();
+    }
+
+    ze_ring_allreduce_entry() = delete;
+    explicit ze_ring_allreduce_entry(ccl_sched* sched,
+                                     ccl_buffer send_buf,
+                                     ccl_buffer recv_buf,
+                                     ccl_buffer tmp_buf,
+                                     size_t cnt,
+                                     const ccl_datatype& dtype,
+                                     ccl::reduction op,
+                                     ccl_comm* comm,
+                                     size_t recv_buf_idx = 1, // presumably the peer handle index of recv_buf - TODO confirm
+                                     size_t tmp_buf_idx = 0); // presumably the peer handle index of tmp_buf - TODO confirm
+
+    void init_ze_hook() override;
+    void finalize_ze_hook() override;
+
+    void start() override;
+    void update() override;
+
+    void reset_fields(); // rewinds iter_idx, the completion flags, the requests and the flag vectors
+
+protected:
+    void dump_detail(std::stringstream& str) const override { // appends dt, cnt, buffers, op, comm_id and context to str
+        ccl_logger::format(str,
+                           "dt ",
+                           ccl::global_data::get().dtypes->name(dtype),
+                           ", cnt ",
+                           cnt,
+                           ", send_buf ",
+                           send_buf,
+                           ", recv_buf ",
+                           recv_buf,
+                           ", op ",
+                           ccl_reduction_to_str(op),
+                           ", comm_id ",
+                           sched->get_comm_id(),
+                           ", context ",
+                           context,
+                           "\n");
+    }
+
+private:
+    static constexpr uint32_t local_events_count{ 3 };
+    static constexpr uint32_t op_id_offset{ 16 };
+
+    const ccl_buffer send_buf;
+    const ccl_buffer recv_buf;
+    const ccl_buffer tmp_buf;
+    void* send_buf_ptr{};
+    void* recv_buf_ptr{};
+    void* tmp_buf_ptr{};
+    void* right_recv_buf_ptr{};
+    void* right_tmp_buf_ptr{};
+    const unsigned long cnt; // printed as "size" by name_ext() and as "cnt" by dump_detail()
+    const ccl_datatype dtype;
+    const ccl::reduction op;
+    const size_t recv_buf_idx;
+    const size_t tmp_buf_idx;
+
+    int iter_idx{}; // reset_fields() rewinds it to 0
+    const int stage_iter_count; // length of the rs_*/ag_* flag vectors (see reset_fields)
+    const int total_iter_count; // length of send_reqs, recv_reqs and sync_recv_flags (see reset_fields)
+
+    int left_peer{};
+    int right_peer{};
+
+    bool is_rs_completed{}; // rs/ag presumably stand for the reduce-scatter/allgather stages - TODO confirm
+    bool is_ag_completed{};
+
+    /* atl */
+
+    std::vector<atl_req_t> recv_reqs{};
+    std::vector<atl_req_t> send_reqs{};
+
+    std::vector<uint64_t> send_tags;
+    std::vector<uint64_t> recv_tags;
+
+    std::vector<int> sync_send_flags;
+    std::vector<int> sync_recv_flags; // reset_fields() fills it with ccl_comm::invalid_rank
+
+    std::vector<bool> rs_sync_sent; // reset_fields() fills it with false
+    std::vector<bool> ag_sync_sent; // reset_fields() fills it with false
+
+    void atl_ops_init();
+    void send_sync_flag(int idx);
+    void recv_sync_flag(int idx);
+    void validate_sync_flags(int limit);
+    bool check_atl_req(atl_req_t* req);
+    void reset_atl_reqs();
+
+    /* gpu */
+
+    static constexpr size_t event_group_count =
+        6; // (rs_copy + rs_reduce + ag_copy) x (signal + wait)
+    std::vector<ze_event_handle_t> rs_copy_signal_events;
+    std::vector<ze_event_handle_t> rs_copy_wait_events;
+    std::vector<ze_event_handle_t> rs_reduce_signal_events;
+    std::vector<ze_event_handle_t> rs_reduce_wait_events;
+    std::vector<ze_event_handle_t> ag_copy_signal_events;
+    std::vector<ze_event_handle_t> ag_copy_wait_events;
+    std::vector<ze_kernel> kernels;
+
+    std::vector<bool> rs_copy_started; // reset_fields() fills it with false
+    std::vector<bool> rs_reduce_started; // reset_fields() fills it with false
+    std::vector<bool> ag_copy_started; // reset_fields() fills it with false
+
+    bool skip_entry{};
+};
diff --git a/src/sched/entry/ze/ze_a2a_allgatherv_entry.cpp b/src/sched/entry/ze/ze_a2a_allgatherv_entry.cpp
new file mode 100644
index 000000000..687940157
--- /dev/null
+++ b/src/sched/entry/ze/ze_a2a_allgatherv_entry.cpp
@@ -0,0 +1,128 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "sched/entry/ze/ze_a2a_allgatherv_entry.hpp"
+#include "sched/entry/ze/ze_cache.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
+
+#include <numeric>
+
+using namespace ccl;
+using namespace ccl::ze;
+
+ze_a2a_allgatherv_entry::ze_a2a_allgatherv_entry(ccl_sched* sched, // forwards to ze_base_entry, stores the rest
+                                                 ccl_buffer send_buf,
+                                                 size_t send_count,
+                                                 ccl_buffer recv_buf,
+                                                 const size_t* recv_counts, // must hold at least comm->size() values
+                                                 const ccl_datatype& dtype,
+                                                 ccl_comm* comm, // dereferenced below, so it must not be null
+                                                 std::vector<ze_event_handle_t> wait_events,
+                                                 size_t peer_buf_idx,
+                                                 size_t peer_buf_offset)
+        : ze_base_entry(sched,
+                        init_mode::copy,
+                        comm,
+                        comm->size() * event_group_count, // presumably the event budget of the base entry - TODO confirm
+                        wait_events),
+          send_buf(send_buf),
+          send_count(send_count),
+          recv_buf(recv_buf),
+          recv_counts(recv_counts, recv_counts + comm->size()), // copies comm->size() counts from the caller's array
+          dtype(dtype),
+          peer_buf_idx(peer_buf_idx),
+          peer_buf_offset(peer_buf_offset),
+          peer_count(comm->size() - 1) {} // communicator size minus one
+
+void ze_a2a_allgatherv_entry::fill_list(ze_command_list_handle_t list,
+                                        void* send_buf,
+                                        void* recv_buf,
+                                        const std::vector<ccl_buffer>& peer_recv_bufs,
+                                        int peer_count,
+                                        size_t copy_bytes,
+                                        size_t offset_bytes,
+                                        bool is_inplace,
+                                        std::vector<ze_event_handle_t>& copy_events,
+                                        ze_event_handle_t wait_event) {
+    const uint32_t wait_count = (wait_event) ? 1 : 0; // wait only when an event was supplied
+
+    /* copy the source block to offset_bytes in every peer buffer */
+    for (int peer_idx = 0; peer_idx < peer_count; ++peer_idx) {
+        void* src = (is_inplace) ? static_cast<void*>(static_cast<char*>(recv_buf) + offset_bytes)
+                                 : send_buf;
+        void* dst = static_cast<char*>(peer_recv_bufs[peer_idx].get_ptr()) + offset_bytes;
+        ZE_CALL(zeCommandListAppendMemoryCopy,
+                (list, dst, src, copy_bytes, copy_events.at(peer_idx), wait_count, &wait_event));
+    }
+
+    if (is_inplace) {
+        return;
+    }
+
+    /* out-of-place: also copy send_buf to recv_buf + offset_bytes */
+    void* own_dst = static_cast<char*>(recv_buf) + offset_bytes;
+    ZE_CALL(zeCommandListAppendMemoryCopy,
+            (list, own_dst, send_buf, copy_bytes, copy_events.back(), wait_count, &wait_event));
+}
+
+void ze_a2a_allgatherv_entry::init_ze_hook() {
+    /* get peer recv buffers */
+    std::vector<ccl_buffer> peer_recv_bufs(peer_count);
+
+    for (int i = 0; i < peer_count; ++i) {
+        int peer_rank = (comm_rank + i + 1) % comm->size();
+        ccl_buffer buf{};
+        sched->get_memory().handle_manager.get(peer_rank, peer_buf_idx, buf, comm);
+        CCL_THROW_IF_NOT(buf.get_ptr(), "null IPC buffer is received");
+        peer_recv_bufs[i] = buf + peer_buf_offset * dtype.size();
+    }
+
+    const bool is_inplace = (send_buf == recv_buf);
+
+    /* sum the counts of all lower ranks in size_t: an int initial value would make
+       std::accumulate add in int and truncate large element offsets */
+    size_t offset_count =
+        std::accumulate(recv_counts.begin(), recv_counts.begin() + comm_rank, size_t{ 0 });
+    size_t offset_bytes = offset_count * dtype.size();
+    size_t block_bytes =
+        (!is_inplace) ? (send_count * dtype.size()) : recv_counts[comm_rank] * dtype.size();
+    LOG_DEBUG("rank: ", comm_rank, ", block_bytes: ", block_bytes);
+
+    copy_events.resize((!is_inplace) ? comm_size : peer_count); // one event per appended copy
+    for (auto& event : copy_events) {
+        event = ze_base_entry::create_event();
+    }
+
+    fill_list(ze_base_entry::get_copy_list(),
+              send_buf.get_ptr(),
+              recv_buf.get_ptr(),
+              peer_recv_bufs,
+              peer_count,
+              block_bytes,
+              offset_bytes,
+              is_inplace,
+              copy_events);
+}
+
+void ze_a2a_allgatherv_entry::update() {
+    /* bail out while any copy event is still pending */
+    for (size_t idx = 0; idx < copy_events.size(); ++idx) {
+        if (!ze_base_entry::is_event_completed(copy_events[idx])) {
+            return;
+        }
+    }
+    ZE_CALL(zeEventHostSignal, (ze_base_entry::entry_event)); // every copy has completed
+    ze_base_entry::update();
+}
diff --git a/src/sched/entry/ze/ze_a2a_allgatherv_entry.hpp b/src/sched/entry/ze/ze_a2a_allgatherv_entry.hpp
new file mode 100644
index 000000000..e84a83c56
--- /dev/null
+++ b/src/sched/entry/ze/ze_a2a_allgatherv_entry.hpp
@@ -0,0 +1,77 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "common/utils/buffer.hpp"
+#include "sched/entry/ze/ze_base_entry.hpp"
+
+class ze_a2a_allgatherv_entry : public ze_base_entry { // ze_base_entry subclass for the a2a allgatherv copies
+public:
+    static constexpr const char* class_name() noexcept { // mirrors the class name, like the sibling a2a entries
+        return "ZE_A2A_ALLGATHERV";
+    }
+
+    const char* name() const override { // same string as class_name()
+        return class_name();
+    }
+
+    virtual std::string name_ext() const override { // name(), a space, then "send size: <send_count>"
+        std::stringstream out;
+        out << name() << " ";
+        out << "send size: " << send_count;
+        return out.str();
+    }
+
+    explicit ze_a2a_allgatherv_entry(ccl_sched* sched,
+                                     ccl_buffer send_buf,
+                                     size_t send_count,
+                                     ccl_buffer recv_buf,
+                                     const size_t* recv_counts, // comm->size() values are copied from it
+                                     const ccl_datatype& dtype,
+                                     ccl_comm* comm,
+                                     std::vector<ze_event_handle_t> wait_events = {},
+                                     size_t peer_buf_idx = 0,
+                                     size_t peer_buf_offset = 0); // multiplied by dtype.size() in init_ze_hook()
+
+    void init_ze_hook() override; // fetches the peer buffers, creates copy_events, calls fill_list()
+
+    void update() override; // signals entry_event once every event in copy_events has completed
+
+    static void fill_list(ze_command_list_handle_t list, // appends one copy per peer, plus a local copy when !is_inplace
+                          void* send_buf,
+                          void* recv_buf,
+                          const std::vector<ccl_buffer>& peer_recv_bufs,
+                          int peer_count,
+                          size_t copy_bytes,
+                          size_t offset_bytes,
+                          bool is_inplace,
+                          std::vector<ze_event_handle_t>& copy_events, // peer copies use at(i), the local copy uses back()
+                          ze_event_handle_t wait_event = nullptr); // when non-null every copy waits on it
+
+private:
+    static constexpr size_t event_group_count{ 1 }; // copy phase
+
+    const ccl_buffer send_buf;
+    const size_t send_count;
+    const ccl_buffer recv_buf;
+    const std::vector<size_t> recv_counts;
+    const ccl_datatype dtype;
+    const size_t peer_buf_idx;
+    const size_t peer_buf_offset;
+    const int peer_count; // comm->size() - 1
+
+    std::vector<ze_event_handle_t> copy_events;
+};
diff --git a/src/sched/entry/ze/ze_a2a_gatherv_entry.cpp b/src/sched/entry/ze/ze_a2a_gatherv_entry.cpp
new file mode 100644
index 000000000..01e7c28cb
--- /dev/null
+++ b/src/sched/entry/ze/ze_a2a_gatherv_entry.cpp
@@ -0,0 +1,76 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "sched/entry/ze/ze_a2a_gatherv_entry.hpp"
+
+#include <numeric>
+
+using namespace ccl;
+using namespace ccl::ze;
+
+ze_a2a_gatherv_entry::ze_a2a_gatherv_entry(ccl_sched* sched, // forwards to ze_base_entry, stores the rest
+                                           ccl_buffer send_buf,
+                                           size_t send_count,
+                                           ccl_buffer recv_buf,
+                                           const size_t* recv_counts, // must hold at least comm->size() values
+                                           const ccl_datatype& dtype,
+                                           int root,
+                                           ccl_comm* comm, // dereferenced below, so it must not be null
+                                           size_t peer_buf_idx)
+        : ze_base_entry(sched, init_mode::copy, comm),
+          send_buf(send_buf),
+          send_bytes(send_count * dtype.size()), // element count converted to bytes
+          recv_buf(recv_buf),
+          recv_counts(recv_counts, recv_counts + comm->size()), // copies comm->size() counts from the caller's array
+          dtype(dtype),
+          root(root),
+          peer_buf_idx(peer_buf_idx) {}
+
+void ze_a2a_gatherv_entry::init_ze_hook() {
+    /* destination buffer: the root's IPC buffer, or recv_buf when this rank is the root */
+    ccl_buffer peer_recv_buf{};
+    bool is_root = comm_rank == root;
+    if (!is_root) {
+        sched->get_memory().handle_manager.get(root, peer_buf_idx, peer_recv_buf, comm);
+        CCL_THROW_IF_NOT(peer_recv_buf.get_ptr(), "null IPC buffer is received");
+    }
+    else {
+        peer_recv_buf = recv_buf;
+    }
+
+    bool is_inplace = send_buf == recv_buf;
+    /* sum in size_t: an int initial value would make std::accumulate truncate large offsets */
+    size_t offset_count =
+        std::accumulate(recv_counts.begin(), recv_counts.begin() + comm_rank, size_t{ 0 });
+    size_t offset_bytes = offset_count * dtype.size();
+    size_t block_bytes = (!is_inplace) ? send_bytes : recv_counts[comm_rank] * dtype.size();
+    LOG_DEBUG("rank: ", comm_rank, ", block_bytes: ", block_bytes);
+
+    if (!is_root || !is_inplace) { // an in-place root would copy its block onto itself
+        void* src = send_buf.get_ptr();
+        if (is_inplace) {
+            src = static_cast<char*>(recv_buf.get_ptr()) + offset_bytes;
+        }
+        void* dst = static_cast<char*>(peer_recv_buf.get_ptr()) + offset_bytes;
+        ZE_CALL(zeCommandListAppendMemoryCopy,
+                (ze_base_entry::get_copy_list(),
+                 dst,
+                 src,
+                 block_bytes,
+                 ze_base_entry::entry_event,
+                 0,
+                 nullptr));
+    }
+}
diff --git a/src/sched/entry/ze/ze_a2a_gatherv_entry.hpp b/src/sched/entry/ze/ze_a2a_gatherv_entry.hpp
new file mode 100644
index 000000000..5c1b1d9a0
--- /dev/null
+++ b/src/sched/entry/ze/ze_a2a_gatherv_entry.hpp
@@ -0,0 +1,58 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "common/utils/buffer.hpp"
+#include "sched/entry/ze/ze_base_entry.hpp"
+
+class ze_a2a_gatherv_entry : public ze_base_entry { // ze_base_entry subclass reporting itself as ZE_A2A_GATHERV
+public:
+    static constexpr const char* class_name() noexcept { // fixed identifier of this entry type
+        return "ZE_A2A_GATHERV";
+    }
+
+    const char* name() const override { // same string as class_name()
+        return class_name();
+    }
+
+    virtual std::string name_ext() const override { // name(), a space, then "send size: <send_bytes>"
+        std::stringstream out;
+        out << name() << " ";
+        out << "send size: " << send_bytes;
+        return out.str();
+    }
+
+    explicit ze_a2a_gatherv_entry(ccl_sched* sched,
+                                  ccl_buffer send_buf,
+                                  size_t send_count, // stored as send_count * dtype.size() in send_bytes
+                                  ccl_buffer recv_buf,
+                                  const size_t* recv_counts, // comm->size() values are copied from it
+                                  const ccl_datatype& dtype,
+                                  int root,
+                                  ccl_comm* comm,
+                                  size_t peer_buf_idx = 0);
+
+    void init_ze_hook() override; // appends a single copy that signals entry_event
+
+private:
+    const ccl_buffer send_buf;
+    const size_t send_bytes; // bytes, not elements
+    const ccl_buffer recv_buf;
+    const std::vector<size_t> recv_counts;
+    const ccl_datatype dtype;
+    const int root;
+    const size_t peer_buf_idx;
+};
diff --git a/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.cpp b/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.cpp
new file mode 100644
index 000000000..c5256f49a
--- /dev/null
+++ b/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.cpp
@@ -0,0 +1,223 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "sched/entry/ze/ze_a2a_reduce_scatter_entry.hpp"
+#include "sched/entry/ze/ze_cache.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
+
+#include <numeric>
+
+using namespace ccl;
+using namespace ccl::ze;
+
+ze_a2a_reduce_scatter_entry::ze_a2a_reduce_scatter_entry(ccl_sched* sched, // forwards to ze_base_entry, stores the rest
+                                                         ccl_buffer send_buf,
+                                                         ccl_buffer recv_buf,
+                                                         const size_t* recv_counts, // must hold at least comm->size() values
+                                                         const ccl_datatype& dtype,
+                                                         reduction op,
+                                                         ccl_comm* comm, // dereferenced below, so it must not be null
+                                                         std::vector<ze_event_handle_t> wait_events,
+                                                         size_t peer_buf_idx)
+        : ze_base_entry(sched,
+                        (init_mode::compute | init_mode::copy), // this entry uses both a copy and a compute list
+                        comm,
+                        comm->size() * event_group_count, // presumably the event budget of the base entry - TODO confirm
+                        wait_events),
+          send_buf(send_buf),
+          recv_buf(recv_buf),
+          dtype(dtype),
+          op(op),
+          recv_counts(recv_counts, recv_counts + comm->size()), // copies comm->size() counts from the caller's array
+          peer_buf_idx(peer_buf_idx),
+          peer_count(comm->size() - 1) {} // communicator size minus one
+
+void ze_a2a_reduce_scatter_entry::kernel_init(size_t offset_bytes, // byte offset into send_buf for the last kernel
+                                              size_t block_count, // elements handled by each kernel
+                                              void* send_buf,
+                                              void* base_ptr, // accumulator: block 0 of the staging buffer
+                                              int peer_count,
+                                              const ccl_datatype& dtype,
+                                              int comm_rank, // not referenced in this function
+                                              std::vector<ze_kernel>& kernels, // new kernels are appended here
+                                              ze_module_handle_t module, // by-value copy passed to the cache lookup
+                                              ze_device_handle_t device,
+                                              ze_context_handle_t context,
+                                              ccl::reduction op,
+                                              size_t worker_idx) {
+    global_data::get().ze_cache->get(context, device, "kernels.spv", &module); // presumably fills module - TODO confirm
+    std::string kernel_name =
+        "reduce_local_inplace_kernel_" + to_string(dtype.idx()) + "_" + ccl_reduction_to_str(op);
+
+    /* reduce peer values in tmp_buf only */
+    kernels.reserve(peer_count); // (peer_count - 1) block kernels plus one send_buf kernel
+    unsigned long count = block_count;
+    for (int i = 1; i < peer_count; ++i) { // block 0 is the in/out buffer, so start at block 1
+        void* input_buf = static_cast<char*>(base_ptr) + i * block_count * dtype.size();
+        void* inoutput_buf = base_ptr;
+        kernels.emplace_back(module, kernel_name, worker_idx);
+        kernels.back().set_args({ &count, &input_buf, &inoutput_buf }); // NOTE(review): addresses of locals - assumes set_args copies the values now
+        kernels.back().calculate_group_size(count);
+    }
+
+    /* reduce send_buf + tmp_buf */
+    void* input_buf = static_cast<char*>(send_buf) + offset_bytes;
+    void* inoutput_buf = base_ptr;
+    kernels.emplace_back(module, kernel_name, worker_idx);
+    kernels.back().set_args({ &count, &input_buf, &inoutput_buf });
+    kernels.back().calculate_group_size(count);
+}
+
+void ze_a2a_reduce_scatter_entry::fill_list(ze_command_list_handle_t copy_list, // receives the peer copies and the barrier
+                                            ze_command_list_handle_t comp_list, // receives the reduce kernel launches
+                                            void* send_buf,
+                                            void* tmp_buf, // staging area: one copy_bytes block per peer
+                                            const std::vector<ccl_buffer>& peer_send_bufs,
+                                            int peer_count,
+                                            int comm_rank, // only forwarded to kernel_init
+                                            size_t block_count, // elements per block
+                                            size_t offset_bytes, // offset applied to each peer buffer and to send_buf
+                                            std::vector<ze_event_handle_t>& copy_events, // at(i) is signaled by copy i
+                                            std::vector<ze_kernel>& kernels, // filled by kernel_init
+                                            std::vector<ze_event_handle_t>& kernel_events, // at(i) is signaled by kernel i
+                                            ze_event_handle_t& barrier_event, // signaled by the barrier after the copies
+                                            const ccl_datatype& dtype,
+                                            ze_module_handle_t module,
+                                            ze_device_handle_t device,
+                                            ze_context_handle_t context,
+                                            ccl::reduction op,
+                                            size_t worker_idx) {
+    kernel_init(offset_bytes,
+                block_count,
+                send_buf,
+                tmp_buf,
+                peer_count,
+                dtype,
+                comm_rank,
+                kernels,
+                module,
+                device,
+                context,
+                op,
+                worker_idx);
+    size_t copy_bytes = block_count * dtype.size();
+    /* copy peer segments to temp buffer */
+    for (int i = 0; i < peer_count; i++) {
+        void* src = static_cast<char*>(peer_send_bufs[i].get_ptr()) + offset_bytes;
+        void* dst = static_cast<char*>(tmp_buf) + i * copy_bytes; // block i of the staging buffer
+        ZE_CALL(zeCommandListAppendMemoryCopy,
+                (copy_list, dst, src, copy_bytes, copy_events.at(i), 0, nullptr));
+    }
+
+    ZE_CALL(zeCommandListAppendBarrier, // waits on every event in copy_events, then signals barrier_event
+            (copy_list, barrier_event, copy_events.size(), copy_events.data()));
+
+    /* reduce stage */
+    for (size_t i = 0; i < kernels.size(); ++i) {
+        ZE_CALL(zeCommandListAppendLaunchKernel,
+                (comp_list,
+                 kernels[i].get_kernel(),
+                 kernels[i].get_group_count(),
+                 kernel_events.at(i),
+                 1,
+                 (i == 0) ? &barrier_event : &kernel_events.at(i - 1))); // kernel 0 waits on the barrier, kernel i on kernel i-1
+    }
+}
+
+void ze_a2a_reduce_scatter_entry::init_ze_hook() {
+    /* get peer buffers */
+    std::vector<ccl_buffer> peer_send_bufs(peer_count);
+
+    for (int i = 0; i < peer_count; ++i) {
+        int peer_rank = (comm_rank + i + 1) % comm->size();
+        sched->get_memory().handle_manager.get(peer_rank, peer_buf_idx, peer_send_bufs[i], comm);
+        CCL_THROW_IF_NOT(peer_send_bufs[i].get_ptr(), "null IPC buffer is received");
+    }
+
+    /* alloc temp buffer */
+    size_t buf_bytes = dtype.size() * recv_counts[comm_rank];
+    size_t tmp_buf_bytes = peer_count * buf_bytes;
+    if (tmp_buf_bytes == 0) { // no peers or an empty block: nothing is appended
+        return;
+    }
+    ccl::alloc_param alloc_param(tmp_buf_bytes, buffer_type::ze, buffer_place::device);
+    void* tmp_buf = sched->alloc_buffer(alloc_param).get_ptr();
+
+    LOG_DEBUG("rank ",
+              comm_rank,
+              ", tmp buf size: ",
+              tmp_buf_bytes,
+              ", buf_count: ",
+              recv_counts[comm_rank]);
+
+    /* events signaled by the peer copies and by the reduce kernels */
+    pre_copy_events.resize(peer_count);
+    for (auto& event : pre_copy_events) {
+        event = ze_base_entry::create_event();
+    }
+
+    kernel_events.resize(peer_count);
+    for (auto& event : kernel_events) {
+        event = ze_base_entry::create_event();
+    }
+
+    /* sum in size_t: an int initial value would make std::accumulate truncate large offsets */
+    size_t offset_count =
+        std::accumulate(recv_counts.begin(), recv_counts.begin() + comm_rank, size_t{ 0 });
+    size_t offset_bytes = offset_count * dtype.size();
+    barrier_event = ze_base_entry::create_event();
+
+    fill_list(ze_base_entry::get_copy_list(),
+              ze_base_entry::get_comp_list(),
+              send_buf.get_ptr(),
+              tmp_buf,
+              peer_send_bufs,
+              peer_count,
+              comm_rank,
+              recv_counts[comm_rank],
+              offset_bytes,
+              pre_copy_events,
+              kernels,
+              kernel_events,
+              barrier_event,
+              dtype,
+              module,
+              device,
+              context,
+              op,
+              worker_idx);
+    post_copy_events.resize(1);
+    for (auto& event : post_copy_events) {
+        event = ze_base_entry::create_event();
+    }
+    ZE_CALL(zeCommandListAppendMemoryCopy, // tmp_buf -> recv_buf after the last kernel
+            (ze_base_entry::get_copy_list(),
+             recv_buf.get_ptr(),
+             tmp_buf,
+             buf_bytes,
+             post_copy_events.back(),
+             1,
+             &kernel_events.back()));
+}
+
+void ze_a2a_reduce_scatter_entry::update() {
+    for (size_t idx = 0; idx < post_copy_events.size(); ++idx) { // bail out while any is pending
+        if (!ze_base_entry::is_event_completed(post_copy_events[idx])) {
+            return;
+        }
+    }
+    ZE_CALL(zeEventHostSignal, (ze_base_entry::entry_event)); // every post-copy event has completed
+    ze_base_entry::update();
+}
diff --git a/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.hpp b/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.hpp
new file mode 100644
index 000000000..4da644026
--- /dev/null
+++ b/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.hpp
@@ -0,0 +1,106 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include <numeric>
+#include "common/utils/buffer.hpp"
+#include "comp/comp.hpp"
+#include "sched/entry/ze/ze_base_entry.hpp"
+
+class ze_a2a_reduce_scatter_entry : public ze_base_entry { // ze_base_entry subclass reporting itself as ZE_A2A_REDUCE_SCATTER
+public:
+    static constexpr const char* class_name() noexcept { // fixed identifier of this entry type
+        return "ZE_A2A_REDUCE_SCATTER";
+    }
+
+    const char* name() const noexcept override { // same string as class_name()
+        return class_name();
+    }
+
+    virtual std::string name_ext() const override { // name(), a space, then "size: <sum of recv_counts>"
+        std::stringstream out;
+        out << name() << " ";
+        out << "size: " << std::accumulate(recv_counts.begin(), recv_counts.end(), size_t{ 0 });
+        return out.str();
+    }
+
+    ze_a2a_reduce_scatter_entry() = delete;
+    explicit ze_a2a_reduce_scatter_entry(ccl_sched* sched,
+                                         ccl_buffer send_buf,
+                                         ccl_buffer recv_buf,
+                                         const size_t* recv_counts, // comm->size() values are copied from it
+                                         const ccl_datatype& dtype,
+                                         ccl::reduction op,
+                                         ccl_comm* comm,
+                                         std::vector<ze_event_handle_t> wait_events = {},
+                                         size_t peer_buf_idx = 0);
+
+    void init_ze_hook() override; // allocates the staging buffer, creates the events, appends all commands
+
+    void update() override; // signals entry_event once every event in post_copy_events has completed
+
+    static void fill_list(ze_command_list_handle_t copy_list, // receives the peer copies and the barrier
+                          ze_command_list_handle_t comp_list, // receives the reduce kernel launches
+                          void* send_buf,
+                          void* tmp_buf, // staging area: one block per peer
+                          const std::vector<ccl_buffer>& peer_send_bufs,
+                          int peer_count,
+                          int comm_rank,
+                          size_t block_count,
+                          size_t offset_bytes,
+                          std::vector<ze_event_handle_t>& copy_events,
+                          std::vector<ze_kernel>& kernels,
+                          std::vector<ze_event_handle_t>& kernel_events,
+                          ze_event_handle_t& barrier_event,
+                          const ccl_datatype& dtype,
+                          ze_module_handle_t module,
+                          ze_device_handle_t device,
+                          ze_context_handle_t context,
+                          ccl::reduction op,
+                          size_t worker_idx);
+
+private:
+    static constexpr size_t event_group_count{ 3 }; // copy + kernel + copy
+
+    const ccl_buffer send_buf;
+    const ccl_buffer recv_buf;
+    const ccl_datatype dtype;
+    const ccl::reduction op;
+    const std::vector<size_t> recv_counts;
+    const size_t peer_buf_idx;
+    const int peer_count; // comm->size() - 1
+
+    std::vector<ze_event_handle_t> pre_copy_events; // one per peer copy into the staging buffer
+    std::vector<ze_event_handle_t> post_copy_events; // holds the event of the final copy into recv_buf
+    ze_event_handle_t barrier_event{}; // signaled after all pre-copies, awaited by the first kernel
+
+    std::vector<ze_kernel> kernels;
+    std::vector<ze_event_handle_t> kernel_events; // kernel i signals element i
+
+    static void kernel_init(size_t offset_bytes, // creates the reduce kernels and appends them to kernels
+                            size_t block_count,
+                            void* send_buf,
+                            void* base_ptr,
+                            int peer_count,
+                            const ccl_datatype& dtype,
+                            int comm_rank,
+                            std::vector<ze_kernel>& kernels,
+                            ze_module_handle_t module,
+                            ze_device_handle_t device,
+                            ze_context_handle_t context,
+                            ccl::reduction op,
+                            size_t worker_idx);
+};
diff --git a/src/sched/entry/ze/ze_barrier_entry.cpp b/src/sched/entry/ze/ze_barrier_entry.cpp
new file mode 100644
index 000000000..ea0d0e9d5
--- /dev/null
+++ b/src/sched/entry/ze/ze_barrier_entry.cpp
@@ -0,0 +1,100 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "ze_barrier_entry.hpp"
+
+#include "sched/entry/ze/ze_base_entry.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
+
+ze_barrier_entry::ze_barrier_entry(ccl_sched* sched,
+                                   ccl_comm* comm,
+                                   ze_event_pool_handle_t& local_pool,
+                                   size_t event_idx)
+        : sched_entry(sched),
+          comm(comm),
+          rank(comm->rank()),
+          comm_size(comm->size()),
+          event_idx(event_idx),
+          local_pool(local_pool) {
+    LOG_DEBUG("initialization");
+    CCL_THROW_IF_NOT(sched, "no sched");
+    CCL_THROW_IF_NOT(comm, "no comm");
+    CCL_THROW_IF_NOT(local_pool, "no local event pool");
+}
+
+ze_barrier_entry::~ze_barrier_entry() {
+    finalize();
+}
+
+void ze_barrier_entry::finalize() {
+    LOG_DEBUG("finalization");
+    if (signal_event) // null when start() never ran or a previous finalize() released it
+        ZE_CALL(zeEventDestroy, (signal_event));
+    signal_event = nullptr; // the destructor calls finalize() again: prevent a double destroy
+    for (const auto& wait_event : wait_events)
+        ZE_CALL(zeEventDestroy, (wait_event.second));
+    wait_events.clear();
+    LOG_DEBUG("finalization completed");
+}
+
+void ze_barrier_entry::start() {
+    LOG_DEBUG("start");
+    // a handle left over from an earlier start() is put back into the non-signaled state
+    if (signal_event) {
+        ZE_CALL(zeEventHostReset, (signal_event));
+    }
+
+    last_completed_event_idx = 0;
+    // one host-scope descriptor is shared by the local signal event and every per-peer event
+    ze_event_desc_t event_desc = default_event_desc;
+    event_desc.signal = ZE_EVENT_SCOPE_FLAG_HOST; //TODO: DEVICE
+    event_desc.wait = ZE_EVENT_SCOPE_FLAG_HOST;
+    event_desc.index = event_idx;
+    // NOTE(review): overwrites any handle reset above without destroying it - confirm no leak
+    signal_event = ze_base_entry::create_event(local_pool, event_desc);
+    LOG_DEBUG("signal event is created for rank: ", rank);
+    // NOTE(review): wait_events only grows here; a repeated start() appends again - confirm
+    for (int peer_rank = 0; peer_rank < comm_size; peer_rank++) {
+        if (peer_rank == rank) {
+            continue;
+        }
+        ze_event_pool_handle_t event_pool{};
+        sched->get_memory().handle_manager.get(peer_rank, 2, event_pool, comm);
+        wait_events.push_back({ peer_rank, ze_base_entry::create_event(event_pool, event_desc) });
+        LOG_DEBUG(
+            "wait_events.size: ", wait_events.size(), ", rank: ", rank, ", peer_rank: ", peer_rank);
+    }
+
+    ZE_CALL(zeEventHostSignal, (signal_event));
+
+    status = ccl_sched_entry_status_started;
+    LOG_DEBUG("start completed");
+}
+
+void ze_barrier_entry::update() {
+    // Poll peer events strictly in order, resuming at the first one not yet seen signaled.
+    while (last_completed_event_idx < wait_events.size()) {
+        ze_event_handle_t event = wait_events[last_completed_event_idx].second;
+        CCL_THROW_IF_NOT(event, "event is not available");
+        if (!ze_base_entry::is_event_completed(event)) {
+            // still pending: stop here so this same event is re-checked on the next call
+            return;
+        }
+        last_completed_event_idx++;
+    }
+    // every peer event was observed (trivially so when there are no peers to wait for)
+    LOG_DEBUG("event is completed. last_completed_event_idx: ", last_completed_event_idx);
+    status = ccl_sched_entry_status_complete;
+}
diff --git a/src/sched/entry/ze/ze_barrier_entry.hpp b/src/sched/entry/ze/ze_barrier_entry.hpp
new file mode 100644
index 000000000..cdd763b7a
--- /dev/null
+++ b/src/sched/entry/ze/ze_barrier_entry.hpp
@@ -0,0 +1,68 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "sched/entry/factory/entry_factory.hpp"
+
+#include <ze_api.h>
+
+class ze_barrier_entry : public sched_entry {
+public:
+    static constexpr const char* class_name() noexcept {
+        return "ZE_BARRIER";
+    }
+
+    const char* name() const noexcept override {
+        return class_name();
+    }
+
+    ze_barrier_entry() = delete;
+    explicit ze_barrier_entry(ccl_sched* sched,
+                              ccl_comm* comm,
+                              ze_event_pool_handle_t& local_pool,
+                              size_t event_idx);
+    ~ze_barrier_entry();
+
+    void start() override;
+    void update() override;
+
+    void finalize() override;
+
+protected:
+    void dump_detail(std::stringstream& str) const override {
+        ccl_logger::format(str,
+                           "rank ",
+                           rank,
+                           ", comm_size ",
+                           comm_size,
+                           ", comm_id ",
+                           sched->get_comm_id(),
+                           ", wait_events: ", // separator keeps the comm id and this label apart
+                           wait_events.size(),
+                           "\n");
+    }
+
+private:
+    ccl_comm* comm;
+    const int rank; // comm->rank(), captured at construction
+    const int comm_size; // comm->size(), captured at construction
+    size_t last_completed_event_idx{}; // progress cursor into wait_events, zeroed by start()
+    size_t event_idx{}; // index written into the descriptor of every event made in start()
+
+    ze_event_pool_handle_t local_pool{}; // pool the local signal event is created from
+    ze_event_handle_t signal_event{}; // created and host-signaled in start()
+    std::vector<std::pair<int, ze_event_handle_t>> wait_events{}; // { peer rank, its event }
+};
diff --git a/src/sched/entry/ze/ze_base_entry.cpp b/src/sched/entry/ze/ze_base_entry.cpp
new file mode 100644
index 000000000..8e00dcbc4
--- /dev/null
+++ b/src/sched/entry/ze/ze_base_entry.cpp
@@ -0,0 +1,488 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "common/stream/stream.hpp"
+#include "sched/queue/queue.hpp"
+
+#include "sched/entry/ze/ze_base_entry.hpp"
+#include "sched/entry/ze/ze_cache.hpp"
+#include "sched/entry/ze/ze_call.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
+
+#include "common/utils/sycl_utils.hpp"
+
+using namespace ccl;
+using namespace ccl::ze;
+
+ze_base_entry::ze_base_entry(ccl_sched *sched,
+                             init_mode mode,
+                             ccl_comm *comm,
+                             uint32_t add_event_count,
+                             std::vector<ze_event_handle_t> wait_events)
+        : sched_entry(sched),
+          mode(mode),
+          comm(comm),
+          use_single_list(sched->get_memory().use_single_list),
+          wait_events(wait_events) {
+    if (!this->comm) { // the `comm` parameter shadows the member: qualify so the member is set
+        this->comm = sched->coll_param.comm; // fall back to the communicator of the collective
+    }
+    CCL_THROW_IF_NOT(this->comm, "no comm");
+    comm_rank = this->comm->rank();
+    comm_size = this->comm->size();
+
+    // we can be here in case of copy_entry which may not have ze backend here, so check it
+    if (sched->coll_param.stream &&
+        sched->coll_param.stream->get_backend() == ccl::utils::get_level_zero_backend()) {
+        entry_event = sched->get_memory().event_manager->create();
+        sched->get_memory().ze_entries.push_back(this);
+    }
+    events.resize(add_event_count, nullptr); // slots are filled later by create_event()
+}
+
+ze_base_entry::~ze_base_entry() {
+    finalize();
+}
+
+void ze_base_entry::init() {
+    if (is_initialized) {
+        return;
+    }
+
+    LOG_DEBUG("init");
+
+    worker_idx = sched->queue->get_idx();
+
+    CCL_THROW_IF_NOT(sched->coll_param.stream, "no stream");
+    device = sched->coll_param.stream->get_ze_device();
+    context = sched->coll_param.stream->get_ze_context();
+
+    if (!use_single_list) {
+        /* get queue properties */
+        ze_queue_properties_t queue_props;
+        get_queues_properties(device, &queue_props);
+
+        if ((queue_props.size() == 1) && (queue_props[0].numQueues == 1)) { // magic index?
+            CCL_THROW_IF_NOT(sched->coll_param.stream->get_device_family() ==
+                             ccl::device_family::unknown);
+            LOG_DEBUG("numQueues = 1, switch to compute init mode");
+            mode = init_mode::compute;
+        }
+
+        /* init compute queue and list */
+        if (init_mode::compute & mode) {
+            LOG_DEBUG("compute init mode is enabled");
+            get_comp_primitives(queue_props, comp_primitives);
+            init_primitives(comp_primitives);
+        }
+
+        /* init copy queue and list */
+        if (init_mode::copy & mode) {
+            LOG_DEBUG("copy init mode is enabled");
+            get_copy_primitives(queue_props, copy_primitives, mode);
+            init_primitives(copy_primitives);
+        }
+    }
+
+    /* create event pool */
+    if (events.size() > 0) {
+        event_pool_desc = default_event_pool_desc;
+        event_pool_desc.count = events.size();
+        if (ccl::global_data::env().enable_kernel_profile) {
+            event_pool_desc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
+        }
+        global_data::get().ze_cache->get(worker_idx, context, event_pool_desc, &event_pool);
+        LOG_DEBUG("get event pool: { max event count: ", event_pool_desc.count, " }");
+    }
+
+    append_wait_on_events();
+
+    init_ze_hook();
+    if (!use_single_list) {
+        close_lists();
+    }
+
+    is_initialized = true;
+
+    LOG_DEBUG("init completed");
+}
+
+void ze_base_entry::finalize() {
+    if (!is_initialized) {
+        return;
+    }
+
+    LOG_DEBUG("finalize");
+
+    finalize_ze_hook();
+    destroy_events();
+
+    /* event pool */
+    if (event_pool) {
+        global_data::get().ze_cache->push(worker_idx, context, event_pool_desc, event_pool);
+    }
+
+    if (!use_single_list) {
+        if (comp_primitives.list && comp_primitives.queue) {
+            LOG_DEBUG("push to cache compute list and queue");
+            /* list */
+            global_data::get().ze_cache->push(
+                worker_idx, context, device, comp_primitives.list_desc, comp_primitives.list);
+
+            /* queue */
+            global_data::get().ze_cache->push(
+                worker_idx, context, device, comp_primitives.queue_desc, comp_primitives.queue);
+        }
+
+        if (copy_primitives.list && copy_primitives.queue) {
+            LOG_DEBUG("push to cache copy list and queue");
+            /* copy list */
+            global_data::get().ze_cache->push(
+                worker_idx, context, device, copy_primitives.list_desc, copy_primitives.list);
+
+            /* copy queue */
+            global_data::get().ze_cache->push(
+                worker_idx, context, device, copy_primitives.queue_desc, copy_primitives.queue);
+        }
+    }
+
+    is_initialized = false;
+
+    LOG_DEBUG("finalize completed");
+}
+
+void ze_base_entry::init_entries() {
+    auto &entries = sched->get_memory().ze_entries;
+    if (entries.front() == this) {
+        LOG_DEBUG("init ", entries.size(), " entries");
+        for (auto &entry : entries) {
+            entry->init();
+        }
+    }
+}
+
+void ze_base_entry::finalize_entries() {
+    auto &entries = sched->get_memory().ze_entries;
+    if (entries.back() == this) {
+        LOG_DEBUG("finalize ", entries.size(), " entries");
+        for (auto &entry : entries) {
+            entry->finalize();
+        }
+    }
+}
+
+void ze_base_entry::start() {
+    if (global_data::env().enable_kernel_profile) {
+        sched->master_sched->get_kernel_timer().set_kernel_submit_time(
+            calculate_global_time(sched->coll_param.stream->get_ze_device()));
+    }
+
+    if (use_single_list) {
+        init_entries();
+
+        if (sched->get_memory().ze_entries.front() == this) {
+            sched->get_memory().list_manager->execute();
+        }
+    }
+    else {
+        init();
+        reset_events();
+
+        if (comp_primitives.list && comp_primitives.queue) {
+            LOG_DEBUG("execute compute command list");
+            ZE_CALL(zeCommandQueueExecuteCommandLists,
+                    (comp_primitives.queue, 1, &comp_primitives.list, nullptr));
+        }
+
+        if (copy_primitives.list && copy_primitives.queue) {
+            LOG_DEBUG("execute copy command list");
+            ZE_CALL(zeCommandQueueExecuteCommandLists,
+                    (copy_primitives.queue, 1, &copy_primitives.list, nullptr));
+        }
+
+        if (((global_data::env().ze_serialize_mode & ze_call::serialize_mode::block)) != 0) {
+            LOG_DEBUG("wait until command lists are executed");
+            if (copy_primitives.queue)
+                ZE_CALL(zeHostSynchronize, (copy_primitives.queue));
+            if (comp_primitives.queue)
+                ZE_CALL(zeHostSynchronize, (comp_primitives.queue));
+        }
+    }
+
+    status = ccl_sched_entry_status_started;
+}
+
+bool ze_base_entry::is_event_completed(ze_event_handle_t event) {
+    ze_result_t res = zeEventQueryStatus(event);
+    CCL_THROW_IF_NOT(res == ZE_RESULT_SUCCESS || res == ZE_RESULT_NOT_READY,
+                     "unexpected result from zeEventQueryStatus: ",
+                     to_string(res));
+    return (res == ZE_RESULT_SUCCESS);
+}
+
+bool ze_base_entry::is_queue_completed(ze_command_queue_handle_t queue) {
+    ze_result_t res = zeHostSynchronize(queue);
+    CCL_THROW_IF_NOT(res == ZE_RESULT_SUCCESS || res == ZE_RESULT_NOT_READY,
+                     "unexpected result from zeHostSynchronize: ",
+                     to_string(res));
+    return (res == ZE_RESULT_SUCCESS);
+}
+
+void ze_base_entry::update() {
+    bool complete{}; // set from the entry event, or from the queues when kernel_debug is on
+
+    if (global_data::env().kernel_debug == 0) {
+        complete = is_event_completed(entry_event);
+    }
+    else {
+        bool copy_q_complete{ true };
+        bool comp_q_complete{ true };
+        if (copy_primitives.queue)
+            copy_q_complete = is_queue_completed(copy_primitives.queue);
+        if (comp_primitives.queue) // gate on the copy queue result, not the still-false `complete`
+            comp_q_complete = copy_q_complete && is_queue_completed(comp_primitives.queue);
+        complete = copy_q_complete && comp_q_complete;
+    }
+
+    if (complete) {
+        LOG_DEBUG(name(), " entry complete");
+        status = ccl_sched_entry_status_complete;
+
+        if (ccl::global_data::env().enable_kernel_profile) {
+            auto kernel_time = calculate_event_time(entry_event, device);
+
+            // if we run this code, this sched must be a sub-sched of some master sched
+            // so the field must be non null
+            CCL_THROW_IF_NOT(sched->master_sched, "field must be set");
+            sched->master_sched->get_kernel_timer().set_name(name_ext());
+            sched->master_sched->get_kernel_timer().set_kernel_time(kernel_time);
+        }
+
+        if (use_single_list) {
+            reset_events();
+        }
+
+        if (sched->get_memory().ze_entries.back() == this) {
+            LOG_DEBUG("reset sched events\n");
+            sched->get_memory().event_manager->reset();
+        }
+
+        // Finalize must go after all operation with the event because it's destroyed there.
+        if (!sched->coll_attr.to_cache) {
+            if (use_single_list) {
+                finalize_entries();
+            }
+            else {
+                finalize();
+            }
+        }
+    }
+    else {
+        // just return in case if the kernel is not ready yet
+        // will check again on the next iteration
+        return;
+    }
+}
+
+ze_command_list_handle_t ze_base_entry::get_comp_list() {
+    if (use_single_list) {
+        return sched->get_memory().list_manager->get_comp_list();
+    }
+    return comp_primitives.list;
+}
+
+ze_command_list_handle_t ze_base_entry::get_copy_list() {
+    if (use_single_list) {
+        return sched->get_memory().list_manager->get_copy_list();
+    }
+
+    ze_command_list_handle_t list{};
+    if (copy_primitives.list) {
+        list = copy_primitives.list;
+        LOG_DEBUG("copy list is returned");
+    }
+    else if (comp_primitives.list) {
+        list = comp_primitives.list;
+        LOG_DEBUG("compute list is returned");
+    }
+    CCL_THROW_IF_NOT(list, "command list is invalid");
+    return list;
+}
+
+void ze_base_entry::append_wait_on_events() {
+    if (use_single_list && !wait_events.empty()) {
+        if (sched->get_memory().list_manager->can_use_copy_queue() &&
+            (init_mode::copy & mode)) { // to prevent double append
+            ZE_CALL(zeCommandListAppendWaitOnEvents,
+                    (get_copy_list(), wait_events.size(), wait_events.data()));
+        }
+        ZE_CALL(zeCommandListAppendWaitOnEvents,
+                (get_comp_list(), wait_events.size(), wait_events.data()));
+    }
+}
+
+std::string ze_base_entry::name_ext() const {
+    return "[empty]";
+}
+
+void ze_base_entry::get_comp_primitives(const ze_queue_properties_t &queue_props,
+                                        cmd_primitives &comp_primitives) {
+    uint32_t ordinal{}, queue_index{};
+    get_comp_queue_ordinal(device, queue_props, &ordinal);
+    get_queue_index(queue_props, ordinal, 0, &queue_index);
+
+    comp_primitives.queue_desc.ordinal = ordinal;
+    comp_primitives.queue_desc.index = queue_index;
+    comp_primitives.list_desc.commandQueueGroupOrdinal = ordinal;
+}
+
+void ze_base_entry::get_copy_primitives(const ze_queue_properties_t &queue_props,
+                                        cmd_primitives &copy_primitives,
+                                        init_mode mode) {
+    uint32_t ordinal{}, queue_index{};
+    get_copy_queue_ordinal(device, queue_props, &ordinal);
+
+    // TODO: index depends on rank's changing, when > 1 queues are created,
+    // the index is still the same for different queues, that's the issue.
+    // WA is adding optional counter, which says the order number of a queue.
+    // Need to think, how we'd calculate the index for every queue.
+    // Hang in case of CCL_KERNEL_1S_USE_COPY_OPS=1 CCL_ZE_COPY_ENGINE=none
+    if (mode == (init_mode::compute | init_mode::copy)) {
+        get_queue_index(queue_props, ordinal, 1, &queue_index);
+    }
+    else {
+        get_queue_index(queue_props, ordinal, 0, &queue_index);
+    }
+
+    copy_primitives.queue_desc.ordinal = ordinal;
+    copy_primitives.queue_desc.index = queue_index;
+    copy_primitives.list_desc.commandQueueGroupOrdinal = ordinal;
+}
+
+void ze_base_entry::init_primitives(cmd_primitives &cmd_primitives) {
+    global_data::get().ze_cache->get(
+        worker_idx, context, device, cmd_primitives.queue_desc, &cmd_primitives.queue);
+    LOG_DEBUG("get queue: { ordinal: ",
+              cmd_primitives.queue_desc.ordinal,
+              ", index: ",
+              cmd_primitives.queue_desc.index,
+              " }");
+
+    global_data::get().ze_cache->get(
+        worker_idx, context, device, cmd_primitives.list_desc, &cmd_primitives.list);
+    LOG_DEBUG("get list: { ordinal: ", cmd_primitives.list_desc.commandQueueGroupOrdinal, " }");
+}
+
+ze_event_handle_t ze_base_entry::create_event(ze_event_pool_handle_t event_pool,
+                                              ze_event_desc_t event_desc) {
+    ze_event_handle_t event{}; // value-initialized, like the other handles in this file
+    ZE_CALL(zeEventCreate, (event_pool, &event_desc, &event));
+    return event; // caller owns the handle and is responsible for zeEventDestroy
+}
+
+ze_event_handle_t ze_base_entry::create_event() {
+    ze_event_desc_t event_desc{ default_event_desc };
+    event_desc.signal = ZE_EVENT_SCOPE_FLAG_DEVICE;
+    event_desc.wait = ZE_EVENT_SCOPE_FLAG_DEVICE;
+    event_desc.index = event_counter++;
+    LOG_DEBUG("create event with index ", event_desc.index);
+    CCL_THROW_IF_NOT(event_desc.index < event_pool_desc.count,
+                     ", event creation limit exceeded: ",
+                     event_desc.index,
+                     ", event_pool_desc.count: ",
+                     event_pool_desc.count);
+    CCL_THROW_IF_NOT(event_desc.index < events.size());
+
+    ze_event_handle_t event = create_event(event_pool, event_desc);
+    events[event_desc.index] = event;
+
+    return event;
+}
+
+void ze_base_entry::reset_events() {
+    for (size_t idx = 0; idx < events.size(); idx++) {
+        if (events[idx])
+            ZE_CALL(zeEventHostReset, (events[idx]));
+    }
+}
+
+void ze_base_entry::destroy_events() {
+    for (size_t idx = 0; idx < events.size(); idx++) {
+        if (events[idx])
+            ZE_CALL(zeEventDestroy, (events[idx]));
+    }
+    events.clear();
+}
+
+void ze_base_entry::close_lists() {
+    if (comp_primitives.list)
+        ZE_CALL(zeCommandListClose, (comp_primitives.list));
+    if (copy_primitives.list)
+        ZE_CALL(zeCommandListClose, (copy_primitives.list));
+}
+
+ze_kernel::ze_kernel(ze_module_handle_t module, const std::string &kernel_name, size_t worker_idx)
+        : module(module),
+          kernel_name(kernel_name),
+          worker_idx(worker_idx) {
+    global_data::get().ze_cache->get(worker_idx, module, kernel_name, &kernel);
+    CCL_THROW_IF_NOT(kernel);
+    LOG_DEBUG("get kernel: name: ", kernel_name);
+}
+
+ze_kernel::ze_kernel(ze_kernel &&other) noexcept
+        : module(std::move(other.module)),
+          kernel_name(std::move(other.kernel_name)),
+          worker_idx(std::move(other.worker_idx)),
+          group_count(std::move(other.group_count)),
+          group_size(std::move(other.group_size)),
+          kernel(std::move(other.kernel)) {
+    other.module = nullptr; // leave the source empty: ownership now lives in *this
+    other.kernel_name.clear();
+    other.worker_idx = 0;
+    other.group_count = { 0, 0, 0 };
+    other.group_size = { 0, 0, 0 };
+    other.kernel = nullptr; // ~ze_kernel() skips the cache push when the handle is null
+}
+
+ze_kernel::~ze_kernel() {
+    if (kernel) {
+        global_data::get().ze_cache->push(worker_idx, module, kernel_name, kernel);
+    }
+}
+
+void ze_kernel::set_args(ze_kernel_args_t kernel_args) {
+    LOG_DEBUG("kernel ", kernel, " args:\n", to_string(kernel_args));
+    set_kernel_args(kernel, kernel_args);
+}
+
+void ze_kernel::calculate_group_size(size_t count) {
+    get_suggested_group_size(kernel, count, &group_size);
+    LOG_DEBUG("suggested group size: ", to_string(group_size));
+
+    ZE_CALL(zeKernelSetGroupSize,
+            (kernel, group_size.groupSizeX, group_size.groupSizeY, group_size.groupSizeZ));
+
+    get_suggested_group_count(group_size, count, &group_count);
+    LOG_DEBUG("suggested group count: ", to_string(group_count));
+}
+
+ze_kernel_handle_t ze_kernel::get_kernel() const {
+    return kernel;
+}
+
+const ze_group_count_t *ze_kernel::get_group_count() const {
+    return &group_count;
+}
diff --git a/src/sched/entry/gpu/ze_base_entry.hpp b/src/sched/entry/ze/ze_base_entry.hpp
similarity index 52%
rename from src/sched/entry/gpu/ze_base_entry.hpp
rename to src/sched/entry/ze/ze_base_entry.hpp
index b04d3f4df..ebd2e15a9 100644
--- a/src/sched/entry/gpu/ze_base_entry.hpp
+++ b/src/sched/entry/ze/ze_base_entry.hpp
@@ -35,28 +35,53 @@ class ze_base_entry : public sched_entry {
 public:
     ze_base_entry() = delete;
     ze_base_entry(const ze_base_entry &) = delete;
-    virtual ~ze_base_entry(){};
+    virtual ~ze_base_entry();
+
+    static ze_event_handle_t create_event(ze_event_pool_handle_t event_pool,
+                                          ze_event_desc_t event_desc);
+    static bool is_event_completed(ze_event_handle_t event);
+
+    ze_event_handle_t entry_event{};
 
 protected:
     explicit ze_base_entry(ccl_sched *sched,
+                           init_mode mode = init_mode::compute,
                            ccl_comm *comm = nullptr,
-                           uint32_t add_event_count = 0);
+                           uint32_t add_event_count = 0,
+                           std::vector<ze_event_handle_t> wait_events = {});
+
+    void init() override;
+    void finalize() override;
+
+    /* ze hooks which can be implemented in derived entry */
+    virtual void init_ze_hook(){};
+    virtual void finalize_ze_hook(){};
 
-    void init(init_mode ze_init_mode);
     virtual void start() override;
     virtual void update() override;
-    void finalize();
 
+    void init_entries();
+    void finalize_entries();
+
+    ze_command_list_handle_t get_comp_list();
     ze_command_list_handle_t get_copy_list();
 
+    virtual std::string name_ext() const;
+
     void init_primitives(cmd_primitives &cmd_primitives);
     void get_copy_primitives(const ze_queue_properties_t &queue_props,
                              cmd_primitives &copy_primitives,
-                             init_mode ze_init_mode);
+                             init_mode mode);
     void get_comp_primitives(const ze_queue_properties_t &queue_props,
                              cmd_primitives &comp_primitives);
 
-    ccl_sched *const sched;
+    ze_event_handle_t create_event();
+    void reset_events();
+    void destroy_events();
+
+    void close_lists();
+
+    init_mode mode;
 
     ccl_comm *comm{};
     int comm_rank{};
@@ -66,14 +91,45 @@ class ze_base_entry : public sched_entry {
 
     bool is_initialized{};
 
+    ze_module_handle_t module{};
+
     ze_device_handle_t device{};
     ze_context_handle_t context{};
 
     cmd_primitives comp_primitives{};
     cmd_primitives copy_primitives{};
 
+    const bool use_single_list;
+
+private:
+    uint32_t event_counter{};
     ze_event_pool_desc_t event_pool_desc{};
     ze_event_pool_handle_t event_pool{};
-    ze_event_handle_t entry_event{};
-    const uint32_t add_event_count;
+    std::vector<ze_event_handle_t> events;
+
+    std::vector<ze_event_handle_t> wait_events;
+
+    bool is_queue_completed(ze_command_queue_handle_t queue);
+    void append_wait_on_events();
+};
+
+class ze_kernel {
+public:
+    ze_kernel(ze_module_handle_t module, const std::string &kernel_name, size_t worker_idx = 0);
+    ze_kernel(const ze_kernel &) = delete;
+    ze_kernel(ze_kernel &&other) noexcept;
+    ~ze_kernel();
+
+    void set_args(ze_kernel_args_t kernel_args);
+    void calculate_group_size(size_t elem_count);
+    ze_kernel_handle_t get_kernel() const;
+    const ze_group_count_t *get_group_count() const;
+
+private:
+    ze_module_handle_t module{};
+    std::string kernel_name{};
+    size_t worker_idx{};
+    ze_group_count_t group_count{};
+    ze_group_size_t group_size{};
+    ze_kernel_handle_t kernel{};
 };
diff --git a/src/sched/entry/gpu/ze_cache.cpp b/src/sched/entry/ze/ze_cache.cpp
similarity index 86%
rename from src/sched/entry/gpu/ze_cache.cpp
rename to src/sched/entry/ze/ze_cache.cpp
index ea3b55e87..4a9de974c 100644
--- a/src/sched/entry/gpu/ze_cache.cpp
+++ b/src/sched/entry/ze/ze_cache.cpp
@@ -14,7 +14,7 @@
  limitations under the License.
 */
 #include "common/global/global.hpp"
-#include "sched/entry/gpu/ze_cache.hpp"
+#include "sched/entry/ze/ze_cache.hpp"
 
 #include <iterator>
 
@@ -25,7 +25,7 @@ template <class map_t, class... keys_t>
 bool get_from_cache(map_t& cache, typename map_t::mapped_type& object, keys_t... keys) {
     bool success{};
 
-    if (!global_data::env().enable_kernel_cache)
+    if (!global_data::env().enable_ze_cache)
         return success;
 
     typename map_t::key_type key(keys...);
@@ -43,7 +43,7 @@ template <class map_t, class... keys_t>
 bool push_to_cache(map_t& cache, const typename map_t::mapped_type& object, keys_t... keys) {
     bool success{};
 
-    if (!global_data::env().enable_kernel_cache)
+    if (!global_data::env().enable_ze_cache)
         return success;
 
     typename map_t::key_type key(keys...);
@@ -61,45 +61,6 @@ bool push_to_cache(map_t& cache, const typename map_t::mapped_type& object, keys
     return success;
 }
 
-// fence_cache
-fence_cache::~fence_cache() {
-    if (!cache.empty()) {
-        LOG_WARN("fence cache is not empty, size: ", cache.size());
-        clear();
-    }
-}
-
-void fence_cache::clear() {
-    LOG_DEBUG("clear fence cache: size: ", cache.size());
-    for (auto& key_value : cache) {
-        ZE_CALL(zeFenceDestroy, (key_value.second));
-    }
-    cache.clear();
-}
-
-void fence_cache::get(ze_command_queue_handle_t queue,
-                      const ze_fence_desc_t& fence_desc,
-                      ze_fence_handle_t* fence) {
-    CCL_THROW_IF_NOT(queue);
-    CCL_THROW_IF_NOT(fence);
-    if (get_from_cache(cache, *fence, queue)) {
-        ZE_CALL(zeFenceReset, (*fence));
-    }
-    else {
-        ZE_CALL(zeFenceCreate, (queue, &fence_desc, fence));
-    }
-}
-
-void fence_cache::push(ze_command_queue_handle_t queue,
-                       const ze_fence_desc_t& fence_desc,
-                       ze_fence_handle_t fence) {
-    CCL_THROW_IF_NOT(queue);
-    CCL_THROW_IF_NOT(fence);
-    if (!push_to_cache(cache, fence, queue)) {
-        zeFenceDestroy(fence);
-    }
-}
-
 // kernel_cache
 kernel_cache::~kernel_cache() {
     if (!cache.empty()) {
@@ -110,6 +71,7 @@ kernel_cache::~kernel_cache() {
 
 void kernel_cache::clear() {
     LOG_DEBUG("clear kernel cache: size: ", cache.size());
+    std::lock_guard<std::mutex> lock(mutex);
     for (auto& key_value : cache) {
         ZE_CALL(zeKernelDestroy, (key_value.second));
     }
@@ -122,6 +84,7 @@ void kernel_cache::get(ze_module_handle_t module,
     CCL_THROW_IF_NOT(module);
     CCL_THROW_IF_NOT(!kernel_name.empty());
     CCL_THROW_IF_NOT(kernel);
+    std::lock_guard<std::mutex> lock(mutex);
     if (!get_from_cache(cache, *kernel, module, kernel_name)) {
         create_kernel(module, kernel_name, kernel);
     }
@@ -133,6 +96,7 @@ void kernel_cache::push(ze_module_handle_t module,
     CCL_THROW_IF_NOT(module);
     CCL_THROW_IF_NOT(!kernel_name.empty());
     CCL_THROW_IF_NOT(kernel);
+    std::lock_guard<std::mutex> lock(mutex);
     if (!push_to_cache(cache, kernel, module, kernel_name)) {
         ZE_CALL(zeKernelDestroy, (kernel));
     }
@@ -148,6 +112,7 @@ list_cache::~list_cache() {
 
 void list_cache::clear() {
     LOG_DEBUG("clear list cache: size: ", cache.size());
+    std::lock_guard<std::mutex> lock(mutex);
     for (auto& key_value : cache) {
         ZE_CALL(zeCommandListDestroy, (key_value.second));
     }
@@ -161,6 +126,7 @@ void list_cache::get(ze_context_handle_t context,
     CCL_THROW_IF_NOT(context);
     CCL_THROW_IF_NOT(device);
     CCL_THROW_IF_NOT(list);
+    std::lock_guard<std::mutex> lock(mutex);
     if (get_from_cache(
             cache, *list, context, device, list_desc.commandQueueGroupOrdinal, list_desc.flags)) {
         ZE_CALL(zeCommandListReset, (*list));
@@ -177,6 +143,7 @@ void list_cache::push(ze_context_handle_t context,
     CCL_THROW_IF_NOT(context);
     CCL_THROW_IF_NOT(device);
     CCL_THROW_IF_NOT(list);
+    std::lock_guard<std::mutex> lock(mutex);
     if (!push_to_cache(
             cache, list, context, device, list_desc.commandQueueGroupOrdinal, list_desc.flags)) {
         ZE_CALL(zeCommandListDestroy, (list));
@@ -193,6 +160,7 @@ queue_cache::~queue_cache() {
 
 void queue_cache::clear() {
     LOG_DEBUG("clear queue cache: size: ", cache.size());
+    std::lock_guard<std::mutex> lock(mutex);
     for (auto& key_value : cache) {
         ZE_CALL(zeCommandQueueDestroy, (key_value.second));
     }
@@ -206,6 +174,7 @@ void queue_cache::get(ze_context_handle_t context,
     CCL_THROW_IF_NOT(context);
     CCL_THROW_IF_NOT(device);
     CCL_THROW_IF_NOT(queue);
+    std::lock_guard<std::mutex> lock(mutex);
     if (!get_from_cache(cache,
                         *queue,
                         context,
@@ -226,6 +195,7 @@ void queue_cache::push(ze_context_handle_t context,
     CCL_THROW_IF_NOT(context);
     CCL_THROW_IF_NOT(device);
     CCL_THROW_IF_NOT(queue);
+    std::lock_guard<std::mutex> lock(mutex);
     if (!push_to_cache(cache,
                        queue,
                        context,
@@ -249,6 +219,7 @@ event_pool_cache::~event_pool_cache() {
 
 void event_pool_cache::clear() {
+    std::lock_guard<std::mutex> lock(mutex); // also guards the cache.size() read below
     LOG_DEBUG("clear event pool cache: size: ", cache.size());
     for (auto& key_value : cache) {
         ZE_CALL(zeEventPoolDestroy, (key_value.second));
     }
@@ -260,6 +231,7 @@ void event_pool_cache::get(ze_context_handle_t context,
                            ze_event_pool_handle_t* event_pool) {
     CCL_THROW_IF_NOT(context);
     CCL_THROW_IF_NOT(event_pool);
+    std::lock_guard<std::mutex> lock(mutex);
     // TODO: we can potentially use pool with count >= pool_desc.count
     if (!get_from_cache(cache, *event_pool, context, pool_desc.flags, pool_desc.count)) {
         ZE_CALL(zeEventPoolCreate, (context, &pool_desc, 0, nullptr, event_pool));
@@ -271,6 +243,7 @@ void event_pool_cache::push(ze_context_handle_t context,
                             ze_event_pool_handle_t event_pool) {
     CCL_THROW_IF_NOT(context);
     CCL_THROW_IF_NOT(event_pool);
+    std::lock_guard<std::mutex> lock(mutex);
     if (!push_to_cache(cache, event_pool, context, pool_desc.flags, pool_desc.count)) {
         ZE_CALL(zeEventPoolDestroy, (event_pool));
     }
@@ -286,9 +259,10 @@ device_mem_cache::~device_mem_cache() {
 
 void device_mem_cache::clear() {
+    std::lock_guard<std::mutex> lock(mutex); // also guards the cache.size() read below
     LOG_DEBUG("clear device memory cache: size: ", cache.size());
     //for (auto& key_value : cache) {
     // TODO: there is a segfault on this call, when ~cache is invoked w/ or w/0 api cache.
-    // But it passes, when CCL_KERNEL_CACHE=0 (calls of zeMemAllocDevice and ZeMemFree happen on every iteration).
+    // But it passes when CCL_ZE_CACHE=0 (calls of zeMemAllocDevice and zeMemFree happen on every iteration).
     // We don't control destroying phase and may be key_value.second (mem_ptr) is already away to free?
     // ZE_CALL(zeMemFree, (std::get<0>(key_value.first), key_value.second))
     //}
@@ -304,6 +278,7 @@ void device_mem_cache::get(ze_context_handle_t context,
     CCL_THROW_IF_NOT(context);
     CCL_THROW_IF_NOT(device);
     CCL_THROW_IF_NOT(pptr);
+    std::lock_guard<std::mutex> lock(mutex);
     if (!get_from_cache(cache,
                         *pptr,
                         context,
@@ -325,6 +300,7 @@ void device_mem_cache::push(ze_context_handle_t context,
     CCL_THROW_IF_NOT(context);
     CCL_THROW_IF_NOT(device);
     CCL_THROW_IF_NOT(ptr);
+    std::lock_guard<std::mutex> lock(mutex);
     if (!push_to_cache(cache,
                        ptr,
                        context,
@@ -383,20 +359,14 @@ void module_cache::load(ze_context_handle_t context,
     CCL_THROW_IF_NOT(device);
     CCL_THROW_IF_NOT(!spv_name.empty());
     CCL_THROW_IF_NOT(module);
-    std::string modules_dir = global_data::env().kernel_path;
-    // TODO: remove
-    if (modules_dir.empty()) {
-        std::string ccl_root = getenv("CCL_ROOT");
-        CCL_THROW_IF_NOT(!ccl_root.empty(), "incorrect comm kernels path, CCL_ROOT not found!");
-        modules_dir = ccl_root + "/lib/kernels/";
-    }
-    load_module(modules_dir, spv_name, device, context, module);
+    const std::string& modules_dir = global_data::env().kernel_path;
+    std::string file_path = modules_dir + spv_name;
+    load_module(file_path, device, context, module);
 }
 
 // cache
 cache::~cache() {
     for (size_t i = 0; i < instance_count; ++i) {
-        fences[i].clear();
         kernels[i].clear();
         lists[i].clear();
         queues[i].clear();
diff --git a/src/sched/entry/gpu/ze_cache.hpp b/src/sched/entry/ze/ze_cache.hpp
similarity index 88%
rename from src/sched/entry/gpu/ze_cache.hpp
rename to src/sched/entry/ze/ze_cache.hpp
index 1ee76ddb9..227d127d6 100644
--- a/src/sched/entry/gpu/ze_cache.hpp
+++ b/src/sched/entry/ze/ze_cache.hpp
@@ -15,34 +15,15 @@
 */
 #pragma once
 
+#include "common/log/log.hpp"
 #include "common/utils/hash.hpp"
-#include "sched/entry/gpu/ze_primitives.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
 
 #include <unordered_map>
 
 namespace ccl {
 namespace ze {
 
-class fence_cache {
-public:
-    fence_cache() = default;
-    ~fence_cache();
-
-    void clear();
-
-    void get(ze_command_queue_handle_t queue,
-             const ze_fence_desc_t& fence_desc,
-             ze_fence_handle_t* fence);
-    void push(ze_command_queue_handle_t queue,
-              const ze_fence_desc_t& fence_desc,
-              ze_fence_handle_t fence);
-
-private:
-    using key_t = typename std::tuple<ze_command_queue_handle_t>;
-    using value_t = ze_fence_handle_t;
-    std::unordered_multimap<key_t, value_t, utils::tuple_hash> cache;
-};
-
 class kernel_cache {
 public:
     kernel_cache() = default;
@@ -57,6 +38,7 @@ class kernel_cache {
     using key_t = typename std::tuple<ze_module_handle_t, std::string>;
     using value_t = ze_kernel_handle_t;
     std::unordered_multimap<key_t, value_t, utils::tuple_hash> cache;
+    std::mutex mutex;
 };
 
 // TODO: need to improve with ability to save list with commands for specific algo
@@ -81,6 +63,7 @@ class list_cache {
         tuple<ze_context_handle_t, ze_device_handle_t, uint32_t, ze_command_list_flags_t>;
     using value_t = ze_command_list_handle_t;
     std::unordered_multimap<key_t, value_t, utils::tuple_hash> cache;
+    std::mutex mutex;
 };
 
 class queue_cache {
@@ -109,6 +92,7 @@ class queue_cache {
                                       ze_command_queue_priority_t>;
     using value_t = ze_command_queue_handle_t;
     std::unordered_multimap<key_t, value_t, utils::tuple_hash> cache;
+    std::mutex mutex;
 };
 
 class event_pool_cache {
@@ -130,6 +114,7 @@ class event_pool_cache {
     using key_t = typename std::tuple<ze_context_handle_t, ze_event_pool_flags_t, uint32_t>;
     using value_t = ze_event_pool_handle_t;
     std::unordered_multimap<key_t, value_t, utils::tuple_hash> cache;
+    std::mutex mutex;
 };
 
 class device_mem_cache {
@@ -161,6 +146,7 @@ class device_mem_cache {
                                       uint32_t>;
     using value_t = void*;
     std::unordered_multimap<key_t, value_t, utils::tuple_hash> cache;
+    std::mutex mutex;
 };
 
 class module_cache {
@@ -191,7 +177,6 @@ class cache {
 public:
     cache(size_t instance_count)
             : instance_count(instance_count),
-              fences(instance_count),
               kernels(instance_count),
               lists(instance_count),
               queues(instance_count),
@@ -204,13 +189,6 @@ class cache {
     ~cache();
 
     /* get */
-    void get(size_t instance_idx,
-             ze_command_queue_handle_t queue,
-             const ze_fence_desc_t& fence_desc,
-             ze_fence_handle_t* fence) {
-        fences.at(instance_idx).get(queue, fence_desc, fence);
-    }
-
     void get(size_t instance_idx,
              ze_module_handle_t module,
              const std::string& kernel_name,
@@ -248,7 +226,7 @@ class cache {
              size_t bytes,
              size_t alignment,
              void** pptr) {
-        device_mems.at(instance_idx)
+        device_mems.at(instance_idx % device_mems.size())
             .get(context, device, device_mem_alloc_desc, bytes, alignment, pptr);
     }
 
@@ -260,13 +238,6 @@ class cache {
     }
 
     /* push */
-    void push(size_t instance_idx,
-              ze_command_queue_handle_t queue,
-              const ze_fence_desc_t& fence_desc,
-              ze_fence_handle_t fence) {
-        fences.at(instance_idx).push(queue, fence_desc, fence);
-    }
-
     void push(size_t instance_idx,
               ze_module_handle_t module,
               const std::string& kernel_name,
@@ -304,13 +275,12 @@ class cache {
               size_t bytes,
               size_t alignment,
               void* ptr) {
-        device_mems.at(instance_idx)
+        device_mems.at(instance_idx % device_mems.size())
             .push(context, device, device_mem_alloc_desc, bytes, alignment, ptr);
     }
 
 private:
     const size_t instance_count;
-    std::vector<fence_cache> fences;
     std::vector<kernel_cache> kernels;
     std::vector<list_cache> lists;
     std::vector<queue_cache> queues;
diff --git a/src/sched/entry/gpu/ze_call.cpp b/src/sched/entry/ze/ze_call.cpp
similarity index 95%
rename from src/sched/entry/gpu/ze_call.cpp
rename to src/sched/entry/ze/ze_call.cpp
index a1876b451..4ad61217a 100644
--- a/src/sched/entry/gpu/ze_call.cpp
+++ b/src/sched/entry/ze/ze_call.cpp
@@ -15,7 +15,8 @@
 */
 #include "common/global/global.hpp"
 #include "common/log/log.hpp"
-#include "sched/entry/gpu/ze_call.hpp"
+#include "sched/entry/ze/ze_call.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
 
 namespace ccl {
 namespace ze {
diff --git a/src/sched/entry/gpu/ze_call.hpp b/src/sched/entry/ze/ze_call.hpp
similarity index 100%
rename from src/sched/entry/gpu/ze_call.hpp
rename to src/sched/entry/ze/ze_call.hpp
diff --git a/src/sched/entry/gpu/ze_copy_entry.cpp b/src/sched/entry/ze/ze_copy_entry.cpp
similarity index 58%
rename from src/sched/entry/gpu/ze_copy_entry.cpp
rename to src/sched/entry/ze/ze_copy_entry.cpp
index ce11d0f59..9bd46309b 100644
--- a/src/sched/entry/gpu/ze_copy_entry.cpp
+++ b/src/sched/entry/ze/ze_copy_entry.cpp
@@ -13,9 +13,7 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "sched/entry/gpu/ze_copy_entry.hpp"
-
-#include <ze_api.h>
+#include "sched/entry/ze/ze_copy_entry.hpp"
 
 using namespace ccl;
 
@@ -24,30 +22,19 @@ ze_copy_entry::ze_copy_entry(ccl_sched* sched,
                              ccl_buffer out_buf,
                              size_t count,
                              const ccl_datatype& dtype,
-                             copy_attr attr)
-        : ze_base_entry(sched),
+                             const copy_attr& attr,
+                             std::vector<ze_event_handle_t> wait_events)
+        : ze_base_entry(sched, init_mode::copy, nullptr, 1, wait_events),
           sched(sched),
           in_buf(in_buf),
           out_buf(out_buf),
           dtype(dtype),
           attr(attr),
-          buf_size_bytes(dtype.size() * count) {
+          count(count) {
     CCL_THROW_IF_NOT(sched, "no sched");
 }
 
-ze_copy_entry::~ze_copy_entry() {
-    finalize();
-}
-
-void ze_copy_entry::init() {
-    if (ze_base_entry::is_initialized) {
-        return;
-    }
-
-    LOG_DEBUG("initialization");
-
-    ze_base_entry::init(init_mode::copy);
-
+void ze_copy_entry::init_ze_hook() {
     if (attr.peer_rank != ccl_comm::invalid_rank) {
         if (!out_buf) {
             sched->get_memory().handle_manager.get(
@@ -60,40 +47,10 @@ void ze_copy_entry::init() {
         }
     }
 
-    void* dst = out_buf.get_ptr();
+    void* dst = static_cast<char*>(out_buf.get_ptr()) + attr.out_buf_offset * dtype.size();
     void* src = static_cast<char*>(in_buf.get_ptr()) + attr.in_buf_offset * dtype.size();
     ze_command_list_handle_t list = ze_base_entry::get_copy_list();
 
     ZE_CALL(zeCommandListAppendMemoryCopy,
-            (list, dst, src, buf_size_bytes, ze_base_entry::entry_event, 0, nullptr));
-    ZE_CALL(zeCommandListClose, (list));
-
-    LOG_DEBUG("initialization complete");
-}
-
-void ze_copy_entry::start() {
-    init();
-
-    ze_base_entry::start();
-
-    status = ccl_sched_entry_status_started;
-}
-
-void ze_copy_entry::update() {
-    ze_base_entry::update();
-    if (status == ccl_sched_entry_status_complete && !sched->coll_attr.to_cache) {
-        finalize();
-    }
-}
-
-void ze_copy_entry::finalize() {
-    if (!ze_base_entry::is_initialized) {
-        return;
-    }
-
-    LOG_DEBUG("finalization");
-
-    ze_base_entry::finalize();
-
-    LOG_DEBUG("finalization complete");
+            (list, dst, src, dtype.size() * count, ze_base_entry::entry_event, 0, nullptr));
 }
diff --git a/src/sched/entry/gpu/ze_copy_entry.hpp b/src/sched/entry/ze/ze_copy_entry.hpp
similarity index 75%
rename from src/sched/entry/gpu/ze_copy_entry.hpp
rename to src/sched/entry/ze/ze_copy_entry.hpp
index 6052b3875..9cf5bca9c 100644
--- a/src/sched/entry/gpu/ze_copy_entry.hpp
+++ b/src/sched/entry/ze/ze_copy_entry.hpp
@@ -16,10 +16,7 @@
 #pragma once
 
 #include "sched/entry/copy/copy_helper.hpp"
-#include "sched/entry/entry.hpp"
-#include "sched/sched.hpp"
-
-#include "sched/entry/gpu/ze_base_entry.hpp"
+#include "sched/entry/ze/ze_base_entry.hpp"
 
 struct copy_attr;
 
@@ -33,18 +30,22 @@ class ze_copy_entry : public ze_base_entry {
         return class_name();
     }
 
+    virtual std::string name_ext() const override {
+        std::stringstream out;
+        out << name() << " ";
+        out << "size: " << count;
+        return out.str();
+    }
+
     explicit ze_copy_entry(ccl_sched* sched,
                            ccl_buffer in_buf,
                            ccl_buffer out_buf,
                            size_t count,
                            const ccl_datatype& dtype,
-                           copy_attr attr = {});
-    ~ze_copy_entry();
+                           const copy_attr& attr = {},
+                           std::vector<ze_event_handle_t> wait_events = {});
 
-    void init();
-    void start() override;
-    void update() override;
-    void finalize();
+    void init_ze_hook() override;
 
 private:
     ccl_sched* const sched;
@@ -52,5 +53,5 @@ class ze_copy_entry : public ze_base_entry {
     ccl_buffer out_buf{};
     const ccl_datatype& dtype;
     const copy_attr attr;
-    const size_t buf_size_bytes;
+    const size_t count;
 };
diff --git a/src/sched/entry/gpu/ze_event_signal_entry.cpp b/src/sched/entry/ze/ze_event_signal_entry.cpp
similarity index 55%
rename from src/sched/entry/gpu/ze_event_signal_entry.cpp
rename to src/sched/entry/ze/ze_event_signal_entry.cpp
index 1ff9c3279..a0a2ee453 100644
--- a/src/sched/entry/gpu/ze_event_signal_entry.cpp
+++ b/src/sched/entry/ze/ze_event_signal_entry.cpp
@@ -13,7 +13,7 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "sched/entry/gpu/ze_event_signal_entry.hpp"
+#include "sched/entry/ze/ze_event_signal_entry.hpp"
 #include "sched/queue/queue.hpp"
 #include "common/utils/sycl_utils.hpp"
 
@@ -24,17 +24,41 @@ ze_event_signal_entry::ze_event_signal_entry(ccl_sched* sched, ccl_master_sched*
     CCL_THROW_IF_NOT(master_sched, "no master_sched");
 }
 
+ze_event_signal_entry::ze_event_signal_entry(ccl_sched* sched, ze_event_handle_t event)
+        : sched_entry(sched),
+          event(event) {
+    CCL_THROW_IF_NOT(sched, "no sched");
+}
+
 void ze_event_signal_entry::start() {
-    LOG_DEBUG("signal event: ", master_sched->get_memory().sync_event);
-    ZE_CALL(zeEventHostSignal, (master_sched->get_memory().sync_event));
+    auto signal_event = (master_sched) ? master_sched->get_memory().sync_event : event;
+    LOG_DEBUG("signal event: ", signal_event);
+    ZE_CALL(zeEventHostSignal, (signal_event));
 
     status = ccl_sched_entry_status_started;
 }
 
-void ze_event_signal_entry::update() {
+void ze_event_signal_entry::handle_sycl_event_status() {
     if (ccl::utils::is_sycl_event_completed(master_sched->get_native_event()) &&
         ccl::utils::is_sycl_event_completed(master_sched->get_sync_event())) {
         LOG_DEBUG("native and sync events are completed");
         status = ccl_sched_entry_status_complete;
+        if (ccl::global_data::env().enable_kernel_profile) {
+            auto native_dev = sched->coll_param.stream->get_ze_device();
+            auto native_event_time = ccl::ze::calculate_event_time(
+                sycl::get_native<ccl::utils::get_level_zero_backend()>(
+                    master_sched->get_native_event()),
+                native_dev);
+            master_sched->get_kernel_timer().set_operation_event_time(native_event_time);
+        }
+    }
+}
+
+void ze_event_signal_entry::update() {
+    if (master_sched) {
+        handle_sycl_event_status();
+    }
+    else {
+        status = ccl_sched_entry_status_complete;
     }
 }
diff --git a/src/sched/entry/gpu/ze_event_signal_entry.hpp b/src/sched/entry/ze/ze_event_signal_entry.hpp
similarity index 83%
rename from src/sched/entry/gpu/ze_event_signal_entry.hpp
rename to src/sched/entry/ze/ze_event_signal_entry.hpp
index 4b9b2c4f4..380f9ecb6 100644
--- a/src/sched/entry/gpu/ze_event_signal_entry.hpp
+++ b/src/sched/entry/ze/ze_event_signal_entry.hpp
@@ -16,9 +16,10 @@
 #pragma once
 
 #include "sched/entry/entry.hpp"
-#include "sched/master_sched.hpp"
 #include "sched/sched.hpp"
 
+class ccl_master_sched;
+
 class ze_event_signal_entry : public sched_entry {
 public:
     static constexpr const char* class_name() noexcept {
@@ -29,17 +30,17 @@ class ze_event_signal_entry : public sched_entry {
         return class_name();
     }
 
-    bool is_strict_order_satisfied() override {
-        return (status >= ccl_sched_entry_status_complete);
-    }
-
     ze_event_signal_entry() = delete;
     explicit ze_event_signal_entry(ccl_sched* sched, ccl_master_sched* master_sched);
+    explicit ze_event_signal_entry(ccl_sched* sched, ze_event_handle_t event);
     ze_event_signal_entry(const ze_event_signal_entry&) = delete;
 
     void start() override;
     void update() override;
 
 private:
-    ccl_master_sched* const master_sched;
+    ccl_master_sched* const master_sched{};
+    const ze_event_handle_t event{};
+
+    void handle_sycl_event_status();
 };
diff --git a/src/sched/entry/gpu/ze_event_wait_entry.cpp b/src/sched/entry/ze/ze_event_wait_entry.cpp
similarity index 53%
rename from src/sched/entry/gpu/ze_event_wait_entry.cpp
rename to src/sched/entry/ze/ze_event_wait_entry.cpp
index c317ed81d..fb27ef4ef 100644
--- a/src/sched/entry/gpu/ze_event_wait_entry.cpp
+++ b/src/sched/entry/ze/ze_event_wait_entry.cpp
@@ -13,38 +13,40 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "sched/entry/gpu/ze_event_wait_entry.hpp"
+#include "sched/entry/ze/ze_event_wait_entry.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
 
-#include <ze_api.h>
-
-ze_event_wait_entry::ze_event_wait_entry(ccl_sched* sched, ze_event_handle_t event)
+ze_event_wait_entry::ze_event_wait_entry(ccl_sched* sched,
+                                         std::vector<ze_event_handle_t> wait_events)
         : sched_entry(sched),
-          event(event) {
+          wait_events(wait_events.cbegin(), wait_events.cend()) {
     CCL_THROW_IF_NOT(sched, "no sched");
-    CCL_THROW_IF_NOT(event, "no event");
 }
 
-void ze_event_wait_entry::check_event_status() {
+bool ze_event_wait_entry::check_event_status(ze_event_handle_t event) const {
     auto query_status = zeEventQueryStatus(event);
     if (query_status == ZE_RESULT_SUCCESS) {
-        LOG_DEBUG("event complete");
-        status = ccl_sched_entry_status_complete;
-    }
-    else if (query_status == ZE_RESULT_NOT_READY) {
-        // just return in case if the kernel is not ready yet, will check again on the next iteration
-        return;
+        return true;
     }
-    else {
-        CCL_THROW("error at zeEventQueryStatus");
+    else if (query_status != ZE_RESULT_NOT_READY) {
+        CCL_THROW("ze error at zeEventQueryStatus, code: ", ccl::ze::to_string(query_status));
     }
+
+    return false;
 }
 
 void ze_event_wait_entry::start() {
     LOG_DEBUG("start event waiting");
     status = ccl_sched_entry_status_started;
-    check_event_status();
 }
 
 void ze_event_wait_entry::update() {
-    check_event_status();
+    for (auto it = wait_events.begin(); it != wait_events.end();) {
+        bool is_completed = check_event_status(*it);
+        if (!is_completed) {
+            return;
+        }
+        it = wait_events.erase(it);
+    }
+    status = ccl_sched_entry_status_complete;
 }
diff --git a/src/sched/entry/gpu/ze_event_wait_entry.hpp b/src/sched/entry/ze/ze_event_wait_entry.hpp
similarity index 78%
rename from src/sched/entry/gpu/ze_event_wait_entry.hpp
rename to src/sched/entry/ze/ze_event_wait_entry.hpp
index 23cbb0092..9f16b5ad3 100644
--- a/src/sched/entry/gpu/ze_event_wait_entry.hpp
+++ b/src/sched/entry/ze/ze_event_wait_entry.hpp
@@ -28,17 +28,13 @@ class ze_event_wait_entry : public sched_entry {
         return class_name();
     }
 
-    bool is_strict_order_satisfied() override {
-        return (status >= ccl_sched_entry_status_complete);
-    }
-
-    explicit ze_event_wait_entry(ccl_sched* sched, ze_event_handle_t event);
+    explicit ze_event_wait_entry(ccl_sched* sched, std::vector<ze_event_handle_t> wait_events);
 
     void start() override;
     void update() override;
 
 private:
-    const ze_event_handle_t event;
+    std::list<ze_event_handle_t> wait_events;
 
-    void check_event_status();
+    bool check_event_status(ze_event_handle_t event) const;
 };
diff --git a/src/sched/entry/gpu/ze_handle_exchange_entry.cpp b/src/sched/entry/ze/ze_handle_exchange_entry.cpp
similarity index 87%
rename from src/sched/entry/gpu/ze_handle_exchange_entry.cpp
rename to src/sched/entry/ze/ze_handle_exchange_entry.cpp
index 6911108ca..8f125f7b0 100644
--- a/src/sched/entry/gpu/ze_handle_exchange_entry.cpp
+++ b/src/sched/entry/ze/ze_handle_exchange_entry.cpp
@@ -13,12 +13,11 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "sched/entry/gpu/ze_handle_exchange_entry.hpp"
+#include "sched/entry/ze/ze_handle_exchange_entry.hpp"
 #include "sched/queue/queue.hpp"
-#include "sched/ze_handle_manager.hpp"
+#include "sched/ze/ze_handle_manager.hpp"
 
 #include <arpa/inet.h>
-#include <CL/sycl/backend/level_zero.hpp>
 #include <errno.h>
 #include <fcntl.h>
 #include <netdb.h>
@@ -43,10 +42,15 @@ ze_handle_exchange_entry::ze_handle_exchange_entry(ccl_sched* sched,
           rank(comm->rank()),
           comm_size(comm->size()),
           skip_rank(skip_rank) {
-    LOG_DEBUG("initialization");
+    LOG_DEBUG("init");
+
     CCL_THROW_IF_NOT(sched, "no sched");
     CCL_THROW_IF_NOT(!in_buffers.empty(), "in_buffers should be non empty");
 
+    if (comm_size == 1) {
+        skip_rank = rank;
+    }
+
     poll_fds.reserve(max_pfds);
 
     handles.resize(comm_size);
@@ -115,7 +119,7 @@ ze_handle_exchange_entry::ze_handle_exchange_entry(ccl_sched* sched,
         left_peer_socket_name += std::string("-") + mpi_uuid;
     }
 
-    LOG_DEBUG("initialization complete");
+    LOG_DEBUG("init completed");
 }
 
 ze_handle_exchange_entry::~ze_handle_exchange_entry() {
@@ -127,6 +131,10 @@ void ze_handle_exchange_entry::start() {
     start_buf_idx = start_peer_idx = 0;
     skip_first_send = false;
     status = ccl_sched_entry_status_started;
+
+    if (comm_size == 1) {
+        status = ccl_sched_entry_status_complete;
+    }
 }
 
 void ze_handle_exchange_entry::update() {
@@ -249,6 +257,10 @@ void ze_handle_exchange_entry::update() {
 
     sched->get_memory().handle_manager.set(handles);
 
+    if (ccl::global_data::env().enable_close_fd_wa) {
+        close_sockets();
+    }
+
     status = ccl_sched_entry_status_complete;
 
     LOG_DEBUG("completed: ", name());
@@ -359,6 +371,39 @@ int ze_handle_exchange_entry::connect_call(int sock,
     return 0;
 }
 
+int ze_handle_exchange_entry::check_msg_retval(std::string operation_name,
+                                               ssize_t bytes,
+                                               struct iovec iov,
+                                               struct msghdr msg,
+                                               size_t union_size,
+                                               int sock,
+                                               int fd) {
+    LOG_DEBUG(operation_name,
+              ": ",
+              bytes,
+              ", expected_bytes:",
+              iov.iov_len,
+              ", expected size of cntr_buf: ",
+              union_size,
+              " -> gotten cntr_buf: ",
+              msg.msg_controllen,
+              ", socket: ",
+              sock,
+              ", fd: ",
+              fd);
+    int ret = -1;
+    if (bytes == static_cast<ssize_t>(iov.iov_len)) {
+        ret = 0;
+    }
+    else if (bytes < 0) {
+        ret = -errno;
+    }
+    else {
+        ret = -EIO;
+    }
+    return ret;
+}
+
 void ze_handle_exchange_entry::sendmsg_fd(int sock, int fd, size_t mem_offset) {
     CCL_THROW_IF_NOT(fd > 0, "unexpected fd value");
 
@@ -366,10 +411,14 @@ void ze_handle_exchange_entry::sendmsg_fd(int sock, int fd, size_t mem_offset) {
     iov.iov_base = &mem_offset;
     iov.iov_len = sizeof(size_t);
 
-    char ctrl_buf[CMSG_SPACE(sizeof(fd))]{};
+    union {
+        struct cmsghdr align;
+        char cntr_buf[CMSG_SPACE(sizeof(int))]{};
+    } u;
+
     struct msghdr msg {};
-    msg.msg_control = ctrl_buf;
-    msg.msg_controllen = CMSG_SPACE(sizeof(fd));
+    msg.msg_control = u.cntr_buf;
+    msg.msg_controllen = sizeof(u.cntr_buf);
     msg.msg_iov = &iov;
     msg.msg_iovlen = 1;
 
@@ -380,17 +429,12 @@ void ze_handle_exchange_entry::sendmsg_fd(int sock, int fd, size_t mem_offset) {
     *reinterpret_cast<int*>(CMSG_DATA(cmsg)) = fd;
 
     ssize_t send_bytes = sendmsg(sock, &msg, 0);
-    CCL_THROW_IF_NOT(send_bytes >= 0,
-                     "sendmsg error: ",
-                     send_bytes,
-                     ", socket: ",
-                     sock,
-                     ", fd: ",
-                     fd,
-                     ", from: ",
-                     comm->rank(),
-                     ", errno: ",
-                     strerror(errno));
+    CCL_THROW_IF_NOT(
+        !check_msg_retval("sendmsg", send_bytes, iov, msg, sizeof(u.cntr_buf), sock, fd),
+        ", from: ",
+        comm->rank(),
+        ", errno: ",
+        strerror(errno));
 }
 
 void ze_handle_exchange_entry::recvmsg_fd(int sock, int& fd, size_t& mem_offset) {
@@ -399,28 +443,35 @@ void ze_handle_exchange_entry::recvmsg_fd(int sock, int& fd, size_t& mem_offset)
     iov.iov_base = &buf;
     iov.iov_len = sizeof(size_t);
 
-    char ctrl_buf[CMSG_SPACE(sizeof(int))]{};
+    union {
+        struct cmsghdr align;
+        char cntr_buf[CMSG_SPACE(sizeof(int))]{};
+    } u;
+
     struct msghdr msg {};
-    msg.msg_control = ctrl_buf;
-    msg.msg_controllen = CMSG_SPACE(sizeof(int));
+    msg.msg_control = u.cntr_buf;
+    msg.msg_controllen = sizeof(u.cntr_buf);
     msg.msg_iov = &iov;
     msg.msg_iovlen = 1;
 
     ssize_t recv_bytes = recvmsg(sock, &msg, 0);
-    CCL_THROW_IF_NOT(recv_bytes >= 0,
-                     "recvmsg error: ",
-                     recv_bytes,
-                     ", socket: ",
-                     sock,
-                     ", fd: ",
-                     fd,
-                     ", from: ",
-                     comm->rank(),
-                     ", errno: ",
-                     strerror(errno));
-
-    if (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
-        CCL_THROW("control message is truncated");
+    CCL_THROW_IF_NOT(
+        !check_msg_retval("recvmsg", recv_bytes, iov, msg, sizeof(u.cntr_buf), sock, fd),
+        ", from: ",
+        comm->rank(),
+        ", errno: ",
+        strerror(errno));
+
+    if (msg.msg_flags & (MSG_CTRUNC | MSG_TRUNC)) {
+        std::string flag_str = "";
+        if (msg.msg_flags & MSG_CTRUNC) {
+            flag_str += " MSG_CTRUNC";
+        }
+        if (msg.msg_flags & MSG_TRUNC) {
+            flag_str += " MSG_TRUNC";
+        }
+
+        CCL_THROW("control or usual message is truncated:", flag_str);
     }
 
     for (auto cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
diff --git a/src/sched/entry/gpu/ze_handle_exchange_entry.hpp b/src/sched/entry/ze/ze_handle_exchange_entry.hpp
similarity index 91%
rename from src/sched/entry/gpu/ze_handle_exchange_entry.hpp
rename to src/sched/entry/ze/ze_handle_exchange_entry.hpp
index 9fce2662e..94ecebcef 100644
--- a/src/sched/entry/gpu/ze_handle_exchange_entry.hpp
+++ b/src/sched/entry/ze/ze_handle_exchange_entry.hpp
@@ -17,9 +17,9 @@
 
 #include "common/comm/comm.hpp"
 #include "sched/entry/entry.hpp"
-#include "sched/entry/gpu/ze_primitives.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
 #include "sched/sched.hpp"
-#include "sched/ze_handle_manager.hpp"
+#include "sched/ze/ze_handle_manager.hpp"
 
 #include <poll.h>
 #include <sys/un.h>
@@ -47,10 +47,6 @@ class ze_handle_exchange_entry : public sched_entry {
     void start() override;
     void update() override;
 
-    bool is_strict_order_satisfied() noexcept override {
-        return (status >= ccl_sched_entry_status_complete);
-    }
-
 protected:
     void dump_detail(std::stringstream& str) const override {
         ccl_logger::format(str,
@@ -131,6 +127,13 @@ class ze_handle_exchange_entry : public sched_entry {
 
     void sendmsg_call(int sock, int fd, size_t mem_offset);
     void recvmsg_call(int sock, int& fd, size_t& mem_offset);
+    int check_msg_retval(std::string operation_name,
+                         ssize_t bytes,
+                         struct iovec iov,
+                         struct msghdr msg,
+                         size_t union_size,
+                         int sock,
+                         int fd);
 
     using mem_info_t = typename std::pair<void*, size_t>;
     mem_info_t get_mem_info(const void* ptr);
diff --git a/src/sched/entry/ze/ze_onesided_reduce_entry.cpp b/src/sched/entry/ze/ze_onesided_reduce_entry.cpp
new file mode 100644
index 000000000..cc290aa42
--- /dev/null
+++ b/src/sched/entry/ze/ze_onesided_reduce_entry.cpp
@@ -0,0 +1,163 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "common/stream/stream.hpp"
+#include "sched/entry/ze/ze_cache.hpp"
+#include "sched/entry/ze/ze_onesided_reduce_entry.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
+#include "sched/queue/queue.hpp"
+
+#include <string>
+
+using namespace ccl;
+using namespace ccl::ze;
+
+ze_onesided_reduce_entry::ze_onesided_reduce_entry(ccl_sched* sched,
+                                                   ccl_buffer send_buf,
+                                                   ccl_buffer recv_buf,
+                                                   size_t cnt,
+                                                   const ccl_datatype& dtype,
+                                                   reduction op,
+                                                   int root,
+                                                   ccl_comm* comm,
+                                                   std::vector<ze_event_handle_t> wait_events)
+        : ze_base_entry(sched,
+                        global_data::env().enable_kernel_1s_copy_ops
+                            ? (init_mode::compute | init_mode::copy)
+                            : init_mode::compute,
+                        comm,
+                        2 /* request additional events (init_ze_hook() creates up to two) */,
+                        wait_events),
+          send_buf(send_buf),
+          recv_buf(recv_buf),
+          cnt(cnt),
+          dtype(dtype),
+          op(op),
+          root(root),
+          buf_size_bytes(dtype.size() * cnt), // used as the byte count of the peer copy
+          is_initialized(false),
+          empty_kernel_event(nullptr),
+          empty_kernel(nullptr),
+          empty_kernel_name("empty_kernel") {} // kernels, events, raw pointers: see init_ze_hook()
+
+// Root-only hook: gets the reduce kernel from ze_cache, then appends a copy of the next rank's
+// send buffer into a temporary buffer followed by the launch of that kernel.
+void ze_onesided_reduce_entry::init_ze_hook() {
+    CCL_THROW_IF_NOT(comm_rank == root, "unexpected comm_rank ", comm_rank, ", expected ", root);
+    /* create kernels */
+    ccl_buffer right_send_buf;
+    int peer_rank = (comm_rank + 1) % comm_size;
+    sched->get_memory().handle_manager.get(peer_rank, 0, right_send_buf, comm);
+    LOG_DEBUG(
+        "get IPC pointers from ", peer_rank, " by ", root, ", right_send_buf: ", right_send_buf);
+
+    send_buf_ptr = send_buf.get_ptr();
+    recv_buf_ptr = recv_buf.get_ptr();
+    // TODO: in place case check! diff idx for handle_mngr
+    right_send_buf_ptr = right_send_buf.get_ptr();
+    // temporary buffer (buffer_place::device) that later receives the copy of right_send_buf
+    ccl::alloc_param alloc_param(buf_size_bytes, buffer_type::ze, buffer_place::device);
+    void* tmp_buf_ptr = sched->alloc_buffer(alloc_param).get_ptr();
+
+    ze_kernel_args_t reduce_local_kernel_args{ &comm_rank,    &comm_size,   &cnt,
+                                               &send_buf_ptr, &tmp_buf_ptr, &recv_buf_ptr };
+
+    ccl::global_data::get().ze_cache->get(context, device, "kernels.spv", &module);
+
+    main_kernel_name =
+        "reduce_local_outofplace_kernel_" + to_string(dtype.idx()) + "_" + ccl_reduction_to_str(op);
+    LOG_DEBUG("get kernel: name: ", main_kernel_name);
+    ccl::global_data::get().ze_cache->get(worker_idx, module, main_kernel_name, &main_kernel);
+
+    LOG_DEBUG("kernel ", main_kernel, " args:\n", to_string(reduce_local_kernel_args));
+    set_kernel_args(main_kernel, reduce_local_kernel_args);
+
+    ze_group_size_t group_size;
+    get_suggested_group_size(main_kernel, cnt, &group_size);
+    LOG_DEBUG("suggested group size: ", to_string(group_size));
+
+    get_suggested_group_count(group_size, cnt, &group_count);
+    LOG_DEBUG("suggested group count: ", to_string(group_count));
+
+    ZE_CALL(zeKernelSetGroupSize,
+            (main_kernel, group_size.groupSizeX, group_size.groupSizeY, group_size.groupSizeZ));
+
+    if (ccl::global_data::env().enable_kernel_1s_ipc_wa) {
+        LOG_DEBUG("get kernel: name: ", empty_kernel_name);
+        ccl::global_data::get().ze_cache->get(worker_idx, module, empty_kernel_name, &empty_kernel);
+        CCL_THROW_IF_NOT(empty_kernel, "null empty_kernel");
+        /* reuses the main kernel args; NOTE(review): none of them is a peer mem pointer */
+        set_kernel_args(empty_kernel, reduce_local_kernel_args);
+    }
+
+    if (empty_kernel) {
+        empty_kernel_event = ze_base_entry::create_event();
+    }
+
+    copy_from_peer_event = ze_base_entry::create_event();
+
+    /* do appends */
+    if (empty_kernel) {
+        LOG_DEBUG("append empty kernel");
+        ze_group_count_t empty_group_count = { 1, 1, 1 };
+        ZE_CALL(
+            zeCommandListAppendLaunchKernel,
+            (get_comp_list(), empty_kernel, &empty_group_count, empty_kernel_event, 0, nullptr));
+    }
+
+    LOG_DEBUG("one-sided multi-phase algorithm");
+    // copy right_send_buf into tmp_buf; the copy waits for empty_kernel_event when it was created
+    ZE_CALL(zeCommandListAppendMemoryCopy,
+            (ze_base_entry::get_copy_list(),
+             tmp_buf_ptr,
+             right_send_buf_ptr,
+             buf_size_bytes,
+             copy_from_peer_event,
+             (empty_kernel_event) ? 1 : 0,
+             &empty_kernel_event));
+    // the main kernel waits for copy_from_peer_event and signals entry_event
+    ZE_CALL(zeCommandListAppendLaunchKernel,
+            (get_comp_list(), main_kernel, &group_count, entry_event, 1, &copy_from_peer_event));
+}
+
+void ze_onesided_reduce_entry::finalize_ze_hook() { // pushes the used kernels to ze_cache
+    if (empty_kernel_event) { // assigned in init_ze_hook() only when empty_kernel was obtained
+        ccl::global_data::get().ze_cache->push(worker_idx, module, empty_kernel_name, empty_kernel);
+    }
+    ccl::global_data::get().ze_cache->push(worker_idx, module, main_kernel_name, main_kernel);
+}
+
+// With enable_kernel_sync the entry starts only if the global kernel_counter was 0 before it.
+void ze_onesided_reduce_entry::start() {
+    size_t kernel_counter = 0;
+    if (ccl::global_data::env().enable_kernel_sync) {
+        kernel_counter = ccl::global_data::get().kernel_counter++; // previous value is kept
+    }
+    if (kernel_counter == 0) {
+        ze_base_entry::start();
+    }
+    else {
+        ccl::global_data::get().kernel_counter--; // undo the increment made above
+        status = ccl_sched_entry_status_again;
+    }
+}
+
+// Calls the base update(), then with enable_kernel_sync decrements a non-zero kernel_counter.
+void ze_onesided_reduce_entry::update() {
+    ze_base_entry::update();
+    if (ccl::global_data::env().enable_kernel_sync && ccl::global_data::get().kernel_counter > 0) {
+        ccl::global_data::get().kernel_counter--; // NOTE(review): not gated on status - confirm
+    }
+}
diff --git a/src/sched/entry/gpu/ze_reduce_entry.hpp b/src/sched/entry/ze/ze_onesided_reduce_entry.hpp
similarity index 70%
rename from src/sched/entry/gpu/ze_reduce_entry.hpp
rename to src/sched/entry/ze/ze_onesided_reduce_entry.hpp
index 48f45eeb6..254796adb 100644
--- a/src/sched/entry/gpu/ze_reduce_entry.hpp
+++ b/src/sched/entry/ze/ze_onesided_reduce_entry.hpp
@@ -17,43 +17,47 @@
 
 #include "common/utils/buffer.hpp"
 #include "comp/comp.hpp"
-#include "sched/entry/gpu/ze_base_entry.hpp"
+#include "sched/entry/ze/ze_base_entry.hpp"
 
 #include <atomic>
 #include <sstream>
 
-class ze_reduce_entry : public ze_base_entry {
+class ze_onesided_reduce_entry : public ze_base_entry {
 public:
     static constexpr const char* class_name() noexcept {
-        return "ZE_REDUCE";
+        return "ZE_1S_REDUCE";
     }
 
     const char* name() const noexcept override {
         return class_name();
     }
 
-    ze_reduce_entry() = delete;
-    explicit ze_reduce_entry(ccl_sched* sched,
-                             ccl_buffer send_buf,
-                             ccl_buffer recv_buf,
-                             size_t cnt,
-                             const ccl_datatype& dtype,
-                             ccl::reduction op,
-                             int root,
-                             ccl_comm* comm);
-    ~ze_reduce_entry();
-
-    void init();
+    virtual std::string name_ext() const override {
+        std::stringstream out;
+        out << name() << " ";
+        out << "size: " << cnt;
+        return out.str();
+    }
+
+    ze_onesided_reduce_entry() = delete;
+    explicit ze_onesided_reduce_entry(ccl_sched* sched,
+                                      ccl_buffer send_buf,
+                                      ccl_buffer recv_buf,
+                                      size_t cnt,
+                                      const ccl_datatype& dtype,
+                                      ccl::reduction op,
+                                      int root,
+                                      ccl_comm* comm,
+                                      std::vector<ze_event_handle_t> wait_events = {});
+
+    void init_ze_hook() override;
+    void finalize_ze_hook() override;
+
     void start() override;
     void update() override;
-    void finalize();
 
     void reset_sync_objects();
 
-    bool is_strict_order_satisfied() override {
-        return (status >= ccl_sched_entry_status_complete);
-    }
-
 protected:
     void dump_detail(std::stringstream& str) const override {
         ccl_logger::format(str,
@@ -80,7 +84,6 @@ class ze_reduce_entry : public ze_base_entry {
     void* send_buf_ptr;
     void* recv_buf_ptr;
     void* right_send_buf_ptr;
-    void* tmp_buf_ptr;
     const unsigned long cnt;
     const ccl_datatype dtype;
     const ccl::reduction op;
@@ -91,8 +94,6 @@ class ze_reduce_entry : public ze_base_entry {
     ze_event_handle_t empty_kernel_event;
     ze_event_handle_t copy_from_peer_event;
 
-    ze_module_handle_t module;
-
     ze_group_count_t group_count;
 
     ze_kernel_handle_t main_kernel;
diff --git a/src/sched/entry/gpu/ze_primitives.cpp b/src/sched/entry/ze/ze_primitives.cpp
similarity index 71%
rename from src/sched/entry/gpu/ze_primitives.cpp
rename to src/sched/entry/ze/ze_primitives.cpp
index 89104fd7d..d6e3a7a48 100644
--- a/src/sched/entry/gpu/ze_primitives.cpp
+++ b/src/sched/entry/ze/ze_primitives.cpp
@@ -18,30 +18,35 @@
 
 #include "common/global/global.hpp"
 #include "common/log/log.hpp"
-#include "sched/entry/gpu/ze_primitives.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
 
 namespace ccl {
 
 namespace ze {
 
-void load_module(std::string dir,
-                 std::string file_name,
+// Returns the text of the given module build log, or an empty string when the log has no data.
+std::string get_build_log_string(ze_module_build_log_handle_t build_log) {
+    size_t log_size{};
+    // with a null buffer the call only reports the required size
+    ZE_CALL(zeModuleBuildLogGetString, (build_log, &log_size, nullptr));
+    if (!log_size) {
+        LOG_DEBUG("empty build log");
+        return {};
+    }
+    std::string log(log_size, '\0');
+    ZE_CALL(zeModuleBuildLogGetString, (build_log, &log_size, &log[0]));
+    return log;
+}
+
+void load_module(const std::string& file_path,
                  ze_device_handle_t device,
                  ze_context_handle_t context,
                  ze_module_handle_t* module) {
-    LOG_DEBUG("module loading started: directory: ", dir, ", file: ", file_name);
+    LOG_DEBUG("module loading started: file: ", file_path);
+    CCL_THROW_IF_NOT(!file_path.empty(), "no file");
 
-    if (!dir.empty()) {
-        if (*dir.rbegin() != '/') {
-            dir += '/';
-        }
-    }
-
-    std::string file_path = dir + file_name;
     std::ifstream file(file_path, std::ios_base::in | std::ios_base::binary);
-    if (!file.good() || dir.empty() || file_name.empty()) {
-        CCL_THROW("failed to load module: file: ", file_path);
-    }
+    CCL_THROW_IF_NOT(file.good(), "failed to load module: file: ", file_path);
 
     file.seekg(0, file.end);
     size_t filesize = file.tellg();
@@ -51,13 +56,22 @@ void load_module(std::string dir,
     file.read(reinterpret_cast<char*>(module_data.data()), filesize);
     file.close();
 
-    ze_module_desc_t desc = {};
+    ze_module_build_log_handle_t build_log{};
+    ze_module_desc_t desc{};
     ze_module_format_t format = ZE_MODULE_FORMAT_IL_SPIRV;
     desc.format = format;
     desc.pInputModule = reinterpret_cast<const uint8_t*>(module_data.data());
     desc.inputSize = module_data.size();
-    ZE_CALL(zeModuleCreate, (context, device, &desc, module, nullptr));
-    LOG_DEBUG("module loading completed: directory: ", dir, ", file: ", file_name);
+
+    // keep the result: the build log is read and destroyed before a failure is reported, so the
+    // throw below no longer leaks the log handle
+    const ze_result_t res = zeModuleCreate(context, device, &desc, module, &build_log);
+    const std::string log = (res == ZE_RESULT_SUCCESS) ? "" : get_build_log_string(build_log);
+    zeModuleBuildLogDestroy(build_log);
+    if (res != ZE_RESULT_SUCCESS) {
+        CCL_THROW("failed to create module: ", file_path, ", log: ", log);
+    }
+    LOG_DEBUG("module loading completed: file: ", file_path);
 }
 
 void create_kernel(ze_module_handle_t module, std::string kernel_name, ze_kernel_handle_t* kernel) {
@@ -72,68 +86,70 @@ void create_kernel(ze_module_handle_t module, std::string kernel_name, ze_kernel
 }
 
 void get_suggested_group_size(ze_kernel_handle_t kernel,
-                              size_t count,
+                              size_t elem_count,
                               ze_group_size_t* group_size) {
-    CCL_ASSERT(count > 0, "count == 0");
+    group_size->groupSizeX = group_size->groupSizeY = group_size->groupSizeZ = 1;
+    if (!elem_count) {
+        return;
+    }
+
     ZE_CALL(zeKernelSuggestGroupSize,
             (kernel,
-             count,
+             elem_count,
              1,
              1,
              &group_size->groupSizeX,
              &group_size->groupSizeY,
              &group_size->groupSizeZ));
+
     CCL_THROW_IF_NOT(group_size->groupSizeX >= 1,
-                     "wrong group size calculation: group size: ",
+                     "wrong group size calculation: size: ",
                      to_string(*group_size),
-                     ", count: ",
-                     count);
+                     ", elem_count: ",
+                     elem_count);
 }
 
 void get_suggested_group_count(const ze_group_size_t& group_size,
-                               size_t count,
+                               size_t elem_count,
                                ze_group_count_t* group_count) {
-    group_count->groupCountX = count / group_size.groupSizeX;
+    group_count->groupCountX = std::max((elem_count / group_size.groupSizeX), 1ul);
     group_count->groupCountY = 1;
     group_count->groupCountZ = 1;
 
-    auto rem = count % group_size.groupSizeX;
+    auto rem = elem_count % group_size.groupSizeX;
     CCL_THROW_IF_NOT(group_count->groupCountX >= 1 && rem == 0,
-                     "wrong group count calculation: group size: ",
+                     "wrong group calculation: size: ",
                      to_string(group_size),
-                     ", group count: ",
-                     to_string(*group_count),
                      ", count: ",
-                     std::to_string(count));
+                     to_string(*group_count),
+                     ", elem_count: ",
+                     std::to_string(elem_count));
 }
 
 void set_kernel_args(ze_kernel_handle_t kernel, const ze_kernel_args_t& kernel_args) {
     uint32_t idx = 0;
     for (const auto& arg : kernel_args) {
-        auto res = zeKernelSetArgumentValue(kernel, idx, arg.first, arg.second);
+        auto res = zeKernelSetArgumentValue(kernel, idx, arg.size, arg.ptr);
         if (res != ZE_RESULT_SUCCESS) {
             CCL_THROW("zeKernelSetArgumentValue failed with error ",
                       to_string(res),
                       " on idx ",
                       idx,
                       " with value ",
-                      *((void**)arg.second));
+                      *((void**)arg.ptr));
         }
         ++idx;
     }
 }
 
-void get_num_queue_groups(ze_device_handle_t device, uint32_t* num) {
-    *num = 0;
-    ZE_CALL(zeDeviceGetCommandQueueGroupProperties, (device, num, nullptr));
-    CCL_THROW_IF_NOT(*num != 0, "no queue groups found");
-}
+void get_queues_properties(ze_device_handle_t device, ze_queue_properties_t* props) {
+    uint32_t queue_group_count = 0;
+    ZE_CALL(zeDeviceGetCommandQueueGroupProperties, (device, &queue_group_count, nullptr));
 
-void get_queues_properties(ze_device_handle_t device,
-                           uint32_t num_queue_groups,
-                           ze_queue_properties_t* props) {
-    props->resize(num_queue_groups);
-    ZE_CALL(zeDeviceGetCommandQueueGroupProperties, (device, &num_queue_groups, props->data()));
+    CCL_THROW_IF_NOT(queue_group_count != 0, "no queue groups found");
+
+    props->resize(queue_group_count);
+    ZE_CALL(zeDeviceGetCommandQueueGroupProperties, (device, &queue_group_count, props->data()));
 }
 
 void get_comp_queue_ordinal(ze_device_handle_t device,
@@ -216,11 +232,76 @@ void get_queue_index(const ze_queue_properties_t& props,
                      int idx,
                      uint32_t* index) {
     CCL_ASSERT(props.size() > ordinal, "props.size() <= ordinal");
+
+    idx += ccl::global_data::env().ze_queue_index;
+
     *index = idx % props[ordinal].numQueues;
     LOG_DEBUG("set queue index: ", *index);
 }
 
-std::string to_string(const ze_result_t result) {
+// Returns the device family matching the device id masked with 0xfff0, or unknown if none does.
+device_family get_device_family(ze_device_handle_t device) {
+    ze_device_properties_t device_prop = {};
+    device_prop.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+    ZE_CALL(zeDeviceGetProperties, (device, &device_prop));
+    using enum_t = typename std::underlying_type<device_family>::type;
+    switch (device_prop.deviceId & 0xfff0) {
+        case static_cast<enum_t>(device_id::id1): return device_family::family1;
+        case static_cast<enum_t>(device_id::id2): return device_family::family2;
+        default: return device_family::unknown;
+    }
+}
+
+// Adjust the timestamp to a common format: keep only the bits that are valid for every
+// timestamp source and multiply the result by props.timerResolution.
+static uint64_t adjust_device_timestamp(uint64_t timestamp, const ze_device_properties_t& props) {
+    // kernelTimestampValidBits describes event timestamps, timestampValidBits - global ones; to
+    // compare values from both sources they are truncated to the lowest number of valid bits
+    const uint64_t valid_bits = std::min(props.kernelTimestampValidBits, props.timestampValidBits);
+    // a shift by 64 or more bits is undefined, so the full-width case gets an all-ones mask
+    const uint64_t mask = (valid_bits >= 64) ? ~0ull : ((1ull << valid_bits) - 1ull);
+    return (timestamp & mask) * props.timerResolution;
+}
+
+// Returns start and end values for the provided event(measured in ns)
+std::pair<uint64_t, uint64_t> calculate_event_time(ze_event_handle_t event,
+                                                   ze_device_handle_t device) {
+    ze_kernel_timestamp_result_t timestamp = {};
+    ZE_CALL(zeEventQueryKernelTimestamp, (event, &timestamp));
+
+    ze_device_properties_t device_props = {};
+    device_props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+    ZE_CALL(zeDeviceGetProperties, (device, &device_props));
+
+    // use global counter as we calculate value across different contexts
+    uint64_t start = timestamp.global.kernelStart;
+    uint64_t end = timestamp.global.kernelEnd;
+
+    // gpu counters might be limited to 32-bit, so we need to handle a potential wrap-around
+    if (end <= start) {
+        // a shift by 64 or more bits is undefined, so a full-width counter gets an all-ones max
+        const uint32_t bits = device_props.kernelTimestampValidBits;
+        const uint64_t timestamp_max_value = (bits >= 64) ? ~0ull : ((1ull << bits) - 1ull);
+        end += timestamp_max_value - start; // NOTE(review): confirm the correction formula
+    }
+    start = adjust_device_timestamp(start, device_props);
+    end = adjust_device_timestamp(end, device_props);
+    return { start, end };
+}
+
+// Returns the device global timestamp passed through adjust_device_timestamp().
+uint64_t calculate_global_time(ze_device_handle_t device) {
+    uint64_t host_timestamp = 0; // filled by the call below, not used afterwards
+    uint64_t device_timestamp = 0;
+    ZE_CALL(zeDeviceGetGlobalTimestamps, (device, &host_timestamp, &device_timestamp));
+    ze_device_properties_t device_props = {};
+    device_props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+    ZE_CALL(zeDeviceGetProperties, (device, &device_props));
+
+    return adjust_device_timestamp(device_timestamp, device_props);
+}
+
+std::string to_string(ze_result_t result) {
     switch (result) {
         case ZE_RESULT_SUCCESS: return "ZE_RESULT_SUCCESS";
         case ZE_RESULT_NOT_READY: return "ZE_RESULT_NOT_READY";
@@ -297,7 +378,7 @@ std::string to_string(const ze_kernel_args_t& kernel_args) {
     for (const auto& arg : kernel_args) {
         // TODO: can we distinguish argument types in order to properly print them instead of printing
         // as a void* ptr?
-        ss << "  idx: " << idx << ", { " << arg.first << ", " << *(void**)arg.second << " }\n";
+        ss << "  idx: " << idx << ", { " << arg.size << ", " << *(void**)arg.ptr << " }\n";
         ++idx;
     }
     ss << "}";
@@ -322,6 +403,20 @@ std::string to_string(const ze_command_queue_group_property_flag_t& flag) {
     }
 }
 
+// Converts every bit set in flags through to_string(T) and joins the results with " | ".
+template <typename T>
+std::string flags_to_string(uint32_t flags) {
+    constexpr size_t bits_per_byte = 8;
+    std::vector<std::string> names;
+    for (size_t bit = 0; bit < sizeof(flags) * bits_per_byte; ++bit) {
+        const auto flag = flags & (1UL << bit);
+        if (flag != 0) {
+            names.emplace_back(to_string(static_cast<T>(flag)));
+        }
+    }
+    return join_strings(names, " | ");
+}
+
 std::string to_string(const ze_command_queue_group_properties_t& queue_property) {
     std::stringstream ss;
     ss << "stype: " << queue_property.stype << ", pNext: " << (void*)queue_property.pNext
@@ -343,19 +438,5 @@ std::string join_strings(const std::vector<std::string>& tokens, const std::stri
     return ss.str();
 }
 
-template <typename T>
-std::string flags_to_string(uint32_t flags) {
-    const size_t bits = 8;
-    std::vector<std::string> output;
-    for (size_t i = 0; i < sizeof(flags) * bits; ++i) {
-        const size_t mask = 1UL << i;
-        const auto flag = flags & mask;
-        if (flag != 0) {
-            output.emplace_back(to_string(static_cast<T>(flag)));
-        }
-    }
-    return join_strings(output, " | ");
-}
-
 } // namespace ze
 } // namespace ccl
diff --git a/src/sched/entry/gpu/ze_primitives.hpp b/src/sched/entry/ze/ze_primitives.hpp
similarity index 81%
rename from src/sched/entry/gpu/ze_primitives.hpp
rename to src/sched/entry/ze/ze_primitives.hpp
index b87ed9fd6..8656c8bda 100644
--- a/src/sched/entry/gpu/ze_primitives.hpp
+++ b/src/sched/entry/ze/ze_primitives.hpp
@@ -15,7 +15,7 @@
 */
 #pragma once
 
-#include "sched/entry/gpu/ze_call.hpp"
+#include "sched/entry/ze/ze_call.hpp"
 
 #include <initializer_list>
 #include <string>
@@ -33,6 +33,8 @@ enum class init_mode : int {
     copy = 2,
 };
 
+enum class device_id : uint32_t { unknown = 0x0, id1 = 0x200, id2 = 0xbd0 };
+
 constexpr ze_context_desc_t default_context_desc = { .stype = ZE_STRUCTURE_TYPE_CONTEXT_DESC,
                                                      .pNext = nullptr,
                                                      .flags = 0 };
@@ -100,8 +102,7 @@ inline bool operator&(init_mode mode1, init_mode mode2) {
     return static_cast<int>(mode1) & static_cast<int>(mode2);
 }
 
-void load_module(std::string dir,
-                 std::string file_name,
+void load_module(const std::string& file_path,
                  ze_device_handle_t device,
                  ze_context_handle_t context,
                  ze_module_handle_t* module);
@@ -115,21 +116,28 @@ struct ze_group_size_t {
     uint32_t groupSizeZ = 0;
 };
 
-void get_suggested_group_size(ze_kernel_handle_t kernel, size_t count, ze_group_size_t* group_size);
+void get_suggested_group_size(ze_kernel_handle_t kernel,
+                              size_t elem_count,
+                              ze_group_size_t* group_size);
 void get_suggested_group_count(const ze_group_size_t& group_size,
-                               size_t count,
+                               size_t elem_count,
                                ze_group_count_t* group_count);
 
-using ze_kernel_arg_t = std::pair<size_t, const void*>;
-using ze_kernel_args_t = std::initializer_list<ze_kernel_arg_t>;
+struct ze_kernel_arg_t { // one kernel argument: byte size of the value and its address
+    template <class T>
+    constexpr ze_kernel_arg_t(const T* arg) noexcept // implicit, so `{ &a, &b }` forms a list
+            : size{ sizeof(T) },
+              ptr{ static_cast<const void*>(arg) } {}
+    const size_t size; // sizeof the pointed-to type, passed to zeKernelSetArgumentValue
+    const void* ptr; // only the address is stored, the value itself is not copied
+};
+
+using ze_kernel_args_t = typename std::initializer_list<ze_kernel_arg_t>;
 void set_kernel_args(ze_kernel_handle_t kernel, const ze_kernel_args_t& kernel_args);
 
-using ze_queue_properties_t = std::vector<ze_command_queue_group_properties_t>;
+using ze_queue_properties_t = typename std::vector<ze_command_queue_group_properties_t>;
 
-void get_num_queue_groups(ze_device_handle_t device, uint32_t* num);
-void get_queues_properties(ze_device_handle_t device,
-                           uint32_t num_queue_groups,
-                           ze_queue_properties_t* props);
+void get_queues_properties(ze_device_handle_t device, ze_queue_properties_t* props);
 void get_comp_queue_ordinal(ze_device_handle_t device,
                             const ze_queue_properties_t& props,
                             uint32_t* ordinal);
@@ -141,7 +149,12 @@ void get_queue_index(const ze_queue_properties_t& props,
                      int idx,
                      uint32_t* index);
 
-std::string to_string(const ze_result_t result);
+device_family get_device_family(ze_device_handle_t device);
+std::pair<uint64_t, uint64_t> calculate_event_time(ze_event_handle_t event,
+                                                   ze_device_handle_t device);
+uint64_t calculate_global_time(ze_device_handle_t device);
+
+std::string to_string(ze_result_t result);
 std::string to_string(const ze_group_size_t& group_size);
 std::string to_string(const ze_group_count_t& group_count);
 std::string to_string(const ze_kernel_args_t& kernel_args);
@@ -150,8 +163,5 @@ std::string to_string(const ze_command_queue_group_properties_t& queue_property)
 
 std::string join_strings(const std::vector<std::string>& tokens, const std::string& delimeter);
 
-template <typename T>
-std::string flags_to_string(uint32_t flags);
-
 } // namespace ze
 } // namespace ccl
diff --git a/src/sched/entry/ze/ze_reduce_local_entry.cpp b/src/sched/entry/ze/ze_reduce_local_entry.cpp
new file mode 100644
index 000000000..6c6a7e842
--- /dev/null
+++ b/src/sched/entry/ze/ze_reduce_local_entry.cpp
@@ -0,0 +1,76 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "sched/entry/ze/ze_reduce_local_entry.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
+#include "sched/entry/ze/ze_cache.hpp"
+
+#include <string>
+
+using namespace ccl;
+using namespace ccl::ze;
+
+ze_reduce_local_entry::ze_reduce_local_entry(ccl_sched* sched,
+                                             const ccl_buffer in_buf,
+                                             size_t in_cnt,
+                                             ccl_buffer inout_buf,
+                                             size_t* out_cnt, // not used by this constructor
+                                             const ccl_datatype& dtype,
+                                             ccl::reduction op)
+        : ze_base_entry(sched, init_mode::compute),
+          in_buf(in_buf),
+          in_cnt(in_cnt),
+          inout_buf(inout_buf),
+          dtype(dtype),
+          op(op) {} // module, kernel and kernel_name are assigned in init_ze_hook()
+
+// Gets the in-place reduce kernel from ze_cache, sets its group size and arguments and appends
+// its launch to the compute command list.
+void ze_reduce_local_entry::init_ze_hook() {
+    global_data::get().ze_cache->get(context, device, "kernels.spv", &module);
+    kernel_name =
+        "reduce_local_inplace_kernel_" + to_string(dtype.idx()) + "_" + ccl_reduction_to_str(op);
+    global_data::get().ze_cache->get(worker_idx, module, kernel_name, &kernel);
+    LOG_DEBUG("get kernel: name: ", kernel_name);
+
+    ze_group_size_t group_size{};
+    get_suggested_group_size(kernel, in_cnt, &group_size);
+    LOG_DEBUG("suggested group size: ", to_string(group_size));
+    ze_group_count_t group_count{};
+    get_suggested_group_count(group_size, in_cnt, &group_count);
+    LOG_DEBUG("suggested group count: ", to_string(group_count));
+
+    ZE_CALL(zeKernelSetGroupSize,
+            (kernel, group_size.groupSizeX, group_size.groupSizeY, group_size.groupSizeZ));
+
+    size_t bytes = in_cnt * dtype.size();
+    void* in_buf_ptr = in_buf.get_ptr(bytes);
+    void* inout_buf_ptr = inout_buf.get_ptr(bytes);
+    ze_kernel_args_t kernel_args{ &in_cnt, &in_buf_ptr, &inout_buf_ptr };
+    LOG_DEBUG("kernel ", kernel, " args:\n", to_string(kernel_args));
+    set_kernel_args(kernel, kernel_args);
+    // launched without wait events; entry_event is passed as the signal event
+    ZE_CALL(zeCommandListAppendLaunchKernel,
+            (ze_base_entry::get_comp_list(),
+             kernel,
+             &group_count,
+             ze_base_entry::entry_event,
+             0,
+             nullptr));
+}
+
+void ze_reduce_local_entry::finalize_ze_hook() { // pushes the kernel from init_ze_hook() to ze_cache
+    global_data::get().ze_cache->push(worker_idx, module, kernel_name, kernel);
+}
diff --git a/src/sched/entry/ze/ze_reduce_local_entry.hpp b/src/sched/entry/ze/ze_reduce_local_entry.hpp
new file mode 100644
index 000000000..a574dfb12
--- /dev/null
+++ b/src/sched/entry/ze/ze_reduce_local_entry.hpp
@@ -0,0 +1,59 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "sched/entry/entry.hpp"
+#include "sched/entry/ze/ze_base_entry.hpp"
+
+// Schedule entry that launches a "reduce_local_inplace_kernel_*" kernel on in_buf and inout_buf.
+class ze_reduce_local_entry : public ze_base_entry {
+public:
+    static constexpr const char* class_name() noexcept {
+        return "ZE_REDUCE_LOCAL";
+    }
+    const char* name() const override {
+        return class_name();
+    }
+    // name() followed by "in size: " and in_cnt
+    virtual std::string name_ext() const override {
+        std::stringstream out;
+        out << name() << " ";
+        out << "in size: " << in_cnt;
+        return out.str();
+    }
+    // out_cnt is accepted but neither stored nor used by the constructor
+    explicit ze_reduce_local_entry(ccl_sched* sched,
+                                   const ccl_buffer in_buf,
+                                   size_t in_cnt,
+                                   ccl_buffer inout_buf,
+                                   size_t* out_cnt,
+                                   const ccl_datatype& dtype,
+                                   ccl::reduction op);
+
+    void init_ze_hook() override;
+    void finalize_ze_hook() override;
+
+private:
+    const ccl_buffer in_buf;
+    const size_t in_cnt;
+    const ccl_buffer inout_buf;
+    const ccl_datatype dtype;
+    const ccl::reduction op;
+    // assigned in init_ze_hook(); the kernel is pushed to ze_cache in finalize_ze_hook()
+    ze_module_handle_t module{};
+    ze_kernel_handle_t kernel{};
+    std::string kernel_name{};
+};
diff --git a/src/sched/extra_sched.hpp b/src/sched/extra_sched.hpp
index 75371de5d..f420ce388 100644
--- a/src/sched/extra_sched.hpp
+++ b/src/sched/extra_sched.hpp
@@ -17,16 +17,17 @@
 
 #include "sched/sched.hpp"
 
+class ccl_master_sched;
+
 class ccl_extra_sched : public ccl_request, public ccl_sched {
 public:
     static constexpr const char* class_name() {
         return "extra_sched";
     }
 
-    ccl_extra_sched(ccl_coll_param& coll_param, ccl_sched_id_t id)
+    ccl_extra_sched(const ccl_sched_create_param& param, ccl_master_sched* master_sched = nullptr)
             : ccl_request(),
-              ccl_sched(coll_param, this) {
-        sched_id = id;
+              ccl_sched(param, this, master_sched) {
 #ifdef ENABLE_DEBUG
         set_dump_callback([this](std::ostream& out) {
             dump(out);
diff --git a/src/sched/gpu_concurrent_sched.cpp b/src/sched/gpu_concurrent_sched.cpp
deleted file mode 100644
index 478f3c68d..000000000
--- a/src/sched/gpu_concurrent_sched.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include <functional>
-#include "sched/gpu_concurrent_sched.hpp"
-
-ccl_gpu_concurrent_sched::ccl_gpu_concurrent_sched(
-    size_t expected_threads_count,
-    const ccl_coll_param& coll_param /* = ccl_coll_param()*/)
-        : ccl_sched_base(coll_param),
-          ccl_request(),
-          partial_scheds(expected_threads_count) {
-#ifdef ENABLE_DEBUG
-    set_dump_callback([this](std::ostream& out) {
-        out << "exeecuted: " << this << std::endl; //dump(out);
-    });
-#endif
-}
-
-ccl_gpu_concurrent_sched::~ccl_gpu_concurrent_sched() {}
-
-std::shared_ptr<ccl_gpu_sched> ccl_gpu_concurrent_sched::get_gpu_sched(size_t thread_id) {
-    if (thread_id >= partial_scheds.size()) {
-        LOG_ERROR(
-            "Requested thread id: ", thread_id, " is more than expected: ", partial_scheds.size());
-    }
-    return partial_scheds[thread_id];
-}
-
-std::shared_ptr<ccl_gpu_sched> ccl_gpu_concurrent_sched::create_gpu_sched(
-    size_t thread_id,
-    native::specific_indexed_device_storage& thread_devices,
-    size_t expected_group_size,
-    const ccl_coll_param& coll_param /* = ccl_coll_param()*/) {
-    if (thread_id >= partial_scheds.size()) {
-        LOG_ERROR(
-            "Requested thread id: ", thread_id, " is more than expected: ", partial_scheds.size());
-    }
-
-    //it is safe to create thread ccl_gpu_sched in specific slot (determined by thread_id) in preallocaed partial scheds vector
-    auto sched = std::make_shared<ccl_gpu_sched>(thread_devices, expected_group_size, coll_param);
-    partial_scheds[thread_id] = sched;
-    return sched;
-}
-
-ccl_gpu_concurrent_sched::ccl_gpu_concurrent_sched_ptr ccl_gpu_concurrent_sched::create(
-    size_t thread_count,
-    const ccl_coll_param& param /* = ccl_coll_param()*/) {
-    /*TODO use cache*/
-    return ccl_gpu_concurrent_sched_ptr(new ccl_gpu_concurrent_sched(thread_count, param));
-}
diff --git a/src/sched/gpu_concurrent_sched.hpp b/src/sched/gpu_concurrent_sched.hpp
deleted file mode 100644
index 1f717810b..000000000
--- a/src/sched/gpu_concurrent_sched.hpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "sched/queue/queue.hpp"
-#include "sched/gpu_sched.hpp"
-#include "oneapi/ccl/native_device_api/export_api.hpp"
-#include "common/comm/l0/gpu_device_types.hpp"
-
-class alignas(CACHELINE_SIZE) ccl_gpu_concurrent_sched : public ccl_sched_base, public ccl_request {
-public:
-    static constexpr const char* class_name() {
-        return "gpu_concurrent_sched";
-    }
-
-    using ccl_gpu_concurrent_sched_ptr = std::unique_ptr<ccl_gpu_concurrent_sched>;
-    static ccl_gpu_concurrent_sched_ptr create(size_t thread_count,
-                                               const ccl_coll_param& param = ccl_coll_param());
-
-    ccl_gpu_concurrent_sched(size_t expected_threads_count,
-                             const ccl_coll_param& coll_param = ccl_coll_param());
-    ccl_gpu_concurrent_sched(const ccl_gpu_concurrent_sched& src) = delete;
-    ~ccl_gpu_concurrent_sched();
-
-    std::shared_ptr<ccl_gpu_sched> create_gpu_sched(
-        size_t thread_id,
-        native::specific_indexed_device_storage& thread_devices,
-        size_t expected_group_size,
-        const ccl_coll_param& coll_param = ccl_coll_param());
-
-    std::shared_ptr<ccl_gpu_sched> get_gpu_sched(size_t thread_id);
-
-private:
-    std::vector<std::shared_ptr<ccl_gpu_sched>> partial_scheds;
-};
diff --git a/src/sched/gpu_sched.cpp b/src/sched/gpu_sched.cpp
deleted file mode 100644
index 2080a2f2f..000000000
--- a/src/sched/gpu_sched.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include <unistd.h>
-
-#include "sched/gpu_sched.hpp"
-
-ccl_gpu_sched::ccl_gpu_sched(native::specific_indexed_device_storage& devices,
-                             size_t group_size,
-                             const ccl_coll_param& coll_param /* = ccl_coll_param()*/)
-        : ccl_sched(coll_param, nullptr),
-          ccl_request(),
-          expected_group_size(group_size),
-          group_comm_devices(devices) {
-    //preallocation
-    entry_fences.reserve(expected_group_size);
-    //WHY deque??? entries.reserve(expected_group_size);
-}
-
-void ccl_gpu_sched::complete() {}
-
-ze_fence_handle_t* entry_request = nullptr;
-void ccl_gpu_sched::set_fence(ze_fence_handle_t entry_fence) //TODO temporary
-{
-    assert(entry_fence);
-    entry_fences.push_back(entry_fence);
-}
-
-bool ccl_gpu_sched::wait(size_t nanosec) {
-    if (nanosec > 0) {
-        throw std::runtime_error("nanosec != 0, not yet supported");
-    }
-
-    // TODO: in case we really need to support != 0 case it's probably better to
-    // rework entry->do_progress to work with timeout value
-    for (auto& e : entries) {
-        if (e->get_status() != ccl_sched_entry_status_complete) {
-            return false;
-        }
-    }
-
-    return true;
-}
diff --git a/src/sched/gpu_sched.hpp b/src/sched/gpu_sched.hpp
deleted file mode 100644
index 618510e39..000000000
--- a/src/sched/gpu_sched.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "sched/queue/queue.hpp"
-#include "sched/sched.hpp"
-#include "oneapi/ccl/native_device_api/export_api.hpp"
-#include "common/comm/l0/gpu_device_types.hpp"
-
-class ccl_gpu_sched;
-
-class alignas(CACHELINE_SIZE) ccl_gpu_sched : public ccl_sched, public ccl_request {
-public:
-    static constexpr const char* class_name() {
-        return "gpu_worker_sched";
-    }
-
-    ccl_gpu_sched(native::specific_indexed_device_storage& devices,
-                  size_t expected_group_size,
-                  const ccl_coll_param& coll_param = ccl_coll_param());
-
-    ccl_gpu_sched(const ccl_sched& other) = delete;
-    ccl_gpu_sched& operator=(const ccl_gpu_sched& other) = delete;
-
-    ~ccl_gpu_sched() = default;
-
-    void complete() override;
-    bool wait(size_t nanosec);
-    //TODO temporary
-    void set_fence(ze_fence_handle_t entry_fence);
-
-    template <class device_t>
-    native::indexed_device_container<device_t>& get_devices() {
-        return std::get<device_t::type_idx()>(group_comm_devices);
-    }
-
-    size_t get_group_size() const {
-        return expected_group_size;
-    }
-
-private:
-    size_t expected_group_size;
-    std::vector<ze_fence_handle_t> entry_fences;
-    native::specific_indexed_device_storage& group_comm_devices;
-};
diff --git a/src/sched/master_sched.cpp b/src/sched/master_sched.cpp
index 19d9fac93..83a7f6d2d 100644
--- a/src/sched/master_sched.cpp
+++ b/src/sched/master_sched.cpp
@@ -29,10 +29,10 @@
 #include <CL/sycl.hpp>
 #include <CL/sycl/backend/level_zero.hpp>
 
-#ifdef MULTI_GPU_SUPPORT
-#include "sched/entry/gpu/ze_cache.hpp"
-#include "sched/entry/gpu/ze_primitives.hpp"
-#endif // MULTI_GPU_SUPPORT
+#ifdef CCL_ENABLE_ZE
+#include "sched/entry/ze/ze_cache.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
+#endif // CCL_ENABLE_ZE
 #endif // CCL_ENABLE_SYCL
 
 #ifdef CCL_ENABLE_SYCL
@@ -46,8 +46,8 @@ constexpr ze_event_pool_desc_t get_event_pool_desc() {
 }
 #endif
 
-ccl_master_sched::ccl_master_sched(const ccl_coll_param& coll_param)
-        : ccl_sched_base(coll_param),
+ccl_master_sched::ccl_master_sched(const ccl_sched_create_param& param)
+        : ccl_sched_base(param),
           ccl_request(),
           partial_scheds() {
 #ifdef ENABLE_DEBUG
@@ -56,15 +56,11 @@ ccl_master_sched::ccl_master_sched(const ccl_coll_param& coll_param)
     });
 #endif
 
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
     if (ccl::utils::should_use_sycl_output_event(coll_param.stream)) {
-        auto l0_context = coll_param.stream->get_native_stream()
-                              .get_context()
-                              .template get_native<cl::sycl::backend::level_zero>();
-
+        auto ze_context = coll_param.stream->get_ze_context();
         auto pool_desc = get_event_pool_desc();
-
-        ccl::global_data::get().ze_cache->get(0, l0_context, pool_desc, &get_memory().sync_pool);
+        ccl::global_data::get().ze_cache->get(0, ze_context, pool_desc, &get_memory().sync_pool);
 
         ze_event_desc_t event_desc = ccl::ze::default_event_desc;
         event_desc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
@@ -87,12 +83,8 @@ ccl_master_sched::~ccl_master_sched() {
     if (!memory.mr_list.empty())
         LOG_WARN("memory region list should be empty for master sched");
 
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
     if (ccl::utils::should_use_sycl_output_event(coll_param.stream)) {
-        auto l0_context = coll_param.stream->get_native_stream()
-                              .get_context()
-                              .template get_native<cl::sycl::backend::level_zero>();
-
         // Sycl event might call wait on destruction meaning that it should be valid at that time
         // The problem is that the sync event is stored in request, which descrutor is called
         // after ccl_master_sched, which means its underlying l0 event will be already destroyed
@@ -103,9 +95,9 @@ ccl_master_sched::~ccl_master_sched() {
         LOG_DEBUG("destroying sync event: ", get_memory().sync_event);
         ZE_CALL(zeEventDestroy, (get_memory().sync_event));
 
+        auto ze_context = coll_param.stream->get_ze_context();
         auto pool_desc = get_event_pool_desc();
-
-        ccl::global_data::get().ze_cache->push(0, l0_context, pool_desc, get_memory().sync_pool);
+        ccl::global_data::get().ze_cache->push(0, ze_context, pool_desc, get_memory().sync_pool);
     }
     else {
         LOG_DEBUG("skip sync event destruction");
@@ -123,8 +115,9 @@ void ccl_master_sched::commit(ccl_parallelizer* parallelizer) {
         update_id();
         if (parallelizer) {
             parallelizer->process(this);
-            CCL_ASSERT(!partial_scheds.empty(),
-                       "ccl_master_sched must have at least 1 partial sched after parallelized");
+            CCL_THROW_IF_NOT(
+                !partial_scheds.empty(),
+                "ccl_master_sched must have at least 1 partial sched after parallelized");
         }
     }
     else {
@@ -147,7 +140,7 @@ void ccl_master_sched::commit(ccl_parallelizer* parallelizer) {
 void ccl_master_sched::reset_state() {
     reset_request();
 
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
     if (ccl::utils::should_use_sycl_output_event(coll_param.stream)) {
         // Reset sycl event while it's in complete state, similar case to destruction in ~ccl_master_sched
         set_sync_event(sycl::event());
@@ -159,7 +152,7 @@ void ccl_master_sched::reset_state() {
 
 ccl_request* ccl_master_sched::start(ccl_executor* exec, bool reset_sched) {
     /* sanity check the schedule */
-    CCL_ASSERT(coll_param.comm);
+    CCL_THROW_IF_NOT(coll_param.comm);
 
     LOG_DEBUG("starting schedule ", this, ", type ", ccl_coll_type_to_str(coll_param.ctype));
 
@@ -175,7 +168,7 @@ ccl_request* ccl_master_sched::start(ccl_executor* exec, bool reset_sched) {
         logger.info(ostream.str());
     }
 
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
     if (ccl::utils::should_use_sycl_output_event(coll_param.stream)) {
         LOG_DEBUG("convert L0 event: ",
                   get_memory().sync_event,
@@ -183,20 +176,19 @@ ccl_request* ccl_master_sched::start(ccl_executor* exec, bool reset_sched) {
         auto q = coll_param.stream->get_native_stream();
         auto context = q.get_context();
 #ifdef CCL_ENABLE_SYCL_INTEROP_EVENT
-        auto e = sycl::level_zero::make<sycl::event>(
-            context, get_memory().sync_event, sycl::level_zero::ownership::keep);
+        auto e = ccl::utils::make_event(context, get_memory().sync_event);
         set_sync_event(e);
-
-        set_native_event(q.submit_barrier({ e }));
-#else
-        CCL_THROW(
-            "interop event functionality is not available with current configuration, please rebuild oneCCL using ENABLE_SYCL_INTEROP_EVENT option"
-            "and a DPCPP compiler that supports that feature");
-#endif
+        set_native_event(ccl::utils::submit_barrier(q, e));
+#else // CCL_ENABLE_SYCL_INTEROP_EVENT
+        CCL_THROW("interop event functionality is not available with current configuration, "
+                  "please rebuild oneCCL using ENABLE_SYCL_INTEROP_EVENT option "
+                  "and a DPCPP compiler that supports that feature");
+#endif // CCL_ENABLE_SYCL_INTEROP_EVENT
     }
-#endif
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 
     exec->start(this);
+
     return this;
 }
 
@@ -206,8 +198,11 @@ ccl_request* ccl_master_sched::reset_request() {
 }
 
 void ccl_master_sched::add_partial_sched(const ccl_coll_param& coll_param) {
-    partial_scheds.emplace_back(std::make_shared<ccl_sched>(coll_param, this));
-    partial_scheds.back()->internal_type = internal_type;
+    partial_scheds.emplace_back(std::make_shared<ccl_sched>(
+        ccl_sched_create_param(
+            sched_type, coll_param.comm->get_sched_id(sched_type != ccl_sched_regular), coll_param),
+        this,
+        this));
 }
 
 void ccl_master_sched::prepare_partial_scheds() {
@@ -254,7 +249,7 @@ void ccl_master_sched::sync_partial_scheds() {
     if (add_sync_entry) {
         auto sync_obj = std::make_shared<sync_object>(partial_scheds.size());
         for (auto& sched : partial_scheds) {
-            entry_factory::make_entry<sync_entry>(sched.get(), sync_obj);
+            entry_factory::create<sync_entry>(sched.get(), sync_obj);
         }
     }
 }
@@ -284,7 +279,7 @@ ccl_master_sched::ccl_master_sched_ptr ccl_master_sched::create(const ccl_coll_p
     ccl_master_sched_ptr sched;
     bool is_created = false;
     auto create_fn = [param]() -> ccl_master_sched_ptr {
-        return new ccl_master_sched(param);
+        return new ccl_master_sched({ ccl_sched_regular, param.comm->get_sched_id(false), param });
     };
 
     if (attr.to_cache) {
@@ -315,3 +310,19 @@ ccl_master_sched::ccl_master_sched_ptr ccl_master_sched::create(const ccl_coll_p
 
     return sched;
 }
+
+#ifdef CCL_ENABLE_SYCL
+bool ccl_master_sched::print_kernel_timer() const {
+    // printing is gated by the enable_kernel_profile env setting; when it is off report
+    // false, i.e. nothing was printed, so the caller has no follow-up work to do
+    if (!ccl::global_data::env().enable_kernel_profile) {
+        return false;
+    }
+
+    return kernel_timer.print();
+}
+
+void ccl_master_sched::reset_kernel_timer() {
+    kernel_timer.reset();
+}
+#endif // CCL_ENABLE_SYCL
diff --git a/src/sched/master_sched.hpp b/src/sched/master_sched.hpp
index dbcbf6c44..715792230 100644
--- a/src/sched/master_sched.hpp
+++ b/src/sched/master_sched.hpp
@@ -26,7 +26,7 @@ class ccl_master_sched : public ccl_sched_base, public ccl_request {
         return "master_sched";
     }
 
-    ccl_master_sched(const ccl_coll_param& coll_param);
+    ccl_master_sched(const ccl_sched_create_param& param);
 
     ccl_master_sched(const ccl_master_sched& src) = delete;
 
@@ -54,7 +54,20 @@ class ccl_master_sched : public ccl_sched_base, public ccl_request {
     using ccl_master_sched_ptr = ccl_master_sched*;
     static ccl_master_sched_ptr create(const ccl_coll_param& param, const ccl_coll_attr& attr);
 
+#ifdef CCL_ENABLE_SYCL
+    bool print_kernel_timer() const;
+    void reset_kernel_timer();
+
+    ccl::kernel_timer& get_kernel_timer() {
+        return kernel_timer;
+    }
+#endif // CCL_ENABLE_SYCL
+
 private:
     void reset_state();
     void prepare_partial_scheds();
+
+#ifdef CCL_ENABLE_SYCL
+    ccl::kernel_timer kernel_timer;
+#endif // CCL_ENABLE_SYCL
 };
diff --git a/src/sched/sched.cpp b/src/sched/sched.cpp
index 834c438d3..10955d712 100644
--- a/src/sched/sched.cpp
+++ b/src/sched/sched.cpp
@@ -20,9 +20,12 @@
 #include "sched/queue/queue.hpp"
 #include "sched/sched.hpp"
 
-ccl_sched::ccl_sched(const ccl_coll_param& coll_param, ccl_request* master_request)
-        : ccl_sched_base(coll_param) {
-    req = master_request;
+ccl_sched::ccl_sched(const ccl_sched_create_param& param,
+                     ccl_request* master_request,
+                     ccl_master_sched* master_sched)
+        : ccl_sched_base(param),
+          req(master_request),
+          master_sched(master_sched) {
     strict_order = ccl::global_data::env().enable_strict_order;
 }
 
@@ -111,7 +114,7 @@ void ccl_sched::complete() {
                 ss << " count:" << profile_param->get_send_count();
             }
 
-            ss << " time(uses):\ntotal: " << timer.str() << "\n";
+            ss << " time(usec):\ntotal: " << timer.str() << "\n";
             for (size_t idx = 0; idx < entries.size(); ++idx) {
                 ss << "[" << idx << "] " << entries[idx]->name() << ": "
                    << entries[idx]->timer.str() << "\n";
@@ -122,21 +125,40 @@ void ccl_sched::complete() {
     }
 
     if (!coll_attr.to_cache) {
-        /* don't wait sched dtor to free memory */
-        free_memory();
+        /* don't wait sched dtor to clear memory */
+        clear_memory();
     }
 
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+    // we keep time measurements in our master sched, if this sched belongs to it
+    // and all the timestamps are ready, print the data.
+    // also check for stream parameter in order to skip non-kernel runs
+    if (ccl::global_data::env().enable_kernel_profile && master_sched && coll_param.stream) {
+        // here we reset the timer every time a corresponding sched is completed
+        // the last one will indicate the actual timestamp(previous ones won't be
+        // printed as not all the measurements are set by that time)
+        master_sched->get_kernel_timer().set_operation_end_time(
+            ccl::ze::calculate_global_time(coll_param.stream->get_ze_device()));
+
+        if (master_sched->print_kernel_timer())
+            master_sched->reset_kernel_timer();
+    }
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+
     req->complete();
 }
 
-void ccl_sched::renew(bool need_update_id /* = false*/) {
+void ccl_sched::renew(bool need_update_id) {
     if (need_update_id) {
         update_id();
     }
+
+    start_idx = 0;
+
     if (ccl::global_data::env().sched_profile) {
         timer.start();
     }
-    start_idx = 0;
+
     for (size_t idx = 0; idx < entries.size(); idx++) {
         entries[idx].get()->reset(idx);
     }
diff --git a/src/sched/sched.hpp b/src/sched/sched.hpp
index f190cfcaf..936fcc718 100644
--- a/src/sched/sched.hpp
+++ b/src/sched/sched.hpp
@@ -40,6 +40,7 @@ enum ccl_sched_in_bin_status {
 typedef ccl::status (*ccl_sched_finalize_fn_t)(ccl_sched*, const void*);
 
 class ccl_extra_sched;
+class ccl_master_sched;
 
 class alignas(CACHELINE_SIZE) ccl_sched : public ccl_sched_base {
 public:
@@ -47,7 +48,9 @@ class alignas(CACHELINE_SIZE) ccl_sched : public ccl_sched_base {
         return "worker_sched";
     }
 
-    ccl_sched(const ccl_coll_param& coll_param, ccl_request* master_request);
+    ccl_sched(const ccl_sched_create_param& param,
+              ccl_request* master_request,
+              ccl_master_sched* master_sched = nullptr);
     ccl_sched() = delete;
     ccl_sched(const ccl_sched& other) = delete;
     ccl_sched& operator=(const ccl_sched& other) = delete;
@@ -171,6 +174,13 @@ class alignas(CACHELINE_SIZE) ccl_sched : public ccl_sched_base {
         finalize_fn_ctx = ctx;
     }
     ccl_request* req = nullptr;
+    // pointer to a master schedule this sched belongs to. If this sched is a partial sched, then
+    // it's the same as the req ptr above. But it's going to be different if the sched is created
+    // as part of extra_sched.
+    // Currently we only set this ptr to non-null when we need it, i.e. these are the cases when we
+    // construct sched and entries to run collective. There are some other cases where we don't need
+    // master_sched, so it's not set there.
+    ccl_master_sched* master_sched = nullptr;
     void dump(std::ostream& out) const;
     size_t entries_count() const;
 
diff --git a/src/sched/sched_base.cpp b/src/sched/sched_base.cpp
index ff13786c8..88fbeb311 100644
--- a/src/sched/sched_base.cpp
+++ b/src/sched/sched_base.cpp
@@ -15,25 +15,32 @@
 */
 #include <numeric>
 
-#include "coll/algorithms/algorithms_enum.hpp"
+#include "coll/algorithms/algorithm_utils.hpp"
 #include "coll/coll_param.hpp"
 #include "coll/selection/selection.hpp"
 #include "common/global/global.hpp"
 #include "common/comm/comm.hpp"
-#include "common/comm/host_communicator/host_communicator.hpp"
-#include "sched/buffer_cache.hpp"
+#include "common/utils/sycl_utils.hpp"
 #include "sched/entry/factory/entry_factory.hpp"
 #include "sched/sched_base.hpp"
 
-ccl_sched_base::ccl_sched_base(const ccl_coll_param& coll_param) : coll_param(coll_param) {
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
-    if (coll_param.stream) {
-        ccl_comm* node_comm =
-            coll_param.comm->get_host_comm()->get_node_comm().get()->get_ccl_comm().get();
+ccl_sched_base::ccl_sched_base(const ccl_sched_create_param& param)
+        : sched_type(param.type),
+          sched_id(param.id),
+          coll_param(param.coll_param) {
+    memory.buffer_manager.init(sched_id);
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+    if (coll_param.stream &&
+        coll_param.stream->get_backend() == ccl::utils::get_level_zero_backend()) {
+        memory.event_manager.reset(new ccl::ze::event_manager(coll_param.stream));
+        auto node_comm = coll_param.comm->get_node_comm().get();
         memory.handle_manager.init(node_comm, coll_param.stream);
+        memory.ipc_event_pool_manager.init(coll_param.stream);
+        memory.list_manager.reset(new ccl::ze::list_manager(coll_param.stream));
     }
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 }
+
 std::string to_string(ccl_sched_add_mode mode) {
     switch (mode) {
         case ccl_sched_add_front: return "FRONT";
@@ -44,7 +51,7 @@ std::string to_string(ccl_sched_add_mode mode) {
 }
 
 ccl_sched_base::~ccl_sched_base() {
-    free_memory();
+    clear_memory();
 }
 
 void ccl_sched_base::set_coll_attr(const ccl_coll_attr& attr) {
@@ -125,197 +132,83 @@ size_t ccl_sched_base::get_priority() const {
     return priority;
 }
 
-void* ccl_sched_base::alloc_buffer_unmanaged(size_t bytes, ccl_sched_buf_type buf_type) {
-    LOG_DEBUG("try to allocate buffer size: ", bytes);
-    CCL_THROW_IF_NOT(bytes > 0, "incorrect buffer size: ", bytes);
+ccl_buffer ccl_sched_base::alloc_buffer(const ccl::alloc_param& user_param) {
+    ccl::alloc_param param = user_param;
 
-    void* ptr = nullptr;
-    if (buf_type == ccl_sched_buf_system) {
-        ccl::global_data::get().buffer_cache->get(sched_id, bytes, &ptr);
-    }
-#ifdef CCL_ENABLE_SYCL
-    else if (buf_type == ccl_sched_buf_runtime) {
-        CCL_THROW_IF_NOT(coll_param.stream, "null stream");
-        sycl::context ctx = coll_param.stream->get_native_stream().get_context();
-        ccl::global_data::get().buffer_cache->get(sched_id, bytes, ctx, &ptr);
-    }
-#endif // CCL_ENABLE_SYCL
-    else {
-        CCL_THROW("unexpected buf_type ", buf_type);
+    if (!param.stream) {
+        param.stream = coll_param.stream;
     }
 
-    LOG_DEBUG("allocated buffer: ", ptr, ", size: ", bytes);
-    return ptr;
-}
-
-void ccl_sched_base::free_buffer_unmanaged(void* ptr, size_t bytes, ccl_sched_buf_type buf_type) {
-    LOG_DEBUG("free buffer: ", ptr, ", buf_type: ", buf_type);
-
-    if (buf_type == ccl_sched_buf_system) {
-        ccl::global_data::get().buffer_cache->push(sched_id, bytes, ptr);
-    }
 #ifdef CCL_ENABLE_SYCL
-    else if (buf_type == ccl_sched_buf_runtime) {
-        CCL_THROW_IF_NOT(coll_param.stream, "null stream");
-        sycl::context ctx = coll_param.stream->get_native_stream().get_context();
-        ccl::global_data::get().buffer_cache->push(sched_id, bytes, ctx, ptr);
+    if ((param.buf_type == ccl::buffer_type::unknown) && param.stream && param.hint_ptr) {
+        auto ptr_type =
+            sycl::get_pointer_type(param.hint_ptr, param.stream->get_native_stream().get_context());
+        if (ptr_type == sycl::usm::alloc::device) {
+            param.buf_type = ccl::buffer_type::ze;
+            param.buf_place = ccl::buffer_place::device;
+        }
     }
 #endif // CCL_ENABLE_SYCL
-    else {
-        CCL_THROW("unexpected buf_type ", buf_type);
+
+    if (param.buf_type == ccl::buffer_type::unknown) {
+        param.buf_type = ccl::buffer_type::regular;
+        param.buf_place = ccl::buffer_place::host;
     }
+
+    return ccl_buffer(memory.buffer_manager.alloc(param), param.bytes);
 }
 
-ccl_buffer ccl_sched_base::alloc_buffer(size_t bytes, ccl_sched_buf_type buf_type) {
-    ccl_buffer buffer =
-        ccl_buffer(alloc_buffer_unmanaged(bytes, buf_type), bytes, 0, ccl_buffer_type::DIRECT);
+void ccl_sched_base::dealloc_buffer(const ccl::dealloc_param& user_param) {
+    ccl::dealloc_param param = user_param;
 
-    if (buf_type == ccl_sched_buf_system) {
-        memory.buf_list.emplace_back(buffer, bytes);
-    }
 #ifdef CCL_ENABLE_SYCL
-    else if (buf_type == ccl_sched_buf_runtime) {
-        CCL_THROW_IF_NOT(coll_param.stream, "null stream");
-        sycl::context ctx = coll_param.stream->get_native_stream().get_context();
-        memory.sycl_buf_list.emplace_back(buffer, bytes, ctx);
-        LOG_DEBUG(
-            "allocated host usm buffer ptr: ", buffer.get_ptr(), ", size: ", buffer.get_size());
+    if (!param.stream) {
+        param.stream = coll_param.stream;
     }
 #endif // CCL_ENABLE_SYCL
 
-    CCL_THROW_IF_NOT(buffer.get_ptr(), "null ptr");
-    return buffer;
+    memory.buffer_manager.dealloc(param);
 }
 
-#ifdef CCL_ENABLE_SYCL
-ccl_buffer ccl_sched_base::alloc_staging_buffer(size_t bytes) {
-    LOG_DEBUG("try to allocate usm host buffer size: ", bytes);
-    CCL_THROW_IF_NOT(bytes > 0, "incorrect buffer size: ", bytes);
-
-    ccl_sched_buf_type buf_type = ccl_sched_buf_system;
-    if (ccl::global_data::env().staging_buffer == ccl_staging_usm) {
-        buf_type = ccl_sched_buf_runtime;
-    }
-    ccl_buffer buffer = alloc_buffer(bytes, buf_type);
-
-    CCL_THROW_IF_NOT(buffer.get_ptr(), "null ptr");
-
-    return buffer;
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+bool ccl_sched_base::enable_ze_single_list() {
+    // single list needs the env knob; kernel debug, blocking serialization and fusion veto it
+    const auto& cfg = ccl::global_data::env();
+    const bool blocking = (cfg.ze_serialize_mode & ze_call::serialize_mode::block) != 0;
+    memory.use_single_list =
+        cfg.enable_ze_single_list && cfg.kernel_debug == 0 && !blocking && !cfg.enable_fusion;
+    return memory.use_single_list;
+}
-#endif // CCL_ENABLE_SYCL
-
-void ccl_sched_base::free_memory() {
-    std::list<ccl_sched_buffer_handler>::iterator it;
-    for (it = memory.buf_list.begin(); it != memory.buf_list.end(); it++) {
-        free_buffer_unmanaged(it->buffer.get_ptr(), it->size, ccl_sched_buf_system);
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+
+void ccl_sched_base::clear_memory() {
+    memory.buffer_manager.clear();
+#ifdef CCL_ENABLE_ZE
+    if (coll_param.stream &&
+        coll_param.stream->get_backend() == ccl::utils::get_level_zero_backend()) {
+        if (memory.event_manager) {
+            memory.event_manager->clear();
+        }
+        memory.handle_manager.clear();
+        memory.ipc_event_pool_manager.clear();
+        if (memory.list_manager) {
+            memory.list_manager->clear();
+        }
+        memory.ze_entries.clear();
     }
-    memory.buf_list.clear();
-
+#endif // CCL_ENABLE_ZE
     free_memory_regions();
-
-#ifdef CCL_ENABLE_SYCL
-    std::list<ccl_sched_sycl_buffer_handler>::iterator sycl_it;
-    for (sycl_it = memory.sycl_buf_list.begin(); sycl_it != memory.sycl_buf_list.end(); sycl_it++) {
-        LOG_DEBUG("free host usm ", sycl_it->buffer.get_ptr());
-        ccl::global_data::get().buffer_cache->push(
-            sched_id, sycl_it->size, sycl_it->ctx, sycl_it->buffer.get_ptr());
-    }
-    memory.sycl_buf_list.clear();
-
-#ifdef MULTI_GPU_SUPPORT
-    memory.handle_manager.clear();
-#endif // MULTI_GPU_SUPPORT
-
-#endif // CCL_ENABLE_SYCL
 }
 
 ccl_buffer ccl_sched_base::update_buffer(ccl_buffer buffer, size_t new_size) {
-    LOG_DEBUG("update pointer data size: ",
-              buffer.get_ptr(),
-              ", from: ",
-              buffer.get_size(),
-              ", to: ",
-              new_size);
-    CCL_THROW_IF_NOT(new_size > 0, "incorrect buffer size: ", new_size);
-
-    /* in case old_ptr will be freed */
-    void* aux_ptr = buffer.get_ptr();
-
-    ccl_buffer new_buf = ccl_buffer(
-        CCL_REALLOC(
-            buffer.get_ptr(), (size_t)buffer.get_size(), new_size, CACHELINE_SIZE, "sched_buffer"),
-        new_size,
-        0,
-        ccl_buffer_type::DIRECT);
-    bool updated = false;
-    for (auto& it : memory.buf_list) {
-        if (it.buffer.get_ptr() == aux_ptr) {
-            /* assign ptr unconditionally, because realloc can return the same pointer */
-            it.buffer = new_buf;
-            it.size = new_size;
-            updated = true;
-            break;
-        }
-    }
-
-    CCL_THROW_IF_NOT(updated, "Cannot update memory in buf_list for addres: ", new_buf.get_ptr());
-    return new_buf;
+    CCL_THROW("unsupported");
+    return ccl_buffer();
 }
 
 ccl_buffer ccl_sched_base::find_and_realloc_buffer(void* in_ptr,
                                                    size_t new_size,
                                                    size_t expected_size) {
-    LOG_DEBUG("sched: ", this, ", contains buffer objects: ", memory.buf_list.size());
-    for (auto& it : memory.buf_list) {
-        if (it.buffer.get_ptr() == in_ptr) {
-#ifdef ENABLE_DEBUG_SPARSE
-            if (expected_size != 0 && (it.buffer.get_size() < expected_size)) {
-                std::stringstream ss;
-                ss << "Unexpected realloc buffer by pointer: " << in_ptr
-                   << ", cur size: " << it.buffer.get_size() << ", to: " << new_size
-                   << ", expected: " << expected_size;
-                ss << "\nbuffers:\n";
-                for (const auto& it : memory.buf_list) {
-                    ss << it.buffer << ", ";
-                }
-                LOG_ERROR(ss.str());
-                CCL_ASSERT(false, ss.str());
-                CCL_THROW_IF_NOT(
-                    false, "Cannot fin buffer by ptr: ", in_ptr, ", available buffers: ", ss.str());
-            }
-#endif //ENABLE_DEBUG_SPARSE
-            if ((it.buffer.get_size() < 0) ||
-                (static_cast<size_t>(it.buffer.get_size()) < new_size)) {
-                LOG_DEBUG("try to realloc buffer by pointer: ",
-                          in_ptr,
-                          ", from: ",
-                          it.buffer.get_size(),
-                          ", to: ",
-                          new_size,
-                          ", expected: ",
-                          expected_size);
-
-                it.buffer = ccl_buffer(CCL_REALLOC(in_ptr,
-                                                   (size_t)it.buffer.get_size(),
-                                                   new_size,
-                                                   CACHELINE_SIZE,
-                                                   "sched_buffer"),
-                                       new_size,
-                                       0,
-                                       ccl_buffer_type::DIRECT);
-                it.size = new_size;
-            }
-            return it.buffer;
-        }
-    }
-
-    /* throw expection */
-    std::stringstream ss;
-    for (const auto& it : memory.buf_list) {
-        ss << it.buffer << ", ";
-    }
-    CCL_THROW_IF_NOT(
-        false, "cannot find buffer by ptr: ", in_ptr, ", available buffers: ", ss.str());
+    CCL_THROW("unsupported");
     return ccl_buffer();
 }
 
@@ -332,10 +225,11 @@ void ccl_sched_base::free_memory_regions() {
     /* perform deregistration in worker thread */
 
     ccl_coll_param param{};
-    param.ctype = ccl_coll_internal;
+    param.ctype = ccl_coll_undefined;
     param.comm = coll_param.comm;
-    std::unique_ptr<ccl_extra_sched> dereg_sched(new ccl_extra_sched(param, sched_id));
-    entry_factory::make_entry<deregister_entry>(dereg_sched.get(), memory.mr_list, param.comm);
+    std::unique_ptr<ccl_extra_sched> dereg_sched(
+        new ccl_extra_sched({ ccl_sched_regular, sched_id, param }));
+    entry_factory::create<deregister_entry>(dereg_sched.get(), memory.mr_list, param.comm);
 
     if (ccl::global_data::get().is_worker_thread || !ccl::global_data::env().worker_offload) {
         dereg_sched->do_progress();
@@ -430,7 +324,6 @@ void ccl_sched_base::alloc_buffers_for_pre_post_copy() {
     param.device_send_bufs.clear();
     param.device_recv_bufs.clear();
 
-    // TODO: WA skip sycl pre_post_copy for allreduce gpu algo
     ccl_selector_param selector_param;
     selector_param.ctype = param.ctype;
     selector_param.count = param.get_send_count();
@@ -438,9 +331,10 @@ void ccl_sched_base::alloc_buffers_for_pre_post_copy() {
     selector_param.comm = param.comm;
     selector_param.stream = param.stream;
     selector_param.is_sycl_buf = coll_attr.is_sycl_buf;
+    selector_param.recv_counts = param.recv_counts.data();
 
     if (!param.stream || !param.stream->is_sycl_device_stream() ||
-        ccl_is_topo_ring_algo(selector_param)) {
+        ccl_is_device_side_algo(selector_param)) {
         return;
     }
 
@@ -452,7 +346,7 @@ void ccl_sched_base::alloc_buffers_for_pre_post_copy() {
             auto usm_type =
                 sycl::get_pointer_type(bufs[0], param.stream->get_native_stream().get_context());
             if ((usm_type == sycl::usm::alloc::host) || (usm_type == sycl::usm::alloc::shared) ||
-                ((usm_type == sycl::usm::alloc::device) && atl_wrapper::attr.out.enable_hmem)) {
+                ((usm_type == sycl::usm::alloc::device) && atl_base_comm::attr.out.enable_hmem)) {
                 should_alloc_buffers = false;
             }
         }
@@ -496,18 +390,26 @@ void ccl_sched_base::alloc_buffers_for_pre_post_copy() {
         h2d_counts.clear();
     }
 
+    ccl::buffer_type buf_type = ccl::buffer_type::regular;
+    if (ccl::global_data::env().staging_buffer == ccl_staging_usm) {
+        buf_type = ccl::buffer_type::sycl;
+    }
+    ccl::alloc_param alloc_param(0, buf_type, ccl::buffer_place::host, true, param.stream);
+
     for (size_t idx = 0; idx < d2h_counts.size(); idx++) {
-        if (d2h_counts[idx])
-            param.send_bufs[idx] =
-                alloc_staging_buffer(d2h_counts[idx] * param.dtype.size()).get_ptr();
+        if (d2h_counts[idx]) {
+            alloc_param.bytes = d2h_counts[idx] * param.dtype.size();
+            param.send_bufs[idx] = alloc_buffer(alloc_param).get_ptr();
+        }
         else
             param.send_bufs[idx] = nullptr;
     }
 
     for (size_t idx = 0; idx < h2d_counts.size(); idx++) {
-        if (h2d_counts[idx])
-            param.recv_bufs[idx] =
-                alloc_staging_buffer(h2d_counts[idx] * param.dtype.size()).get_ptr();
+        if (h2d_counts[idx]) {
+            alloc_param.bytes = h2d_counts[idx] * param.dtype.size();
+            param.recv_bufs[idx] = alloc_buffer(alloc_param).get_ptr();
+        }
         else
             param.recv_bufs[idx] = nullptr;
     }
@@ -532,7 +434,7 @@ void ccl_sched_base::alloc_buffers_for_pre_post_copy() {
 }
 
 void ccl_sched_base::update_id() {
-    sched_id = coll_param.comm->get_sched_id(internal_type != ccl_sched_internal_none);
+    sched_id = coll_param.comm->get_sched_id(sched_type != ccl_sched_regular);
 }
 
 void ccl_sched_base::dump(std::ostream& out, const char* name) const {
diff --git a/src/sched/sched_base.hpp b/src/sched/sched_base.hpp
index 8775a9f84..6d6495e7d 100644
--- a/src/sched/sched_base.hpp
+++ b/src/sched/sched_base.hpp
@@ -19,16 +19,20 @@
 #include <list>
 #include <memory>
 
-#include "atl/atl.h"
-#include "common/comm/atl_tag.hpp"
+#include "atl/atl_base_comm.hpp"
 #include "coll/coll_param.hpp"
+#include "common/comm/atl_tag.hpp"
 #include "common/request/request.hpp"
 #include "common/utils/buffer.hpp"
+#include "sched/buffer/buffer_manager.hpp"
 #include "sched/entry/entry.hpp"
 
-#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
-#include "sched/ze_handle_manager.hpp"
-#endif // CCL_ENABLE_SYCL && MULTI_GPU_SUPPORT
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+#include "sched/ze/ze_event_manager.hpp"
+#include "sched/ze/ze_handle_manager.hpp"
+#include "sched/ze/ze_ipc_event_pool_manager.hpp"
+#include "sched/ze/ze_list_manager.hpp"
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 
 class ccl_sched_queue;
 class ccl_sched_bin;
@@ -36,11 +40,7 @@ class ccl_request;
 class ccl_parallelizer;
 class ccl_executor;
 
-enum ccl_sched_internal_type {
-    ccl_sched_internal_none,
-    ccl_sched_internal_fusion,
-    ccl_sched_internal_unordered_coll
-};
+enum ccl_sched_type { ccl_sched_regular, ccl_sched_fusion, ccl_sched_unordered_coll };
 
 enum ccl_sched_add_mode {
     ccl_sched_add_front,
@@ -49,48 +49,44 @@ enum ccl_sched_add_mode {
     ccl_sched_add_mode_last_value
 };
 
-enum ccl_sched_buf_type {
-    ccl_sched_buf_system,
-    ccl_sched_buf_runtime,
-
-    ccl_sched_buf_last_value
-};
-
 std::string to_string(ccl_sched_add_mode mode);
 
-struct ccl_sched_buffer_handler {
-    ccl_buffer buffer;
-    size_t size;
-
-    ccl_sched_buffer_handler(ccl_buffer buffer, size_t size) : buffer(buffer), size(size) {}
-};
-
-#ifdef CCL_ENABLE_SYCL
-struct ccl_sched_sycl_buffer_handler : public ccl_sched_buffer_handler {
-    const sycl::context ctx;
-
-    ccl_sched_sycl_buffer_handler(ccl_buffer buffer, size_t size, const sycl::context& ctx)
-            : ccl_sched_buffer_handler(buffer, size),
-              ctx(ctx) {}
-};
-#endif // CCL_ENABLE_SYCL
-
 struct ccl_sched_memory {
-    std::list<ccl_sched_buffer_handler> buf_list;
-    std::list<atl_mr_t*> mr_list;
-
-#ifdef CCL_ENABLE_SYCL
-    std::list<ccl_sched_sycl_buffer_handler> sycl_buf_list;
-#ifdef MULTI_GPU_SUPPORT
+    ccl::buffer_manager buffer_manager;
+#ifdef CCL_ENABLE_ZE
+    std::unique_ptr<ccl::ze::event_manager> event_manager;
     ccl::ze::ipc_handle_manager handle_manager;
+    ccl::ze::ipc_event_pool_manager ipc_event_pool_manager;
+    std::unique_ptr<ccl::ze::list_manager> list_manager;
     // sync event which we use to signal to the user about collective completion
     // and the pool it's created from(need to keep it to know what to return to the cache)
     // TODO: this is not the best place for these objects, think about moving them
     // to ccl_master_sched where they actually used
     ze_event_handle_t sync_event;
     ze_event_pool_handle_t sync_pool;
-#endif // MULTI_GPU_SUPPORT
-#endif // CCL_ENABLE_SYCL
+
+    std::vector<sched_entry*> ze_entries;
+    bool use_single_list{};
+
+#endif // CCL_ENABLE_ZE
+    std::list<atl_mr_t*> mr_list;
+};
+
+struct ccl_sched_create_param {
+    ccl_sched_type type;
+    ccl_sched_id_t id;
+    ccl_coll_param coll_param;
+
+    ccl_sched_create_param(ccl_sched_type type, ccl_sched_id_t id, ccl_coll_param coll_param)
+            : type(type),
+              id(id),
+              coll_param(coll_param) {}
+
+    ccl_sched_create_param(ccl_sched_type type, ccl_coll_param coll_param)
+            : ccl_sched_create_param(type, 0, coll_param) {}
+
+    ccl_sched_create_param(ccl_sched_id_t id, ccl_coll_param coll_param)
+            : ccl_sched_create_param(ccl_sched_regular, id, coll_param) {}
 };
 
 static size_t lifo_priority = 0;
@@ -109,22 +105,15 @@ struct ccl_sched_base {
 
     size_t get_priority() const;
 
-    void* alloc_buffer_unmanaged(size_t bytes, ccl_sched_buf_type buf_type = ccl_sched_buf_system);
-    void free_buffer_unmanaged(void* ptr,
-                               size_t bytes,
-                               ccl_sched_buf_type buf_type = ccl_sched_buf_system);
-
-    ccl_buffer alloc_buffer(size_t bytes, ccl_sched_buf_type buf_type = ccl_sched_buf_system);
-
-#ifdef CCL_ENABLE_SYCL
-    ccl_buffer alloc_staging_buffer(size_t bytes);
-#endif // CCL_ENABLE_SYCL
+    ccl_buffer alloc_buffer(const ccl::alloc_param& param);
+    void dealloc_buffer(const ccl::dealloc_param& param);
 
     void add_memory_region(atl_mr_t* mr);
     void free_memory_regions();
 
-    void free_memory();
+    void clear_memory();
 
+    /* unsupported */
     ccl_buffer update_buffer(ccl_buffer buffer, size_t new_size);
     ccl_buffer find_and_realloc_buffer(void* buffer, size_t new_size, size_t expected_size = 0);
 
@@ -150,18 +139,21 @@ struct ccl_sched_base {
         return memory;
     }
 
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+    bool enable_ze_single_list();
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+
+    ccl_sched_type sched_type = ccl_sched_regular;
+
+    /* sequence number of the schedule in the communicator */
+    ccl_sched_id_t sched_id{};
+
     ccl_coll_param coll_param{};
     ccl_coll_attr coll_attr{};
 
     /* TODO: schedule doesn't necessarily map on single algo */
     ccl_coll_algo hint_algo{};
 
-    /* sequence number of the schedule in the communicator */
-    ccl_sched_id_t sched_id = 0;
-
-    /* whether sched was created by internal module (fusion_manager/unordered_coll_manager) */
-    ccl_sched_internal_type internal_type = ccl_sched_internal_none;
-
     static size_t get_lifo_priority() noexcept {
         return lifo_priority++;
     }
@@ -173,7 +165,7 @@ struct ccl_sched_base {
         CCL_THROW("unsupported");
     }
 
-    ccl_sched_base(const ccl_coll_param& coll_param);
+    ccl_sched_base(const ccl_sched_create_param& param);
 
     void update_id();
 
diff --git a/src/sched/sched_timer.cpp b/src/sched/sched_timer.cpp
index 94141ef45..d2d621ff2 100644
--- a/src/sched/sched_timer.cpp
+++ b/src/sched/sched_timer.cpp
@@ -17,6 +17,7 @@
 #include <numeric>
 #include <sstream>
 
+#include "common/global/global.hpp"
 #include "common/log/log.hpp"
 #include "sched_timer.hpp"
 
@@ -51,4 +52,162 @@ long double sched_timer::get_time() const noexcept {
     return time_usec;
 }
 
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+
+kernel_timer::kernel_timer()
+        : kernel_time{ get_uninit_values() },
+          operation_event_time{ get_uninit_values() },
+          operation_start_time{ std::numeric_limits<uint64_t>::max() },
+          operation_end_time{ std::numeric_limits<uint64_t>::max() },
+          kernel_submit_time{ std::numeric_limits<uint64_t>::max() } {}
+
+// Prints the profile (or queues it when delay is set); returns false while any timestamp is unset
+bool kernel_timer::print(bool delay) const {
+    auto is_value_set = [](std::pair<uint64_t, uint64_t> val) {
+        return val != get_uninit_values();
+    };
+
+    auto is_val_set = [](uint64_t val) {
+        return val != std::numeric_limits<uint64_t>::max();
+    };
+
+    // Convert value from ns to usec and format it.
+    auto convert_output = [](uint64_t val) {
+        std::stringstream ss;
+        ss << (val / 1000) << "." << (val % 1000) / 100;
+
+        return ss.str();
+    };
+
+    // currently there are 3 levels:
+    // 0 - profiling and output is disabled
+    // 1 - print time duration for intervals
+    // 2 - level 1 + raw timestamp that we collected
+    int profile_level = ccl::global_data::env().enable_kernel_profile;
+
+    // Make sure we have all the measurements
+    bool all_measurements_are_ready =
+        is_value_set(kernel_time) && is_val_set(operation_create_time) &&
+        is_val_set(operation_start_time) && is_val_set(operation_end_time) &&
+        is_val_set(kernel_submit_time) && is_val_set(deps_start_time) && is_val_set(deps_end_time);
+
+    // operation_event_time is only required if we use output event, otherwise just
+    // skip it
+    if (ccl::global_data::env().enable_sycl_output_event) {
+        all_measurements_are_ready =
+            all_measurements_are_ready && is_value_set(operation_event_time);
+    }
+
+    if (!all_measurements_are_ready) {
+        // need more data
+        return false;
+    }
+
+    if (delay) {
+        // the output will be printed later
+        ccl::global_data::get().timer_printer->add_timer(*this);
+        return true;
+    }
+
+    std::stringstream ss;
+    ss << "kernel: " << name << " time(usec)" << std::endl;
+    if (profile_level > 1) {
+        ss << "timestamps: " << std::endl;
+        ss << "  operation create: " << operation_create_time << std::endl;
+        ss << "  operation start: " << operation_start_time << std::endl;
+        ss << "  deps wait start: " << deps_start_time << std::endl;
+        ss << "  deps wait end: " << deps_end_time << std::endl;
+        ss << "  kernel submit: " << kernel_submit_time << std::endl;
+        ss << "  kernel start: " << kernel_time.first << std::endl;
+        ss << "  kernel end: " << kernel_time.second << std::endl;
+        if (ccl::global_data::env().enable_sycl_output_event) {
+            ss << "  operation event start: " << operation_event_time.first << std::endl;
+            ss << "  operation event end: " << operation_event_time.second << std::endl;
+        }
+        ss << "  operation end: " << operation_end_time << std::endl;
+    }
+
+    ss << "operation: " << convert_output(operation_end_time - operation_create_time) << std::endl;
+    ss << "  api call: " << convert_output(operation_start_time - operation_create_time)
+       << std::endl;
+    ss << "  preparation: " << convert_output(kernel_submit_time - operation_start_time)
+       << std::endl;
+    ss << "    deps handling: " << convert_output(deps_end_time - deps_start_time) << std::endl;
+    ss << "  device scheduling: " << convert_output(kernel_time.first - kernel_submit_time)
+       << std::endl;
+    ss << "  device execution: " << convert_output(kernel_time.second - kernel_time.first)
+       << std::endl;
+    if (ccl::global_data::env().enable_sycl_output_event) {
+        ss << "  event completion: "
+           << convert_output(operation_event_time.second - kernel_time.second) << std::endl;
+        ss << "  completion: " << convert_output(operation_end_time - operation_event_time.second)
+           << std::endl;
+    }
+    else {
+        ss << "  completion: " << convert_output(operation_end_time - kernel_time.second)
+           << std::endl;
+    }
+
+    ss << std::endl;
+
+    std::cout << ss.str() << std::endl;
+
+    return true;
+}
+
+void kernel_timer::reset() {
+    kernel_time = { get_uninit_values() };
+    operation_event_time = { get_uninit_values() };
+    operation_start_time = std::numeric_limits<uint64_t>::max();
+    operation_end_time = std::numeric_limits<uint64_t>::max();
+    kernel_submit_time = std::numeric_limits<uint64_t>::max();
+}
+
+uint64_t kernel_timer::get_current_time() {
+    return std::chrono::time_point_cast<std::chrono::nanoseconds>(
+               std::chrono::high_resolution_clock::now())
+        .time_since_epoch()
+        .count();
+}
+
+void kernel_timer::set_name(const std::string new_name) {
+    name = new_name;
+}
+const std::string& kernel_timer::get_name() const {
+    return name;
+}
+
+void kernel_timer::set_kernel_time(std::pair<uint64_t, uint64_t> val) {
+    kernel_time = val;
+}
+
+void kernel_timer::set_operation_event_time(std::pair<uint64_t, uint64_t> val) {
+    operation_event_time = val;
+}
+
+void kernel_timer::set_operation_create_time(uint64_t val) {
+    operation_create_time = val;
+}
+
+void kernel_timer::set_operation_start_time(uint64_t val) {
+    operation_start_time = val;
+}
+
+void kernel_timer::set_operation_end_time(uint64_t val) {
+    operation_end_time = val;
+}
+
+void kernel_timer::set_deps_start_time(uint64_t val) {
+    deps_start_time = val;
+}
+
+void kernel_timer::set_deps_end_time(uint64_t val) {
+    deps_end_time = val;
+}
+
+void kernel_timer::set_kernel_submit_time(uint64_t val) {
+    kernel_submit_time = val;
+}
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+
 } // namespace ccl
diff --git a/src/sched/sched_timer.hpp b/src/sched/sched_timer.hpp
index 532a3240e..ce6116d6d 100644
--- a/src/sched/sched_timer.hpp
+++ b/src/sched/sched_timer.hpp
@@ -18,6 +18,10 @@
 #include <chrono>
 #include <string>
 
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+#include <ze_api.h>
+#endif
+
 namespace ccl {
 
 class sched_timer {
@@ -36,4 +40,65 @@ class sched_timer {
     long double get_time() const noexcept;
 };
 
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+class kernel_timer {
+public:
+    kernel_timer();
+    ~kernel_timer() = default;
+
+    kernel_timer(const kernel_timer&) = default;
+
+    void set_name(const std::string name);
+    const std::string& get_name() const;
+
+    void set_kernel_time(std::pair<uint64_t, uint64_t> val);
+    void set_operation_event_time(std::pair<uint64_t, uint64_t> val);
+    void set_operation_create_time(uint64_t val);
+    void set_operation_start_time(uint64_t val);
+    void set_operation_end_time(uint64_t val);
+    void set_deps_start_time(uint64_t val);
+    void set_deps_end_time(uint64_t val);
+    void set_kernel_submit_time(uint64_t val);
+
+    bool print(bool delay = true) const;
+    void reset();
+
+    static uint64_t get_current_time();
+
+private:
+    // Special pair of values that indicate uninitialized measurements
+    static constexpr std::pair<uint64_t, uint64_t> get_uninit_values() {
+        return { std::numeric_limits<uint64_t>::max(), std::numeric_limits<uint64_t>::max() };
+    }
+
+    std::string name;
+    // Collected timestamps; every scalar defaults to the "unset" sentinel (uint64_t max)
+    std::pair<uint64_t, uint64_t> kernel_time;
+    std::pair<uint64_t, uint64_t> operation_event_time;
+    uint64_t operation_create_time{ std::numeric_limits<uint64_t>::max() };
+    uint64_t operation_start_time{ std::numeric_limits<uint64_t>::max() };
+    uint64_t operation_end_time{ std::numeric_limits<uint64_t>::max() };
+    uint64_t deps_start_time{ std::numeric_limits<uint64_t>::max() };
+    uint64_t deps_end_time{ std::numeric_limits<uint64_t>::max() };
+    uint64_t kernel_submit_time{ std::numeric_limits<uint64_t>::max() };
+};
+
+class kernel_timer_printer {
+public:
+    void add_timer(const kernel_timer& timer) {
+        timers.push_back(timer);
+    }
+
+    ~kernel_timer_printer() {
+        for (auto& t : timers) {
+            t.print(false);
+        }
+    }
+
+private:
+    std::vector<kernel_timer> timers;
+};
+
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+
 } //namespace ccl
diff --git a/src/sched/ze/ze_event_manager.cpp b/src/sched/ze/ze_event_manager.cpp
new file mode 100644
index 000000000..241d69f62
--- /dev/null
+++ b/src/sched/ze/ze_event_manager.cpp
@@ -0,0 +1,164 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "common/global/global.hpp"
+#include "common/utils/sycl_utils.hpp"
+#include "sched/entry/ze/ze_cache.hpp"
+#include "sched/ze/ze_event_manager.hpp"
+
+using namespace ccl;
+using namespace ze;
+
+event_pool::event_pool(ze_context_handle_t context)
+        : context{ context },
+          pool_desc{ event_manager::get_default_event_pool_desc() },
+          event_desc{ event_manager::get_default_event_desc() } {}
+
+event_pool::event_pool(ze_context_handle_t context, const ze_event_pool_desc_t& pool_desc)
+        : context{ context },
+          pool_desc{ pool_desc },
+          event_desc{ event_manager::get_default_event_desc() } {}
+
+event_pool::event_pool(ze_context_handle_t context,
+                       const ze_event_pool_desc_t& pool_desc,
+                       const ze_event_desc_t& event_desc)
+        : context{ context },
+          pool_desc{ pool_desc },
+          event_desc{ event_desc } {}
+
+event_pool::~event_pool() {
+    clear();
+}
+
+event_pool::operator ze_event_pool_handle_t() const {
+    return pool;
+}
+
+ze_event_handle_t event_pool::create_event() {
+    ze_event_desc_t desc = event_desc;
+    return create_event(desc);
+}
+
+ze_event_handle_t event_pool::create_event(ze_event_desc_t& desc) {
+    create_pool();
+    ze_event_handle_t event{};
+    CCL_THROW_IF_NOT(size() < capacity());
+    desc.index = events.size();
+    ZE_CALL(zeEventCreate, (pool, &desc, &event));
+    events.push_back(event);
+
+    return event;
+}
+
+size_t event_pool::size() const {
+    return events.size();
+}
+
+size_t event_pool::capacity() const {
+    return pool_desc.count;
+}
+
+void event_pool::reset() {
+    for (auto& event : events) {
+        ZE_CALL(zeEventHostReset, (event));
+    }
+}
+
+void event_pool::clear() {
+    if (pool) {
+        for (auto& event : events) {
+            ZE_CALL(zeEventDestroy, (event));
+        }
+        events.clear();
+        ccl::global_data::get().ze_cache->push(worker_idx, context, pool_desc, pool);
+        pool = {};
+    }
+}
+
+void event_pool::create_pool() {
+    if (!pool) {
+        ccl::global_data::get().ze_cache->get(worker_idx, context, pool_desc, &pool);
+    }
+}
+
+event_manager::event_manager(ze_context_handle_t context) : context{ context } {
+    CCL_THROW_IF_NOT(context, "no context");
+}
+
+event_manager::event_manager(const ccl_stream* stream) {
+    CCL_THROW_IF_NOT(stream, "no stream");
+    CCL_THROW_IF_NOT(stream->get_backend() == utils::get_level_zero_backend(), "no ze backend");
+    context = stream->get_ze_context();
+}
+
+event_manager::~event_manager() {
+    clear();
+}
+
+ze_event_handle_t event_manager::create(ze_event_desc_t desc) {
+    return create(1, desc).front();
+}
+
+std::vector<ze_event_handle_t> event_manager::create(size_t count, ze_event_desc_t desc) {
+    std::vector<ze_event_handle_t> events(count);
+    // Keep a batch inside one pool when it fits: start a fresh pool if the current one
+    // cannot hold all requested events (a zero count never triggers this).
+    if (!pools.empty() && (pools.back().size() + count) > pools.back().capacity()) {
+        add_pool();
+    }
+
+    for (auto& event : events) {
+        // batches larger than a single pool spill over into additional pools
+        if (pools.empty() || pools.back().size() >= pools.back().capacity()) {
+            add_pool();
+        }
+        event = pools.back().create_event(desc);
+    }
+    return events;
+}
+
+void event_manager::reset() {
+    for (auto& pool : pools) {
+        pool.reset();
+    }
+}
+
+void event_manager::clear() {
+    for (auto& pool : pools) {
+        pool.clear();
+    }
+    pools.clear();
+}
+
+ze_event_desc_t event_manager::get_default_event_desc() {
+    ze_event_desc_t desc{ default_event_desc };
+    desc.signal = ZE_EVENT_SCOPE_FLAG_DEVICE;
+    desc.wait = ZE_EVENT_SCOPE_FLAG_DEVICE;
+    return desc;
+}
+
+ze_event_pool_desc_t event_manager::get_default_event_pool_desc() {
+    ze_event_pool_desc_t desc{ default_event_pool_desc };
+    desc.count = default_pool_size;
+    if (global_data::env().enable_kernel_profile) {
+        desc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
+    }
+    return desc;
+}
+
+event_pool* event_manager::add_pool(ze_event_pool_desc_t pool_desc, ze_event_desc_t event_desc) {
+    pools.emplace_back(context, pool_desc, event_desc);
+    return &pools.back();
+}
diff --git a/src/sched/ze/ze_event_manager.hpp b/src/sched/ze/ze_event_manager.hpp
new file mode 100644
index 000000000..101b2b6be
--- /dev/null
+++ b/src/sched/ze/ze_event_manager.hpp
@@ -0,0 +1,98 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include <list>
+#include <ze_api.h>
+
+class ccl_stream;
+
+namespace ccl {
+namespace ze {
+
+class event_pool {
+public:
+    event_pool(ze_context_handle_t context);
+    event_pool(ze_context_handle_t context, const ze_event_pool_desc_t& pool_desc);
+    event_pool(ze_context_handle_t context,
+               const ze_event_pool_desc_t& pool_desc,
+               const ze_event_desc_t& event_desc);
+
+    event_pool(const event_pool&) = delete;
+    event_pool(event_pool&&) = default; // NOTE(review): moved-from object still holds `pool`
+    event_pool& operator=(const event_pool&) = delete;
+    event_pool& operator=(event_pool&&) = delete;
+
+    virtual ~event_pool(); // calls clear()
+
+    operator ze_event_pool_handle_t() const; // returns `pool` (null until create_pool() ran)
+
+    ze_event_handle_t create_event(); // uses a copy of the stored event_desc
+    ze_event_handle_t create_event(ze_event_desc_t& desc); // overwrites desc.index
+
+    size_t size() const; // number of events created so far
+    size_t capacity() const; // pool_desc.count
+
+    void reset(); // host-resets every created event
+    void clear(); // destroys the events and pushes the pool back to the ze cache
+
+private:
+    static constexpr size_t worker_idx{}; // always 0; passed to ze_cache get/push
+
+    const ze_context_handle_t context;
+    const ze_event_pool_desc_t pool_desc;
+    const ze_event_desc_t event_desc;
+
+    ze_event_pool_handle_t pool{};
+    std::list<ze_event_handle_t> events;
+
+    void create_pool(); // fetches `pool` from the ze cache on first use
+};
+
+class event_manager {
+public:
+    event_manager(ze_context_handle_t context);
+    event_manager(const ccl_stream* stream);
+
+    event_manager(const event_manager&) = delete;
+    event_manager(event_manager&&) = default;
+    event_manager& operator=(const event_manager&) = delete;
+    event_manager& operator=(event_manager&&) = default;
+
+    virtual ~event_manager();
+
+    ze_event_handle_t create(ze_event_desc_t desc = get_default_event_desc());
+    std::vector<ze_event_handle_t> create(size_t count,
+                                          ze_event_desc_t desc = get_default_event_desc());
+
+    void reset();
+    void clear();
+
+    static ze_event_desc_t get_default_event_desc();
+    static ze_event_pool_desc_t get_default_event_pool_desc();
+
+protected:
+    static constexpr size_t default_pool_size{ 50 };
+
+    ze_context_handle_t context{};
+    std::list<event_pool> pools;
+
+    event_pool* add_pool(ze_event_pool_desc_t desc = get_default_event_pool_desc(),
+                         ze_event_desc_t event_desc = get_default_event_desc());
+};
+
+} // namespace ze
+} // namespace ccl
diff --git a/src/sched/ze_handle_manager.cpp b/src/sched/ze/ze_handle_manager.cpp
similarity index 81%
rename from src/sched/ze_handle_manager.cpp
rename to src/sched/ze/ze_handle_manager.cpp
index 31d958a55..7c36c9f23 100644
--- a/src/sched/ze_handle_manager.cpp
+++ b/src/sched/ze/ze_handle_manager.cpp
@@ -14,10 +14,9 @@
  limitations under the License.
 */
 #include "common/comm/comm.hpp"
-#include "sched/entry/gpu/ze_call.hpp"
-#include "sched/ze_handle_manager.hpp"
-
-#include <CL/sycl/backend/level_zero.hpp>
+#include "common/global/global.hpp"
+#include "sched/entry/ze/ze_call.hpp"
+#include "sched/ze/ze_handle_manager.hpp"
 
 namespace ccl {
 
@@ -53,26 +52,20 @@ ipc_handle_manager::~ipc_handle_manager() {
 }
 
 void ipc_handle_manager::init(const ccl_comm* init_comm, const ccl_stream* init_stream) {
-    LOG_DEBUG("initialization");
+    LOG_DEBUG("init");
     CCL_THROW_IF_NOT(init_comm, "no comm");
     CCL_THROW_IF_NOT(init_stream, "no stream");
 
     comm = const_cast<ccl_comm*>(init_comm);
 
     for (int idx = 0; idx < comm->size(); idx++) {
-        rank_map.insert({ comm->get_global_rank(idx), idx });
+        rank_map.insert({ comm->get_global_rank(idx, true), idx });
     }
 
-    auto sycl_device = init_stream->get_native_stream().get_device();
-    auto sycl_context = init_stream->get_native_stream().get_context();
-
-    device = sycl_device.template get_native<sycl::backend::level_zero>();
-    context = sycl_context.template get_native<sycl::backend::level_zero>();
-
-    CCL_THROW_IF_NOT(device, "device is not valid");
-    CCL_THROW_IF_NOT(context, "context is not valid");
+    device = init_stream->get_ze_device();
+    context = init_stream->get_ze_context();
 
-    LOG_DEBUG("initialization completed");
+    LOG_DEBUG("init completed");
 }
 
 void ipc_handle_manager::clear() {
@@ -106,7 +99,16 @@ void ipc_handle_manager::clear() {
             if (mem_ptr) {
                 ze_result_t res{};
                 if (mem_type == ipc_mem_type::memory) {
-                    res = zeMemCloseIpcHandle(context, mem_ptr);
+                    // There is a bug in L0 that results in hang in this function
+                    // when we use kernel output event, as a workaround skip it
+                    // if the knob is set
+                    if (ccl::global_data::env().ze_close_ipc_wa) {
+                        LOG_DEBUG("skip zeMemCloseIpcHandle");
+                        res = ZE_RESULT_SUCCESS;
+                    }
+                    else {
+                        res = zeMemCloseIpcHandle(context, mem_ptr);
+                    }
                 }
                 else if (mem_type == ipc_mem_type::pool) {
                     res = zeEventPoolCloseIpcHandle((ze_event_pool_handle_t)mem_ptr);
@@ -152,11 +154,11 @@ void ipc_handle_manager::set(const mem_handle_map_t& handles_arg) {
     LOG_DEBUG("handles are set successfully, size of handles: ", handles.size());
 }
 
-void ipc_handle_manager::get(int rank, size_t buf_idx, ccl_buffer& buf, ccl_comm* map_comm) {
+void* ipc_handle_manager::get_ptr(int rank, size_t buf_idx, ccl_comm* map_comm) {
     check_rank(rank, (map_comm) ? map_comm : comm);
     if (map_comm && (map_comm->id() != comm->id())) {
         int old_rank = rank;
-        rank = map_comm->get_global_rank(rank);
+        rank = map_comm->get_global_rank(rank, true);
         auto rank_it = rank_map.find(rank);
         if (rank_it == rank_map.end()) {
             CCL_THROW("handle manager can not handle global rank ", rank);
@@ -178,12 +180,14 @@ void ipc_handle_manager::get(int rank, size_t buf_idx, ccl_buffer& buf, ccl_comm
     }
     CCL_THROW_IF_NOT(buf_idx < handles[rank].size(), "buf_idx is not valid value: ", buf_idx);
 
-    const auto& handle_info = handles[rank][buf_idx];
+    auto& handle_info = handles[rank][buf_idx];
     auto handle = handle_info.handle;
-    auto mem_ptr = handle_info.ptr;
+    // Must be a ref so it can be updated when handle is opened
+    auto& mem_ptr = handle_info.ptr;
     auto mem_type = handle_info.type;
 
     LOG_DEBUG("context: ", context, ", device: ", device, ", rank: ", rank, ", buf_idx: ", buf_idx);
+
     if (mem_ptr == nullptr) {
         if (mem_type == ccl::ze::ipc_mem_type::memory) {
             open_handle(handle, &mem_ptr);
@@ -209,9 +213,20 @@ void ipc_handle_manager::get(int rank, size_t buf_idx, ccl_buffer& buf, ccl_comm
               " }");
 
     // add offset that we received along with the handle
-    size_t mem_offset = handle_info.offset;
-    void* final_ptr = static_cast<void*>(static_cast<char*>(mem_ptr) + mem_offset);
-    buf.set(final_ptr);
+    if (mem_type == ccl::ze::ipc_mem_type::pool)
+        CCL_THROW_IF_NOT(handle_info.offset == 0, "offsets should be 0 for event pool");
+    return static_cast<void*>(static_cast<char*>(mem_ptr) + handle_info.offset);
+}
+
+void ipc_handle_manager::get(int rank, size_t buf_idx, ccl_buffer& buf, ccl_comm* map_comm) {
+    buf.set(get_ptr(rank, buf_idx, map_comm));
+}
+
+void ipc_handle_manager::get(int rank,
+                             size_t buf_idx,
+                             ze_event_pool_handle_t& buf,
+                             ccl_comm* map_comm) {
+    buf = (ze_event_pool_handle_t)get_ptr(rank, buf_idx, map_comm);
 }
 
 void ipc_handle_manager::get_handle(const void* ptr, ze_ipc_mem_handle_t* handle) {
@@ -249,10 +264,14 @@ void ipc_handle_manager::get_address_range(const void* ptr, void** base_ptr, siz
 void ipc_handle_manager::check_rank(int rank, ccl_comm* check_comm) {
     CCL_THROW_IF_NOT(
         (rank >= 0) && (rank < static_cast<int>(handles.size())) && (rank < check_comm->size()),
-        "rank is not valid value: ",
-        rank);
+        "invalid rank: ",
+        rank,
+        ", handles.size: ",
+        handles.size(),
+        ", comm.size: ",
+        check_comm->size());
     CCL_THROW_IF_NOT(
-        rank != check_comm->rank(), "don't expect to open handle for own rank: ", rank);
+        rank != check_comm->rank(), "do not expect to open handle for own rank: ", rank);
 }
 
 } // namespace ze
diff --git a/src/sched/ze_handle_manager.hpp b/src/sched/ze/ze_handle_manager.hpp
similarity index 90%
rename from src/sched/ze_handle_manager.hpp
rename to src/sched/ze/ze_handle_manager.hpp
index 2920e392d..c0f1b28bc 100644
--- a/src/sched/ze_handle_manager.hpp
+++ b/src/sched/ze/ze_handle_manager.hpp
@@ -18,7 +18,7 @@
 #include "common/log/log.hpp"
 #include "common/stream/stream.hpp"
 #include "common/utils/buffer.hpp"
-#include "sched/entry/gpu/ze_primitives.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
 
 #include <unordered_map>
 #include <ze_api.h>
@@ -41,6 +41,7 @@ struct ipc_handle_info {
 
     ipc_handle_info();
     ipc_handle_info(const ze_ipc_mem_handle_t& handle, size_t offset, ipc_mem_type type);
+    ipc_handle_info(const ipc_handle_info&) = default;
     ipc_handle_info& operator=(const ipc_handle_info&) = default;
 };
 
@@ -57,7 +58,10 @@ class ipc_handle_manager {
     void clear();
 
     void set(const mem_handle_map_t& handles_arg);
+
+    void* get_ptr(int rank, size_t buf_idx, ccl_comm* map_comm);
     void get(int rank, size_t buf_idx, ccl_buffer& buf, ccl_comm* map_comm = nullptr);
+    void get(int rank, size_t buf_idx, ze_event_pool_handle_t& buf, ccl_comm* map_comm);
 
     void get_handle(const void* buffer, ze_ipc_mem_handle_t* handle);
     void get_handle(ze_event_pool_handle_t pool, ze_ipc_event_pool_handle_t* handle);
diff --git a/src/sched/ze/ze_ipc_event_pool_manager.cpp b/src/sched/ze/ze_ipc_event_pool_manager.cpp
new file mode 100644
index 000000000..7e5381996
--- /dev/null
+++ b/src/sched/ze/ze_ipc_event_pool_manager.cpp
@@ -0,0 +1,56 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "common/global/global.hpp"
+#include "common/log/log.hpp"
+#include "sched/entry/ze/ze_cache.hpp"
+#include "sched/ze/ze_ipc_event_pool_manager.hpp"
+
+using namespace ccl;
+using namespace ccl::ze;
+
+void ipc_event_pool_manager::init(const ccl_stream* init_stream) { // caches the stream's ze context
+    LOG_DEBUG("init");
+    CCL_THROW_IF_NOT(init_stream, "no stream"); // checked before the dereference below
+
+    context = init_stream->get_ze_context();
+    CCL_THROW_IF_NOT(context, "context is not valid"); // create() and clear() both use this context
+    LOG_DEBUG("init completed");
+}
+
+void ipc_event_pool_manager::clear() { // pushes every recorded pool to ze_cache, then forgets them
+    for (const auto& pool_info : event_pool_info) { // pair: first = pool handle, second = its desc
+        ccl::global_data::get().ze_cache->push(0, context, pool_info.second, pool_info.first);
+    }
+    event_pool_info.clear(); // a second clear() (e.g. from the destructor) then pushes nothing
+    LOG_DEBUG("finalize completed");
+}
+
+ze_event_pool_handle_t ipc_event_pool_manager::create(size_t event_count) { // returns the new pool
+    CCL_THROW_IF_NOT(context, "context is unavailable"); // context is only assigned in init()
+
+    ze_event_pool_desc_t event_pool_desc = default_event_pool_desc;
+    event_pool_desc.count = event_count;
+    event_pool_desc.flags = ZE_EVENT_POOL_FLAG_IPC | ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
+
+    ze_event_pool_handle_t event_pool{}; // value-initialized so the check below can detect failure
+    ccl::global_data::get().ze_cache->get(0, context, event_pool_desc, &event_pool);
+    CCL_THROW_IF_NOT(event_pool, "ipc event pool is unavailable");
+
+    event_pool_info.push_back({ event_pool, event_pool_desc }); // recorded so clear() can push it
+
+    LOG_DEBUG("created manager completed. event_pool_info.size: ", event_pool_info.size());
+    return event_pool;
+}
diff --git a/src/sched/ze/ze_ipc_event_pool_manager.hpp b/src/sched/ze/ze_ipc_event_pool_manager.hpp
new file mode 100644
index 000000000..1d5874312
--- /dev/null
+++ b/src/sched/ze/ze_ipc_event_pool_manager.hpp
@@ -0,0 +1,47 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "common/stream/stream.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
+
+#include <ze_api.h>
+
+namespace ccl {
+
+namespace ze {
+
+class ipc_event_pool_manager { // creates IPC event pools through ze_cache and tracks them
+public:
+    ipc_event_pool_manager() = default;
+    ipc_event_pool_manager(const ipc_event_pool_manager&) = delete; // non-copyable
+    ipc_event_pool_manager& operator=(const ipc_event_pool_manager&) = delete;
+    ~ipc_event_pool_manager() {
+        clear(); // pushes any pools still recorded in event_pool_info
+    }
+
+    void init(const ccl_stream* init_stream); // must run before create(): it sets `context`
+    void clear(); // pushes all recorded pools to ze_cache and empties the record
+
+    ze_event_pool_handle_t create(size_t event_count); // new pool with IPC | HOST_VISIBLE flags
+
+private:
+    ze_context_handle_t context{}; // null until init() succeeds
+    std::vector<std::pair<ze_event_pool_handle_t, ze_event_pool_desc_t>> event_pool_info{};
+};
+
+} // namespace ze
+} // namespace ccl
diff --git a/src/sched/ze/ze_list_manager.cpp b/src/sched/ze/ze_list_manager.cpp
new file mode 100644
index 000000000..49917e192
--- /dev/null
+++ b/src/sched/ze/ze_list_manager.cpp
@@ -0,0 +1,139 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "common/global/global.hpp"
+#include "sched/entry/ze/ze_cache.hpp"
+#include "sched/ze/ze_list_manager.hpp"
+
+using namespace ccl;
+using namespace ccl::ze;
+
+list_manager::list_manager(ze_device_handle_t device, ze_context_handle_t context)
+        : device(device),
+          context(context) {
+    LOG_DEBUG("create list manager");
+    CCL_THROW_IF_NOT(device, "no device");
+    CCL_THROW_IF_NOT(context, "no context");
+    get_queues_properties(device, &queue_props); // queue_props is read by create_queue() later
+
+    // Even when ze_copy_engine != ccl_ze_copy_engine_none, the copy queue could get the same
+    // ordinal as the comp queue; on a card without a blitter engine that can deadlock events
+    // between the two queues, so the copy queue is also gated on queue_props.size() > 1
+    use_copy_queue =
+        (global_data::env().ze_copy_engine != ccl_ze_copy_engine_none) && (queue_props.size() > 1);
+}
+
+list_manager::list_manager(const ccl_stream* stream) // delegating; stream is dereferenced unchecked
+        : list_manager(stream->get_ze_device(), stream->get_ze_context()) {}
+
+list_manager::~list_manager() {
+    clear(); // pushes any lists/queues still held to ze_cache
+}
+
+ze_command_list_handle_t list_manager::get_comp_list(size_t worker_idx) {
+    if (!comp_list) { // created on first use; worker_idx is ignored once the list exists
+        comp_queue = create_queue(init_mode::compute, worker_idx);
+        comp_list = create_list(comp_queue, worker_idx);
+    }
+    return comp_list.list;
+}
+
+ze_command_list_handle_t list_manager::get_copy_list(size_t worker_idx) {
+    if (use_copy_queue) {
+        if (!copy_list) { // created on first use; worker_idx is ignored once the list exists
+            copy_queue = create_queue(init_mode::copy, worker_idx);
+            copy_list = create_list(copy_queue, worker_idx);
+        }
+        return copy_list.list;
+    }
+    return get_comp_list(worker_idx); // no separate copy queue: copies share the comp list
+}
+
+queue_info list_manager::create_queue(init_mode mode, size_t worker_idx) { // desc + ze_cache lookup
+    queue_info info{};
+    uint32_t ordinal{}, queue_index{};
+    if (mode == init_mode::copy) {
+        LOG_DEBUG("create copy queue");
+        get_copy_queue_ordinal(device, queue_props, &ordinal);
+    }
+    else { // every mode other than init_mode::copy takes the comp ordinal
+        LOG_DEBUG("create comp queue");
+        get_comp_queue_ordinal(device, queue_props, &ordinal);
+    }
+    get_queue_index(queue_props, ordinal, 0, &queue_index); // NOTE(review): meaning of 0 not visible
+
+    info.desc = default_cmd_queue_desc;
+    info.desc.index = queue_index;
+    info.desc.ordinal = ordinal;
+    info.worker_idx = worker_idx; // kept so free_queue() pushes with the same worker index
+
+    global_data::get().ze_cache->get(worker_idx, context, device, info.desc, &info.queue);
+    return info;
+}
+
+void list_manager::free_queue(queue_info& info) { // pushes the queue to ze_cache
+    if (!info) // nothing held (queue == nullptr)
+        return;
+    global_data::get().ze_cache->push(info.worker_idx, context, device, info.desc, info.queue);
+    info.queue = nullptr; // a repeated free_queue()/clear() becomes a no-op
+}
+
+list_info list_manager::create_list(const queue_info& queue, size_t worker_idx) {
+    LOG_DEBUG("create list");
+    list_info info{}; // is_closed starts false
+    info.desc = default_cmd_list_desc;
+    info.desc.commandQueueGroupOrdinal = queue.desc.ordinal; // only the ordinal is read from queue
+    info.worker_idx = worker_idx; // kept so free_list() pushes with the same worker index
+    global_data::get().ze_cache->get(worker_idx, context, device, info.desc, &info.list);
+    return info;
+}
+
+void list_manager::free_list(list_info& info) { // pushes the list to ze_cache
+    if (!info) // nothing held (list == nullptr)
+        return;
+    global_data::get().ze_cache->push(info.worker_idx, context, device, info.desc, info.list);
+    info.list = nullptr; // is_closed is left as-is; a later get_*_list() assigns a fresh list_info
+}
+
+void list_manager::clear() { // safe to call repeatedly: free_* skip entries that are already null
+    LOG_DEBUG("destroy lists and queues");
+    free_list(comp_list); // lists are released before the queues
+    free_list(copy_list);
+    free_queue(comp_queue);
+    free_queue(copy_queue);
+}
+
+bool list_manager::can_use_copy_queue() const {
+    return use_copy_queue; // decided once in the constructor
+}
+
+void list_manager::execute_list(queue_info& queue, list_info& list) {
+    if (list.list) { // lists that were never requested are skipped silently
+        if (!list.is_closed) { // close only once, even if execute() is called again
+            ZE_CALL(zeCommandListClose, (list.list));
+            list.is_closed = true;
+        }
+        ZE_CALL(zeCommandQueueExecuteCommandLists, (queue.queue, 1, &list.list, nullptr));
+    }
+}
+
+void list_manager::execute() { // submits the copy list (when enabled) before the comp list
+    if (use_copy_queue) {
+        LOG_DEBUG("execute copy list");
+        execute_list(copy_queue, copy_list);
+    }
+    LOG_DEBUG("execute comp list");
+    execute_list(comp_queue, comp_list); // execute_list() itself skips a list that was not created
+}
diff --git a/src/sched/ze/ze_list_manager.hpp b/src/sched/ze/ze_list_manager.hpp
new file mode 100644
index 000000000..55b3691a8
--- /dev/null
+++ b/src/sched/ze/ze_list_manager.hpp
@@ -0,0 +1,91 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "sched/entry/ze/ze_primitives.hpp"
+
+namespace ccl {
+namespace ze {
+
+struct list_info { // a command list handle plus the data needed to push it back to ze_cache
+    list_info() = default;
+
+    explicit operator bool() const { // true while a list handle is held
+        return list != nullptr;
+    }
+
+    ze_command_list_handle_t list{};
+    ze_command_list_desc_t desc{};
+    bool is_closed{}; // set by list_manager::execute_list after zeCommandListClose
+    ssize_t worker_idx{ -1 }; // -1 until list_manager::create_list stores the worker index
+};
+
+struct queue_info { // a command queue handle plus the data needed to push it back to ze_cache
+    queue_info() = default;
+
+    explicit operator bool() const { // true while a queue handle is held
+        return queue != nullptr;
+    }
+
+    ze_command_queue_handle_t queue{};
+    ze_command_queue_desc_t desc{};
+    ssize_t worker_idx{ -1 }; // -1 until list_manager::create_queue stores the worker index
+};
+
+class list_manager { // owns at most one comp and one copy queue/list pair, created on demand
+public:
+    list_manager() = delete;
+    explicit list_manager(ze_device_handle_t device, ze_context_handle_t context);
+    explicit list_manager(const ccl_stream* stream);
+    list_manager(const list_manager&) = delete;
+    explicit list_manager(list_manager&&) = default; // NOTE(review): copies handles the dtor frees
+    ~list_manager();
+
+    void execute(); // closes (once) and submits the lists that were requested
+
+    ze_command_list_handle_t get_comp_list(size_t worker_idx = 0);
+    ze_command_list_handle_t get_copy_list(size_t worker_idx = 0); // comp list if no copy queue
+
+    void clear(); // pushes lists and queues to ze_cache; also called by the destructor
+
+    bool can_use_copy_queue() const;
+
+private:
+    const ze_device_handle_t device;
+    const ze_context_handle_t context;
+
+    list_info comp_list{};
+    list_info copy_list{};
+
+    ze_queue_properties_t queue_props; // filled by get_queues_properties() in the constructor
+
+    queue_info comp_queue{};
+    queue_info copy_queue{};
+
+    bool use_copy_queue{}; // copy engine enabled in env and queue_props.size() > 1
+
+    // Not thread safe: expected to be called from a single-threaded builder
+    queue_info create_queue(init_mode mode, size_t worker_idx);
+    void free_queue(queue_info& info);
+
+    list_info create_list(const queue_info& info, size_t worker_idx);
+    void free_list(list_info& info);
+
+    void execute_list(queue_info& queue, list_info& list);
+};
+
+} // namespace ze
+} // namespace ccl
diff --git a/src/supported_topologies.hpp b/src/supported_topologies.hpp
index 8c308d3c9..f8ac85c5c 100644
--- a/src/supported_topologies.hpp
+++ b/src/supported_topologies.hpp
@@ -15,24 +15,13 @@
 */
 #pragma once
 
-#include "oneapi/ccl/type_traits.hpp"
-#include "oneapi/ccl/comm_split_attr_ids.hpp"
 #include "common/utils/enums.hpp"
 #include "internal_types.hpp"
 
-namespace ccl {
-
-#define SUPPORTED_HW_TOPOLOGIES_DECL_LIST \
-    ccl::group_split_type::thread, ccl::group_split_type::process, ccl::group_split_type::cluster
-
-#define SUPPORTED_TOPOLOGY_CLASSES_DECL_LIST \
-    ccl::device_topology_type::ring, ccl::device_topology_type::a2a
-} // namespace ccl
-
-using device_group_split_type_names =
-    ::utils::enum_to_str<static_cast<typename std::underlying_type<ccl::group_split_type>::type>(
-        ccl::group_split_type::last_value)>;
 inline std::string to_string(ccl::group_split_type type) {
+    using device_group_split_type_names = ::utils::enum_to_str<
+        static_cast<typename std::underlying_type<ccl::group_split_type>::type>(
+            ccl::group_split_type::last_value)>;
     return device_group_split_type_names({
                                              "TG",
                                              "PG",
@@ -40,10 +29,3 @@ inline std::string to_string(ccl::group_split_type type) {
                                          })
         .choose(type, "INVALID_VALUE");
 }
-
-using device_topology_type_names =
-    ::utils::enum_to_str<ccl::device_topology_type::last_class_value>;
-inline std::string to_string(ccl::device_topology_type class_value) {
-    return device_topology_type_names({ "RING_CLASS", "A2A_CLASS" })
-        .choose(class_value, "INVALID_VALUE");
-}
diff --git a/src/unordered_coll/unordered_coll.cpp b/src/unordered_coll/unordered_coll.cpp
index ade72119e..3c66554b1 100644
--- a/src/unordered_coll/unordered_coll.cpp
+++ b/src/unordered_coll/unordered_coll.cpp
@@ -33,8 +33,9 @@ ccl_unordered_coll_manager::ccl_unordered_coll_manager(ccl_comm& parent_comm) {
         new ccl_comm(parent_comm.rank(),
                      parent_comm.size(),
                      ccl::global_data::get().comm_ids->acquire(true /*internal_id_space*/),
-                     parent_comm.atl,
-                     true /*share_resources*/));
+                     parent_comm.get_atl_comm(),
+                     true /*share_resources*/,
+                     true /* is_sub_communicator */));
 
     CCL_ASSERT(coordination_comm.get(), "coordination_comm is null");
 
@@ -157,13 +158,13 @@ void ccl_unordered_coll_manager::start_coordination(const std::string& match_id)
     CCL_THROW_IF_NOT(!match_id.empty(), "match_id is empty");
 
     ccl_coll_param coll_param{};
-    coll_param.ctype = ccl_coll_internal;
+    coll_param.ctype = ccl_coll_undefined;
     coll_param.dtype = ccl_datatype_int8;
     coll_param.comm = coordination_comm.get();
 
-    std::unique_ptr<ccl_extra_sched> service_sched(
-        new ccl_extra_sched(coll_param, coordination_comm->get_sched_id(true)));
-    service_sched->internal_type = ccl_sched_internal_unordered_coll;
+    std::unique_ptr<ccl_extra_sched> service_sched(new ccl_extra_sched(
+        { ccl_sched_unordered_coll, coordination_comm->get_sched_id(true), coll_param }));
+
     if (ccl::global_data::env().priority_mode == ccl_priority_lifo) {
         service_sched->coll_attr.priority = ccl_sched_base::get_lifo_priority();
     }
@@ -202,7 +203,7 @@ void ccl_unordered_coll_manager::start_coordination(const std::string& match_id)
     match_id_size_param.dtype = ccl_datatype_int8;
     match_id_size_param.root = CCL_UNORDERED_COLL_COORDINATOR;
     match_id_size_param.comm = coll_param.comm;
-    entry_factory::make_entry<coll_entry>(service_sched.get(), match_id_size_param);
+    entry_factory::create<coll_entry>(service_sched.get(), match_id_size_param);
 
     service_sched->add_barrier();
 
@@ -214,7 +215,7 @@ void ccl_unordered_coll_manager::start_coordination(const std::string& match_id)
     match_id_val_param.dtype = ccl_datatype_int8;
     match_id_val_param.root = CCL_UNORDERED_COLL_COORDINATOR;
     match_id_val_param.comm = coll_param.comm;
-    auto entry = entry_factory::make_entry<coll_entry>(service_sched.get(), match_id_val_param);
+    auto entry = entry_factory::create<coll_entry>(service_sched.get(), match_id_val_param);
 
     entry->set_field_fn<ccl_sched_entry_field_recv_buf>(
         [](const void* fn_ctx, void* field_ptr) {
@@ -249,12 +250,12 @@ void ccl_unordered_coll_manager::start_coordination(const std::string& match_id)
     reserved_comm_id_param.dtype = ccl_datatype_int8;
     reserved_comm_id_param.root = CCL_UNORDERED_COLL_COORDINATOR;
     reserved_comm_id_param.comm = coll_param.comm;
-    entry_factory::make_entry<coll_entry>(service_sched.get(), reserved_comm_id_param);
+    entry_factory::create<coll_entry>(service_sched.get(), reserved_comm_id_param);
 
     service_sched->add_barrier();
 
     /* 4. start post actions (create communicator and start postponed schedules) */
-    entry_factory::make_entry<function_entry>(
+    entry_factory::create<function_entry>(
         service_sched.get(),
         [](const void* func_ctx) -> ccl::status {
             auto ctx = static_cast<ccl_unordered_coll_ctx*>(const_cast<void*>(func_ctx));
diff --git a/tests/functional/CMakeLists.txt b/tests/functional/CMakeLists.txt
index d6ba5177b..9cb8219f8 100644
--- a/tests/functional/CMakeLists.txt
+++ b/tests/functional/CMakeLists.txt
@@ -79,14 +79,14 @@ if (${CMAKE_VERSION} VERSION_LESS 3.1)
 endif()
 
 #common release/debug compilation settings
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_COMPILER_FLAGS} -Wall -Werror -D_GNU_SOURCE -fvisibility=internal")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_COMPILER_FLAGS} -Wall -Wextra -Wno-unused-parameter -Werror -D_GNU_SOURCE -fvisibility=internal")
 set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} ${C_COMPILER_FLAGS} -O0 -g -DENABLE_DEBUG")
 set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} ${C_COMPILER_FLAGS} -O3")
 set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} ${C_COMPILER_FLAGS} -O2 -g")
 set(CMAKE_C_STANDARD 99)
 set(CMAKE_C_STANDARD_REQUIRED ON)
 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_COMPILER_FLAGS} -Wall -Werror -D_GNU_SOURCE -fvisibility=internal")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_COMPILER_FLAGS} -Wall -Wextra -Wno-unused-parameter -Werror -D_GNU_SOURCE -fvisibility=internal")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${CXX_COMPILER_FLAGS} -O0 -g -DENABLE_DEBUG")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${CXX_COMPILER_FLAGS} -O3")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} ${CXX_COMPILER_FLAGS} -O2 -g")
@@ -116,11 +116,10 @@ foreach(src ${sources})
     target_link_libraries(${executable} PUBLIC rt)
     target_link_libraries(${executable} PUBLIC m)
     target_link_libraries(${executable} PUBLIC dl)
-    # w/a for ats with 2 mpi lib, should be fixed
     if (DEFINED ENV{I_MPI_ROOT})
         set(I_MPI_ROOT "$ENV{I_MPI_ROOT}")
     endif()
-    target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release_mt/)
+    target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release/)
     target_link_libraries(${executable} PUBLIC mpi)
     install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_TESTS} OPTIONAL)
     add_test (NAME ${executable} CONFIGURATIONS default COMMAND mpiexec.hydra -l -n 2 -ppn 1 ${CCL_INSTALL_TESTS}/${executable} --gtest_output=xml:${CCL_INSTALL_TESTS}/${executable}_default_report.junit.xml)
@@ -133,21 +132,21 @@ endforeach()
 add_test (NAME allreduce_fusion CONFIGURATIONS allreduce_fusion COMMAND mpiexec.hydra -l -n 2 -ppn 1 ${CCL_INSTALL_TESTS}/allreduce_test --gtest_output=xml:${CCL_INSTALL_TESTS}/allreduce_fusion_report.junit.xml)
 
 foreach(ppn 1; 2)
-    foreach(algo direct; rabenseifner; starlike; ring; ring_rma; double_tree; recursive_doubling; 2d; topo_ring)
+    foreach(algo direct; rabenseifner; nreduce; ring; ring_rma; double_tree; recursive_doubling; 2d; topo)
         add_test (NAME allreduce_${algo}_${ppn} CONFIGURATIONS allreduce_${algo}_${ppn} COMMAND mpiexec.hydra -l -n 2 -ppn ${ppn} ${CCL_INSTALL_TESTS}/allreduce_test --gtest_output=xml:${CCL_INSTALL_TESTS}/allreduce_${algo}_${ppn}_report.junit.xml)
     endforeach()
 
-    foreach(algo direct; ring; double_tree; naive; topo_ring)
+    foreach(algo direct; ring; double_tree; naive; topo)
         add_test (NAME bcast_${algo}_${ppn} CONFIGURATIONS bcast_${algo}_${ppn} COMMAND mpiexec.hydra -l -n 2 -ppn ${ppn} ${CCL_INSTALL_TESTS}/bcast_test --gtest_output=xml:${CCL_INSTALL_TESTS}/bcast_${algo}_${ppn}_report.junit.xml)
     endforeach()
 
-    foreach(algo direct; rabenseifner; tree; double_tree; topo_ring)
+    foreach(algo direct; rabenseifner; tree; double_tree; topo)
         add_test (NAME reduce_${algo}_${ppn} CONFIGURATIONS reduce_${algo}_${ppn} COMMAND mpiexec.hydra -l -n 2 -ppn ${ppn} ${CCL_INSTALL_TESTS}/reduce_test --gtest_output=xml:${CCL_INSTALL_TESTS}/reduce_${algo}_${ppn}_report.junit.xml)
     endforeach()
 
 endforeach()
 
-foreach(algo starlike; ring; 2d)
+foreach(algo nreduce; ring; 2d)
 add_test (NAME allreduce_${algo}_chunked CONFIGURATIONS allreduce_${algo}_chunked COMMAND mpiexec.hydra -l -n 2 -ppn 1 ${CCL_INSTALL_TESTS}/allreduce_test --gtest_output=xml:${CCL_INSTALL_TESTS}/allreduce_${algo}_chunked_report.junit.xml)
 endforeach()
 
diff --git a/tests/functional/lp.cpp b/tests/functional/lp.cpp
index 2d07bf156..afda31974 100644
--- a/tests/functional/lp.cpp
+++ b/tests/functional/lp.cpp
@@ -46,8 +46,8 @@ int is_bf16_enabled() {
         __asm__ __volatile__("cpuid"
                              : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
                              : "a"(7), "c"(0));
-        is_bf16_enabled = ((reg[1] & (1 << 16)) >> 16) & ((reg[1] & (1 << 30)) >> 30) &
-                          ((reg[1] & (1 << 31)) >> 31);
+        is_bf16_enabled = ((reg[1] & (1u << 16)) >> 16) & ((reg[1] & (1u << 30)) >> 30) &
+                          ((reg[1] & (1u << 31)) >> 31);
     }
     return is_bf16_enabled;
 #else
@@ -64,7 +64,7 @@ int is_avx512bf_enabled() {
         __asm__ __volatile__("cpuid"
                              : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
                              : "a"(7), "c"(1));
-        is_avx512bf_enabled = (reg[0] & (1 << 5)) >> 5;
+        is_avx512bf_enabled = (reg[0] & (1u << 5)) >> 5;
     }
     return is_avx512bf_enabled;
 #else
diff --git a/tests/functional/lp_impl.hpp b/tests/functional/lp_impl.hpp
index d7b6f5173..fbfb2c307 100644
--- a/tests/functional/lp_impl.hpp
+++ b/tests/functional/lp_impl.hpp
@@ -65,7 +65,7 @@ template <typename T>
 void make_lp_epilogue(test_operation<T>& op, size_t count) {
     ccl_data_type dtype = op.param.datatype;
     for (size_t buf_idx = 0; buf_idx < op.buffer_count; buf_idx++) {
-        std::vector<T> tmp(op.recv_bufs[buf_idx]);
+        std::vector<T> tmp(op.recv_bufs[buf_idx].begin(), op.recv_bufs[buf_idx].end());
         convert_lp_to_fp32_arrays((short*)tmp.data(), op.recv_bufs[buf_idx].data(), count, dtype);
     }
 }
diff --git a/tests/functional/test.hpp b/tests/functional/test.hpp
index 8adbf3422..a1cfc95f7 100644
--- a/tests/functional/test.hpp
+++ b/tests/functional/test.hpp
@@ -42,12 +42,14 @@ struct test_operation {
 
     std::vector<size_t> buf_indexes;
 
-    std::vector<std::vector<T>> send_bufs;
-    std::vector<std::vector<T>> recv_bufs;
-
 #ifdef CCL_ENABLE_SYCL
+    std::vector<aligned_vector<T>> send_bufs;
+    std::vector<aligned_vector<T>> recv_bufs;
     std::vector<void*> device_send_bufs;
     std::vector<void*> device_recv_bufs;
+#else // CCL_ENABLE_SYCL
+    std::vector<std::vector<T>> send_bufs;
+    std::vector<std::vector<T>> recv_bufs;
 #endif // CCL_ENABLE_SYCL
 
     std::vector<ccl::event> events;
diff --git a/tests/functional/test_impl.hpp b/tests/functional/test_impl.hpp
index e3b3b68d9..97d688aa9 100644
--- a/tests/functional/test_impl.hpp
+++ b/tests/functional/test_impl.hpp
@@ -337,8 +337,14 @@ void base_test<T>::change_buffers(test_operation<T>& op) {
             as result buffers in updated vector will have original content
             but in new memory locations
         */
-        std::vector<std::vector<T>>(op.send_bufs.begin(), op.send_bufs.end()).swap(op.send_bufs);
-        std::vector<std::vector<T>>(op.recv_bufs.begin(), op.recv_bufs.end()).swap(op.recv_bufs);
+#ifdef CCL_ENABLE_SYCL
+        using vector_t = aligned_vector<T>;
+#else // CCL_ENABLE_SYCL
+        using vector_t = std::vector<T>;
+#endif // CCL_ENABLE_SYCL
+
+        std::vector<vector_t>(op.send_bufs.begin(), op.send_bufs.end()).swap(op.send_bufs);
+        std::vector<vector_t>(op.recv_bufs.begin(), op.recv_bufs.end()).swap(op.recv_bufs);
         void* new_send_buf = op.send_bufs[0].data();
         void* new_recv_buf = op.recv_bufs[0].data();
         ASSERT(send_buf != new_send_buf, "send buffers should differ");
diff --git a/tests/functional/transport.hpp b/tests/functional/transport.hpp
index 12b9ef478..73303b44e 100644
--- a/tests/functional/transport.hpp
+++ b/tests/functional/transport.hpp
@@ -19,6 +19,7 @@
 #include <vector>
 
 #include "base.hpp"
+#include "base_utils.hpp"
 #include "oneapi/ccl.hpp"
 #ifdef CCL_ENABLE_SYCL
 #include "sycl_base.hpp"
diff --git a/third-party-programs.txt b/third-party-programs.txt
index 27c2c65f6..4ddf88e91 100644
--- a/third-party-programs.txt
+++ b/third-party-programs.txt
@@ -1,5 +1,5 @@
 Intel(R) oneAPI Collective Communications Library (oneCCL) 
-2021.4.0 Third Party Programs File
+2021.5.0 Third Party Programs File
 
 This file is the "third-party-programs.txt" file specified in the associated 
 Intel end user license agreement for the Intel software you are licensing.
@@ -10,83 +10,81 @@ terms are listed below.
 
 1. Intel(R) MPI Library
 
-   Copyright (c) 2020 Intel Corporation.
+   Copyright (c) Intel Corporation.
 
 
-   Intel Simplified Software License (Version February 2020)
+   Intel Simplified Software License (Version August 2021)
 
-Use and Redistribution. You may use and redistribute the software (the 
+Use and Redistribution. You may use and redistribute the software (the
 "Software"), without modification, provided the following conditions are met:
 
-* Redistributions must reproduce the above copyright notice and the following 
-  terms of use in the Software and in the documentation and/or other materials 
+* Redistributions must reproduce the above copyright notice and the following
+  terms of use in the Software and in the documentation and/or other materials
   provided with the distribution.
 * Neither the name of Intel nor the names of its suppliers may be used to 
-  endorse or promote products derived from this Software without specific prior 
-  written permission.
-* No reverse engineering, decompilation, or disassembly of this Software is 
+  endorse or promote products derived from this Software without specific
+  prior written permission.
+* No reverse engineering, decompilation, or disassembly of this Software is
   permitted.
 
-Limited patent license. Intel grants you a world-wide, royalty-free, 
-non-exclusive license under patents it now or hereafter owns or controls to 
-make, have made, use, import, offer to sell and sell ("Utilize") this Software, 
-but solely to the extent that any such patent is necessary to Utilize the 
-Software alone. The patent license shall not apply to any combinations which 
-include this software. No hardware per se is licensed hereunder.
-
-Third party programs. The Software may contain Third Party Programs. "Third 
-Party Programs" are third party software, open source software or other Intel 
-software listed in the "third-party-programs.txt"  or other similarly named text 
-file that is included with the Software. Third Party Programs, even if included 
-with the distribution of the Software, may be governed by separate license 
-terms, including without limitation, third party license terms, open source 
-software notices and terms, and/or other Intel software license terms. These 
-separate license terms may govern your use of the Third Party Programs.  
-
-DISCLAIMER. THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED 
-WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT ARE 
-DISCLAIMED. THIS SOFTWARE IS NOT INTENDED FOR USE IN SYSTEMS OR APPLICATIONS 
-WHERE FAILURE OF THE SOFTWARE MAY CAUSE PERSONAL INJURY OR DEATH AND YOU AGREE 
-THAT YOU ARE FULLY RESPONSIBLE FOR ANY CLAIMS, COSTS, DAMAGES, EXPENSES, AND 
-ATTORNEYS' FEES ARISING OUT OF ANY SUCH USE, EVEN IF ANY CLAIM ALLEGES THAT 
-INTEL WAS NEGLIGENT REGARDING THE DESIGN OR MANUFACTURE OF THE MATERIALS.
-
-LIMITATION OF LIABILITY. IN NO EVENT WILL INTEL BE LIABLE FOR ANY DIRECT, 
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 
-OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
-ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. YOU AGREE TO INDEMNIFY AND HOLD 
-INTEL HARMLESS AGAINST ANY CLAIMS AND EXPENSES RESULTING FROM YOUR USE OR 
-UNAUTHORIZED USE OF THE SOFTWARE.
-
-No support. Intel may make changes to the Software, at any time without notice, 
-and is not obligated to support, update or provide training for the Software. 
-
-Termination. Intel may terminate your right to use the Software in the event of 
-your breach of this Agreement and you fail to cure the breach within a 
-reasonable period of time.
-
-Feedback. Should you provide Intel with comments, modifications, corrections, 
-enhancements or other input ("Feedback") related to the Software Intel will be 
-free to use, disclose, reproduce, license or otherwise distribute or exploit the 
-Feedback in its sole discretion without any obligations or restrictions of any 
-kind, including without limitation, intellectual property rights or licensing 
+No other licenses. Except as provided in the preceding section, Intel grants no
+licenses or other rights by implication, estoppel or otherwise to, patent,
+copyright, trademark, trade name, service mark or other intellectual property
+licenses or rights of Intel.
+
+Third party software. The Software may contain Third Party Software. "Third
+Party Software" is open source software, third party software, or other Intel
+software that may be identified in the Software itself or in the files (if any)
+listed in the "third-party-software.txt" or similarly named text file included
+with the Software. Third Party Software, even if included with the distribution
+of the Software, may be governed by separate license terms, including without
+limitation, open source software license terms, third party software license
+terms, and other Intel software license terms. Those separate license terms
+solely govern your use of the Third Party Software, and nothing in this license
+limits any rights under, or grants rights that supersede, the terms of the
+applicable license terms.
+
+DISCLAIMER. THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT ARE
+DISCLAIMED. THIS SOFTWARE IS NOT INTENDED FOR USE IN SYSTEMS OR APPLICATIONS
+WHERE FAILURE OF THE SOFTWARE MAY CAUSE PERSONAL INJURY OR DEATH AND YOU AGREE
+THAT YOU ARE FULLY RESPONSIBLE FOR ANY CLAIMS, COSTS, DAMAGES, EXPENSES, AND
+ATTORNEYS' FEES ARISING OUT OF ANY SUCH USE, EVEN IF ANY CLAIM ALLEGES THAT
+INTEL WAS NEGLIGENT REGARDING THE DESIGN OR MANUFACTURE OF THE SOFTWARE.
+
+LIMITATION OF LIABILITY. IN NO EVENT WILL INTEL BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+No support. Intel may make changes to the Software, at any time without notice,
+and is not obligated to support, update or provide training for the Software.
+
+Termination. Your right to use the Software is terminated in the event of your
+breach of this license.
+
+Feedback. Should you provide Intel with comments, modifications, corrections,
+enhancements or other input ("Feedback") related to the Software, Intel will be
+free to use, disclose, reproduce, license or otherwise distribute or exploit the
+Feedback in its sole discretion without any obligations or restrictions of any
+kind, including without limitation, intellectual property rights or licensing
 obligations.
 
-Compliance with laws. You agree to comply with all relevant laws and regulations 
-governing your use, transfer, import or export (or prohibition thereof) of the 
+Compliance with laws. You agree to comply with all relevant laws and regulations
+governing your use, transfer, import or export (or prohibition thereof) of the
 Software.
 
-Governing law. All disputes will be governed by the laws of the United States of 
-America and the State of Delaware without reference to conflict of law 
-principles and subject to the exclusive jurisdiction of the state or federal 
-courts sitting in the State of Delaware, and each party agrees that it submits 
-to the personal jurisdiction and venue of those courts and waives any 
-objections. The United Nations Convention on Contracts for the International 
-Sale of Goods (1980) is specifically excluded and will not apply to the 
+Governing law. All disputes will be governed by the laws of the United States of
+America and the State of Delaware without reference to conflict of law
+principles and subject to the exclusive jurisdiction of the state or federal
+courts sitting in the State of Delaware, and each party agrees that it submits
+to the personal jurisdiction and venue of those courts and waives any
+objections. The United Nations Convention on Contracts for the International
+Sale of Goods (1980) is specifically excluded and will not apply to the
 Software.
 
 -------------------------------------------------------------------------------